Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7efd91d4b4 | |||
| 0aa1269e56 | |||
| 3c29834354 | |||
| 0eb85906b0 | |||
| ff9b0528a2 | |||
| 8feaa7cd1b | |||
| 57a2b97ae8 | |||
| bd9afb027a |
@@ -52,10 +52,6 @@ ignored/
|
||||
.worktrees/
|
||||
environments/benchmarks/evals/
|
||||
|
||||
# Compression eval run outputs (harness lives in scripts/compression_eval/)
|
||||
scripts/compression_eval/results/*
|
||||
!scripts/compression_eval/results/.gitkeep
|
||||
|
||||
# Web UI build output
|
||||
hermes_cli/web_dist/
|
||||
|
||||
|
||||
+4
-12
@@ -10,11 +10,9 @@ ENV PYTHONUNBUFFERED=1
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
|
||||
|
||||
# Install system dependencies in one layer, clear APT cache
|
||||
# tini reaps orphaned zombie processes (MCP stdio subprocesses, git, bun, etc.)
|
||||
# that would otherwise accumulate when hermes runs as PID 1. See #15012.
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli tini && \
|
||||
build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Non-root user for runtime; UID can be overridden via HERMES_UID at runtime
|
||||
@@ -43,15 +41,9 @@ COPY --chown=hermes:hermes . .
|
||||
# Build web dashboard (Vite outputs to hermes_cli/web_dist/)
|
||||
RUN cd web && npm run build
|
||||
|
||||
# ---------- Permissions ----------
|
||||
# Make install dir world-readable so any HERMES_UID can read it at runtime.
|
||||
# The venv needs to be traversable too.
|
||||
USER root
|
||||
RUN chmod -R a+rX /opt/hermes
|
||||
# Start as root so the entrypoint can usermod/groupmod + gosu.
|
||||
# If HERMES_UID is unset, the entrypoint drops to the default hermes user (10000).
|
||||
|
||||
# ---------- Python virtualenv ----------
|
||||
RUN chown hermes:hermes /opt/hermes
|
||||
USER hermes
|
||||
RUN uv venv && \
|
||||
uv pip install --no-cache-dir -e ".[all]"
|
||||
|
||||
@@ -60,4 +52,4 @@ ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist
|
||||
ENV HERMES_HOME=/opt/data
|
||||
ENV PATH="/opt/data/.local/bin:${PATH}"
|
||||
VOLUME [ "/opt/data" ]
|
||||
ENTRYPOINT [ "/usr/bin/tini", "-g", "--", "/opt/hermes/docker/entrypoint.sh" ]
|
||||
ENTRYPOINT [ "/opt/hermes/docker/entrypoint.sh" ]
|
||||
|
||||
@@ -60,7 +60,7 @@ from acp_adapter.events import (
|
||||
make_tool_progress_cb,
|
||||
)
|
||||
from acp_adapter.permissions import make_approval_callback
|
||||
from acp_adapter.session import SessionManager, SessionState, _expand_acp_enabled_toolsets
|
||||
from acp_adapter.session import SessionManager, SessionState
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -287,11 +287,7 @@ class HermesACPAgent(acp.Agent):
|
||||
try:
|
||||
from model_tools import get_tool_definitions
|
||||
|
||||
enabled_toolsets = _expand_acp_enabled_toolsets(
|
||||
getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"],
|
||||
mcp_server_names=[server.name for server in mcp_servers],
|
||||
)
|
||||
state.agent.enabled_toolsets = enabled_toolsets
|
||||
enabled_toolsets = getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"]
|
||||
disabled_toolsets = getattr(state.agent, "disabled_toolsets", None)
|
||||
state.agent.tools = get_tool_definitions(
|
||||
enabled_toolsets=enabled_toolsets,
|
||||
@@ -758,9 +754,7 @@ class HermesACPAgent(acp.Agent):
|
||||
def _cmd_tools(self, args: str, state: SessionState) -> str:
|
||||
try:
|
||||
from model_tools import get_tool_definitions
|
||||
toolsets = _expand_acp_enabled_toolsets(
|
||||
getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"]
|
||||
)
|
||||
toolsets = getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"]
|
||||
tools = get_tool_definitions(enabled_toolsets=toolsets, quiet_mode=True)
|
||||
if not tools:
|
||||
return "No tools available."
|
||||
|
||||
+1
-28
@@ -106,24 +106,6 @@ def _register_task_cwd(task_id: str, cwd: str) -> None:
|
||||
logger.debug("Failed to register ACP task cwd override", exc_info=True)
|
||||
|
||||
|
||||
def _expand_acp_enabled_toolsets(
|
||||
toolsets: List[str] | None = None,
|
||||
mcp_server_names: List[str] | None = None,
|
||||
) -> List[str]:
|
||||
"""Return ACP toolsets plus explicit MCP server toolsets for this session."""
|
||||
expanded: List[str] = []
|
||||
for name in list(toolsets or ["hermes-acp"]):
|
||||
if name and name not in expanded:
|
||||
expanded.append(name)
|
||||
|
||||
for server_name in list(mcp_server_names or []):
|
||||
toolset_name = f"mcp-{server_name}"
|
||||
if server_name and toolset_name not in expanded:
|
||||
expanded.append(toolset_name)
|
||||
|
||||
return expanded
|
||||
|
||||
|
||||
def _clear_task_cwd(task_id: str) -> None:
|
||||
"""Remove task-specific cwd overrides for an ACP session."""
|
||||
if not task_id:
|
||||
@@ -555,18 +537,9 @@ class SessionManager:
|
||||
elif isinstance(model_cfg, str) and model_cfg.strip():
|
||||
default_model = model_cfg.strip()
|
||||
|
||||
configured_mcp_servers = [
|
||||
name
|
||||
for name, cfg in (config.get("mcp_servers") or {}).items()
|
||||
if not isinstance(cfg, dict) or cfg.get("enabled", True) is not False
|
||||
]
|
||||
|
||||
kwargs = {
|
||||
"platform": "acp",
|
||||
"enabled_toolsets": _expand_acp_enabled_toolsets(
|
||||
["hermes-acp"],
|
||||
mcp_server_names=configured_mcp_servers,
|
||||
),
|
||||
"enabled_toolsets": ["hermes-acp"],
|
||||
"quiet_mode": True,
|
||||
"session_id": session_id,
|
||||
"model": model or default_model,
|
||||
|
||||
@@ -14,8 +14,6 @@ import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
@@ -279,9 +277,8 @@ def _is_oauth_token(key: str) -> bool:
|
||||
Positively identifies Anthropic OAuth tokens by their key format:
|
||||
- ``sk-ant-`` prefix (but NOT ``sk-ant-api``) → setup tokens, managed keys
|
||||
- ``eyJ`` prefix → JWTs from the Anthropic OAuth flow
|
||||
- ``cc-`` prefix → Claude Code OAuth access tokens (from CLAUDE_CODE_OAUTH_TOKEN)
|
||||
|
||||
Non-Anthropic keys (MiniMax, Alibaba, etc.) don't match any pattern
|
||||
Non-Anthropic keys (MiniMax, Alibaba, etc.) don't match either pattern
|
||||
and correctly return False.
|
||||
"""
|
||||
if not key:
|
||||
@@ -295,9 +292,6 @@ def _is_oauth_token(key: str) -> bool:
|
||||
# JWTs from Anthropic OAuth flow
|
||||
if key.startswith("eyJ"):
|
||||
return True
|
||||
# Claude Code OAuth access tokens (opaque, from CLAUDE_CODE_OAUTH_TOKEN)
|
||||
if key.startswith("cc-"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@@ -467,72 +461,8 @@ def build_anthropic_bedrock_client(region: str):
|
||||
)
|
||||
|
||||
|
||||
def _read_claude_code_credentials_from_keychain() -> Optional[Dict[str, Any]]:
|
||||
"""Read Claude Code OAuth credentials from the macOS Keychain.
|
||||
|
||||
Claude Code >=2.1.114 stores credentials in the macOS Keychain under the
|
||||
service name "Claude Code-credentials" rather than (or in addition to)
|
||||
the JSON file at ~/.claude/.credentials.json.
|
||||
|
||||
The password field contains a JSON string with the same claudeAiOauth
|
||||
structure as the JSON file.
|
||||
|
||||
Returns dict with {accessToken, refreshToken?, expiresAt?} or None.
|
||||
"""
|
||||
import platform
|
||||
import subprocess
|
||||
|
||||
if platform.system() != "Darwin":
|
||||
return None
|
||||
|
||||
try:
|
||||
# Read the "Claude Code-credentials" generic password entry
|
||||
result = subprocess.run(
|
||||
["security", "find-generic-password",
|
||||
"-s", "Claude Code-credentials",
|
||||
"-w"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
except (OSError, subprocess.TimeoutExpired):
|
||||
logger.debug("Keychain: security command not available or timed out")
|
||||
return None
|
||||
|
||||
if result.returncode != 0:
|
||||
logger.debug("Keychain: no entry found for 'Claude Code-credentials'")
|
||||
return None
|
||||
|
||||
raw = result.stdout.strip()
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
try:
|
||||
data = json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
logger.debug("Keychain: credentials payload is not valid JSON")
|
||||
return None
|
||||
|
||||
oauth_data = data.get("claudeAiOauth")
|
||||
if oauth_data and isinstance(oauth_data, dict):
|
||||
access_token = oauth_data.get("accessToken", "")
|
||||
if access_token:
|
||||
return {
|
||||
"accessToken": access_token,
|
||||
"refreshToken": oauth_data.get("refreshToken", ""),
|
||||
"expiresAt": oauth_data.get("expiresAt", 0),
|
||||
"source": "macos_keychain",
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
|
||||
"""Read refreshable Claude Code OAuth credentials.
|
||||
|
||||
Checks two sources in order:
|
||||
1. macOS Keychain (Darwin only) — "Claude Code-credentials" entry
|
||||
2. ~/.claude/.credentials.json file
|
||||
"""Read refreshable Claude Code OAuth credentials from ~/.claude/.credentials.json.
|
||||
|
||||
This intentionally excludes ~/.claude.json primaryApiKey. Opencode's
|
||||
subscription flow is OAuth/setup-token based with refreshable credentials,
|
||||
@@ -541,12 +471,6 @@ def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
|
||||
|
||||
Returns dict with {accessToken, refreshToken?, expiresAt?} or None.
|
||||
"""
|
||||
# Try macOS Keychain first (covers Claude Code >=2.1.114)
|
||||
kc_creds = _read_claude_code_credentials_from_keychain()
|
||||
if kc_creds:
|
||||
return kc_creds
|
||||
|
||||
# Fall back to JSON file
|
||||
cred_path = Path.home() / ".claude" / ".credentials.json"
|
||||
if cred_path.exists():
|
||||
try:
|
||||
@@ -717,9 +641,7 @@ def _write_claude_code_credentials(
|
||||
existing["claudeAiOauth"] = oauth_data
|
||||
|
||||
cred_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
_tmp_cred = cred_path.with_suffix(".tmp")
|
||||
_tmp_cred.write_text(json.dumps(existing, indent=2), encoding="utf-8")
|
||||
_tmp_cred.replace(cred_path)
|
||||
cred_path.write_text(json.dumps(existing, indent=2), encoding="utf-8")
|
||||
# Restrict permissions (credentials file)
|
||||
cred_path.chmod(0o600)
|
||||
except (OSError, IOError) as e:
|
||||
@@ -1676,3 +1598,4 @@ def build_anthropic_kwargs(
|
||||
return kwargs
|
||||
|
||||
|
||||
|
||||
|
||||
+8
-197
@@ -74,12 +74,6 @@ _PROVIDER_ALIASES = {
|
||||
"minimax_cn": "minimax-cn",
|
||||
"claude": "anthropic",
|
||||
"claude-code": "anthropic",
|
||||
"github": "copilot",
|
||||
"github-copilot": "copilot",
|
||||
"github-model": "copilot",
|
||||
"github-models": "copilot",
|
||||
"github-copilot-acp": "copilot-acp",
|
||||
"copilot-acp-agent": "copilot-acp",
|
||||
}
|
||||
|
||||
|
||||
@@ -95,11 +89,10 @@ def _normalize_aux_provider(provider: Optional[str]) -> str:
|
||||
if normalized == "main":
|
||||
# Resolve to the user's actual main provider so named custom providers
|
||||
# and non-aggregator providers (DeepSeek, Alibaba, etc.) work correctly.
|
||||
main_prov = (_read_main_provider() or "").strip().lower()
|
||||
main_prov = _read_main_provider()
|
||||
if main_prov and main_prov not in ("auto", "main", ""):
|
||||
normalized = main_prov
|
||||
else:
|
||||
return "custom"
|
||||
return main_prov
|
||||
return "custom"
|
||||
return _PROVIDER_ALIASES.get(normalized, normalized)
|
||||
|
||||
|
||||
@@ -1349,68 +1342,6 @@ def _is_auth_error(exc: Exception) -> bool:
|
||||
return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
|
||||
|
||||
|
||||
def _evict_cached_clients(provider: str) -> None:
    """Drop cached auxiliary clients for a provider so fresh creds are used.

    Every cache key whose first element normalizes (via
    ``_normalize_aux_provider``) to the same provider as *provider* is
    removed from ``_client_cache`` while holding ``_client_cache_lock``.
    Each evicted client is force-closed and, if it exposes a callable
    ``close``, closed as well.

    Args:
        provider: Provider name or alias whose cached clients are stale.
    """
    normalized = _normalize_aux_provider(provider)
    with _client_cache_lock:
        stale_keys = [
            key for key in _client_cache
            if _normalize_aux_provider(str(key[0])) == normalized
        ]
        for key in stale_keys:
            # Remove the entry first: eviction is the purpose of this
            # function, so a client that fails to shut down cleanly must
            # not stay cached with stale credentials.
            client = _client_cache.pop(key, (None, None, None))[0]
            if client is None:
                continue
            try:
                _force_close_async_httpx(client)
                close_fn = getattr(client, "close", None)
                if callable(close_fn):
                    close_fn()
            except Exception:
                # Best-effort cleanup: keep evicting the remaining keys,
                # but leave a trace instead of swallowing silently.
                logger.debug(
                    "Failed to close evicted auxiliary client for %s",
                    normalized,
                    exc_info=True,
                )
|
||||
|
||||
|
||||
def _refresh_provider_credentials(provider: str) -> bool:
    """Refresh short-lived credentials for OAuth-backed auxiliary providers.

    Handles the ``openai-codex``, ``nous`` and ``anthropic`` providers (after
    alias normalization). On a successful refresh the provider's cached
    clients are evicted so the next lookup builds a client with the new
    credential.

    Returns:
        True when a non-blank credential was obtained and the cache was
        evicted; False for any other provider, for a blank credential, or
        when the refresh raised (the error is logged at debug level).
    """
    name = _normalize_aux_provider(provider)
    try:
        if name == "openai-codex":
            from hermes_cli.auth import resolve_codex_runtime_credentials

            secret = resolve_codex_runtime_credentials(force_refresh=True).get("api_key", "")
        elif name == "nous":
            from hermes_cli.auth import resolve_nous_runtime_credentials

            min_ttl = max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800")))
            timeout = float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15"))
            secret = resolve_nous_runtime_credentials(
                min_key_ttl_seconds=min_ttl,
                timeout_seconds=timeout,
                force_mint=True,
            ).get("api_key", "")
        elif name == "anthropic":
            from agent.anthropic_adapter import read_claude_code_credentials, _refresh_oauth_token, resolve_anthropic_token

            stored = read_claude_code_credentials()
            secret = None
            # Only attempt an OAuth refresh when a refresh token is on file.
            if isinstance(stored, dict) and stored.get("refreshToken"):
                secret = _refresh_oauth_token(stored)
            if not str(secret or "").strip():
                # Fall back to whatever token the adapter can resolve.
                secret = resolve_anthropic_token()
        else:
            return False

        if not str(secret or "").strip():
            return False
        _evict_cached_clients(name)
        return True
    except Exception as exc:
        logger.debug("Auxiliary provider credential refresh failed for %s: %s", name, exc)
        return False
|
||||
|
||||
|
||||
def _try_payment_fallback(
|
||||
failed_provider: str,
|
||||
task: str = None,
|
||||
@@ -1805,7 +1736,7 @@ def resolve_provider_client(
|
||||
"but no endpoint credentials found")
|
||||
return None, None
|
||||
|
||||
# ── Named custom providers (config.yaml providers dict / custom_providers list) ───
|
||||
# ── Named custom providers (config.yaml custom_providers list) ───
|
||||
try:
|
||||
from hermes_cli.runtime_provider import _get_named_custom_provider
|
||||
custom_entry = _get_named_custom_provider(provider)
|
||||
@@ -1816,51 +1747,16 @@ def resolve_provider_client(
|
||||
if not custom_key and custom_key_env:
|
||||
custom_key = os.getenv(custom_key_env, "").strip()
|
||||
custom_key = custom_key or "no-key-required"
|
||||
# An explicit per-task api_mode override (from _resolve_task_provider_model)
|
||||
# wins; otherwise fall back to what the provider entry declared.
|
||||
entry_api_mode = (api_mode or custom_entry.get("api_mode") or "").strip()
|
||||
if custom_base:
|
||||
final_model = _normalize_resolved_model(
|
||||
model or custom_entry.get("model") or _read_main_model() or "gpt-4o-mini",
|
||||
provider,
|
||||
)
|
||||
logger.debug(
|
||||
"resolve_provider_client: named custom provider %r (%s, api_mode=%s)",
|
||||
provider, final_model, entry_api_mode or "chat_completions")
|
||||
# anthropic_messages: route through the Anthropic Messages API
|
||||
# via AnthropicAuxiliaryClient. Mirrors the anonymous-custom
|
||||
# branch in _try_custom_endpoint(). See #15033.
|
||||
if entry_api_mode == "anthropic_messages":
|
||||
try:
|
||||
from agent.anthropic_adapter import build_anthropic_client
|
||||
real_client = build_anthropic_client(custom_key, custom_base)
|
||||
except ImportError:
|
||||
logger.warning(
|
||||
"Named custom provider %r declares api_mode="
|
||||
"anthropic_messages but the anthropic SDK is not "
|
||||
"installed — falling back to OpenAI-wire.",
|
||||
provider,
|
||||
)
|
||||
client = OpenAI(api_key=custom_key, base_url=custom_base)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
sync_anthropic = AnthropicAuxiliaryClient(
|
||||
real_client, final_model, custom_key, custom_base, is_oauth=False,
|
||||
)
|
||||
if async_mode:
|
||||
return AsyncAnthropicAuxiliaryClient(sync_anthropic), final_model
|
||||
return sync_anthropic, final_model
|
||||
client = OpenAI(api_key=custom_key, base_url=custom_base)
|
||||
# codex_responses or inherited auto-detect (via _wrap_if_needed).
|
||||
# _wrap_if_needed reads the closed-over `api_mode` (the task-level
|
||||
# override). Named-provider entry api_mode=codex_responses also
|
||||
# flows through here.
|
||||
if entry_api_mode == "codex_responses" and not isinstance(
|
||||
client, CodexAuxiliaryClient
|
||||
):
|
||||
client = CodexAuxiliaryClient(client, final_model)
|
||||
else:
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
logger.debug(
|
||||
"resolve_provider_client: named custom provider %r (%s)",
|
||||
provider, final_model)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
logger.warning(
|
||||
@@ -2961,49 +2857,6 @@ def call_llm(
|
||||
return _validate_llm_response(
|
||||
refreshed_client.chat.completions.create(**kwargs), task)
|
||||
|
||||
# ── Auth refresh retry ───────────────────────────────────────
|
||||
if (_is_auth_error(first_err)
|
||||
and resolved_provider not in ("auto", "", None)
|
||||
and not client_is_nous):
|
||||
if _refresh_provider_credentials(resolved_provider):
|
||||
logger.info(
|
||||
"Auxiliary %s: refreshed %s credentials after auth error, retrying",
|
||||
task or "call", resolved_provider,
|
||||
)
|
||||
retry_client, retry_model = (
|
||||
resolve_vision_provider_client(
|
||||
provider=resolved_provider,
|
||||
model=final_model,
|
||||
async_mode=False,
|
||||
)[1:]
|
||||
if task == "vision"
|
||||
else _get_cached_client(
|
||||
resolved_provider,
|
||||
resolved_model,
|
||||
base_url=resolved_base_url,
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
main_runtime=main_runtime,
|
||||
)
|
||||
)
|
||||
if retry_client is not None:
|
||||
retry_kwargs = _build_call_kwargs(
|
||||
resolved_provider,
|
||||
retry_model or final_model,
|
||||
messages,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
tools=tools,
|
||||
timeout=effective_timeout,
|
||||
extra_body=effective_extra_body,
|
||||
base_url=resolved_base_url,
|
||||
)
|
||||
_retry_base = str(getattr(retry_client, "base_url", "") or "")
|
||||
if _is_anthropic_compat_endpoint(resolved_provider, _retry_base):
|
||||
retry_kwargs["messages"] = _convert_openai_images_to_anthropic(retry_kwargs["messages"])
|
||||
return _validate_llm_response(
|
||||
retry_client.chat.completions.create(**retry_kwargs), task)
|
||||
|
||||
# ── Payment / credit exhaustion fallback ──────────────────────
|
||||
# When the resolved provider returns 402 or a credit-related error,
|
||||
# try alternative providers instead of giving up. This handles the
|
||||
@@ -3224,48 +3077,6 @@ async def async_call_llm(
|
||||
return _validate_llm_response(
|
||||
await refreshed_client.chat.completions.create(**kwargs), task)
|
||||
|
||||
# ── Auth refresh retry (mirrors sync call_llm) ───────────────
|
||||
if (_is_auth_error(first_err)
|
||||
and resolved_provider not in ("auto", "", None)
|
||||
and not client_is_nous):
|
||||
if _refresh_provider_credentials(resolved_provider):
|
||||
logger.info(
|
||||
"Auxiliary %s (async): refreshed %s credentials after auth error, retrying",
|
||||
task or "call", resolved_provider,
|
||||
)
|
||||
if task == "vision":
|
||||
_, retry_client, retry_model = resolve_vision_provider_client(
|
||||
provider=resolved_provider,
|
||||
model=final_model,
|
||||
async_mode=True,
|
||||
)
|
||||
else:
|
||||
retry_client, retry_model = _get_cached_client(
|
||||
resolved_provider,
|
||||
resolved_model,
|
||||
async_mode=True,
|
||||
base_url=resolved_base_url,
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
)
|
||||
if retry_client is not None:
|
||||
retry_kwargs = _build_call_kwargs(
|
||||
resolved_provider,
|
||||
retry_model or final_model,
|
||||
messages,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
tools=tools,
|
||||
timeout=effective_timeout,
|
||||
extra_body=effective_extra_body,
|
||||
base_url=resolved_base_url,
|
||||
)
|
||||
_retry_base = str(getattr(retry_client, "base_url", "") or "")
|
||||
if _is_anthropic_compat_endpoint(resolved_provider, _retry_base):
|
||||
retry_kwargs["messages"] = _convert_openai_images_to_anthropic(retry_kwargs["messages"])
|
||||
return _validate_llm_response(
|
||||
await retry_client.chat.completions.create(**retry_kwargs), task)
|
||||
|
||||
# ── Payment / connection fallback (mirrors sync call_llm) ─────
|
||||
should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
|
||||
is_auto = resolved_provider in ("auto", "", None)
|
||||
|
||||
@@ -1099,21 +1099,6 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
|
||||
return max(cut_idx, head_end + 1)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# ContextEngine: manual /compress preflight
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool:
    """Report whether a non-empty middle region exists between head and tail.

    Overrides the ABC default so the gateway ``/compress`` guard can skip
    the LLM call while the transcript still sits entirely inside the
    protected head/tail.

    The start of the compressible region is ``protect_first_n`` aligned
    forward by ``_align_boundary_forward``; the end is the tail cut chosen
    by ``_find_tail_cut_by_tokens``. There is content to compress only when
    the start index is strictly before the end index.
    """
    region_start = self._align_boundary_forward(messages, self.protect_first_n)
    region_end = self._find_tail_cut_by_tokens(messages, region_start)
    return region_end > region_start
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Main compression entry point
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@@ -78,7 +78,6 @@ class ContextEngine(ABC):
|
||||
self,
|
||||
messages: List[Dict[str, Any]],
|
||||
current_tokens: int = None,
|
||||
focus_topic: str = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Compact the message list and return the new message list.
|
||||
|
||||
@@ -87,12 +86,6 @@ class ContextEngine(ABC):
|
||||
context budget. The implementation is free to summarize, build a
|
||||
DAG, or do anything else — as long as the returned list is a valid
|
||||
OpenAI-format message sequence.
|
||||
|
||||
Args:
|
||||
focus_topic: Optional topic string from manual ``/compress <focus>``.
|
||||
Engines that support guided compression should prioritise
|
||||
preserving information related to this topic. Engines that
|
||||
don't support it may simply ignore this argument.
|
||||
"""
|
||||
|
||||
# -- Optional: pre-flight check ----------------------------------------
|
||||
@@ -105,21 +98,6 @@ class ContextEngine(ABC):
|
||||
"""
|
||||
return False
|
||||
|
||||
# -- Optional: manual /compress preflight ------------------------------
|
||||
|
||||
def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool:
    """Preflight for manual ``/compress``: can ``messages`` be compacted at all?

    The gateway ``/compress`` command consults this before compressing; a
    False result lets it answer "nothing to compress yet" without spending
    an LLM call.

    This base implementation always answers True (always attempt). An
    engine that can cheaply inspect its own head/tail boundaries should
    override it and return False while the transcript is still entirely
    protected.
    """
    return True
|
||||
|
||||
# -- Optional: session lifecycle ---------------------------------------
|
||||
|
||||
def on_session_start(self, session_id: str, **kwargs) -> None:
|
||||
|
||||
@@ -46,47 +46,6 @@ def _resolve_args() -> list[str]:
|
||||
return shlex.split(raw)
|
||||
|
||||
|
||||
def _resolve_home_dir() -> str:
|
||||
"""Return a stable HOME for child ACP processes."""
|
||||
|
||||
try:
|
||||
from hermes_constants import get_subprocess_home
|
||||
|
||||
profile_home = get_subprocess_home()
|
||||
if profile_home:
|
||||
return profile_home
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
home = os.environ.get("HOME", "").strip()
|
||||
if home:
|
||||
return home
|
||||
|
||||
expanded = os.path.expanduser("~")
|
||||
if expanded and expanded != "~":
|
||||
return expanded
|
||||
|
||||
try:
|
||||
import pwd
|
||||
|
||||
resolved = pwd.getpwuid(os.getuid()).pw_dir.strip()
|
||||
if resolved:
|
||||
return resolved
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Last resort: /tmp (writable on any POSIX system). Avoids crashing the
|
||||
# subprocess with no HOME; callers can set HERMES_HOME explicitly if they
|
||||
# need a different writable dir.
|
||||
return "/tmp"
|
||||
|
||||
|
||||
def _build_subprocess_env() -> dict[str, str]:
    """Build the environment for the child ACP process.

    Returns a copy of the current process environment in which ``HOME`` is
    replaced by the value from ``_resolve_home_dir()``; ``os.environ``
    itself is left untouched.
    """
    return {**os.environ, "HOME": _resolve_home_dir()}
|
||||
|
||||
|
||||
def _jsonrpc_error(message_id: Any, code: int, message: str) -> dict[str, Any]:
|
||||
return {
|
||||
"jsonrpc": "2.0",
|
||||
@@ -423,7 +382,6 @@ class CopilotACPClient:
|
||||
text=True,
|
||||
bufsize=1,
|
||||
cwd=self._acp_cwd,
|
||||
env=_build_subprocess_env(),
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
|
||||
+3
-108
@@ -455,61 +455,6 @@ class CredentialPool:
|
||||
logger.debug("Failed to sync from credentials file: %s", exc)
|
||||
return entry
|
||||
|
||||
def _sync_nous_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
    """Adopt newer Nous tokens from auth.json when they differ from *entry*.

    Nous OAuth refresh tokens are single-use. When another process (e.g. a
    concurrent cron) refreshes via ``resolve_nous_runtime_credentials``, it
    writes fresh tokens to auth.json under ``_auth_store_lock`` and this
    pool entry's tokens go stale. Detecting that and adopting the newer
    pair avoids a "refresh token reuse" revocation on the Nous Portal.

    Only applies to the ``nous`` provider with a ``device_code`` entry. When
    the stored refresh token is non-empty and differs, the entry is replaced
    in the pool (status/error fields cleared), persisted, and returned.
    In every other case, including any exception (logged at debug level),
    the original *entry* is returned unchanged.
    """
    applies = self.provider == "nous" and entry.source == "device_code"
    if not applies:
        return entry

    try:
        # NOTE(review): lock scope assumed to cover only the store load —
        # confirm against the original indentation.
        with _auth_store_lock():
            auth_store = _load_auth_store()
        state = _load_provider_state(auth_store, "nous")
        if not state:
            return entry

        fresh_refresh = state.get("refresh_token", "")
        fresh_access = state.get("access_token", "")
        if not fresh_refresh or fresh_refresh == entry.refresh_token:
            return entry

        logger.debug(
            "Pool entry %s: syncing tokens from auth.json (Nous refresh token changed)",
            entry.id,
        )

        # Core token pair plus a reset of the entry's status bookkeeping.
        field_updates: Dict[str, Any] = {
            "access_token": fresh_access,
            "refresh_token": fresh_refresh,
            "last_status": None,
            "last_status_at": None,
            "last_error_code": None,
        }
        # Optional fields are only overwritten when the store has a truthy value.
        for field_name in (
            "expires_at",
            "agent_key",
            "agent_key_expires_at",
            "inference_base_url",
        ):
            if state.get(field_name):
                field_updates[field_name] = state[field_name]

        # Extra metadata is copied whenever present (not None).
        extra_updates = dict(entry.extra)
        for extra_key in ("obtained_at", "expires_in", "agent_key_id",
                          "agent_key_expires_in", "agent_key_reused",
                          "agent_key_obtained_at"):
            stored_value = state.get(extra_key)
            if stored_value is not None:
                extra_updates[extra_key] = stored_value

        updated = replace(entry, extra=extra_updates, **field_updates)
        self._replace_entry(entry, updated)
        self._persist()
        return updated
    except Exception as exc:
        logger.debug("Failed to sync Nous entry from auth.json: %s", exc)
        return entry
|
||||
|
||||
def _sync_device_code_entry_to_auth_store(self, entry: PooledCredential) -> None:
|
||||
"""Write refreshed pool entry tokens back to auth.json providers.
|
||||
|
||||
@@ -616,9 +561,6 @@ class CredentialPool:
|
||||
last_refresh=refreshed.get("last_refresh"),
|
||||
)
|
||||
elif self.provider == "nous":
|
||||
synced = self._sync_nous_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
nous_state = {
|
||||
"access_token": entry.access_token,
|
||||
"refresh_token": entry.refresh_token,
|
||||
@@ -693,26 +635,6 @@ class CredentialPool:
|
||||
# Credentials file had a valid (non-expired) token — use it directly
|
||||
logger.debug("Credentials file has valid token, using without refresh")
|
||||
return synced
|
||||
# For nous: another process may have consumed the refresh token
|
||||
# between our proactive sync and the HTTP call. Re-sync from
|
||||
# auth.json and adopt the fresh tokens if available.
|
||||
if self.provider == "nous":
|
||||
synced = self._sync_nous_entry_from_auth_store(entry)
|
||||
if synced.refresh_token != entry.refresh_token:
|
||||
logger.debug("Nous refresh failed but auth.json has newer tokens — adopting")
|
||||
updated = replace(
|
||||
synced,
|
||||
last_status=STATUS_OK,
|
||||
last_status_at=None,
|
||||
last_error_code=None,
|
||||
last_error_reason=None,
|
||||
last_error_message=None,
|
||||
last_error_reset_at=None,
|
||||
)
|
||||
self._replace_entry(synced, updated)
|
||||
self._persist()
|
||||
self._sync_device_code_entry_to_auth_store(updated)
|
||||
return updated
|
||||
self._mark_exhausted(entry, None)
|
||||
return None
|
||||
|
||||
@@ -776,17 +698,6 @@ class CredentialPool:
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
# For nous entries, sync from auth.json before status checks.
|
||||
# Another process may have successfully refreshed via
|
||||
# resolve_nous_runtime_credentials(), making this entry's
|
||||
# exhausted status stale.
|
||||
if (self.provider == "nous"
|
||||
and entry.source == "device_code"
|
||||
and entry.last_status == STATUS_EXHAUSTED):
|
||||
synced = self._sync_nous_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
if entry.last_status == STATUS_EXHAUSTED:
|
||||
exhausted_until = _exhausted_until(entry)
|
||||
if exhausted_until is not None and now < exhausted_until:
|
||||
@@ -828,11 +739,8 @@ class CredentialPool:
|
||||
|
||||
if self._strategy == STRATEGY_LEAST_USED and len(available) > 1:
|
||||
entry = min(available, key=lambda e: e.request_count)
|
||||
# Increment usage counter so subsequent selections distribute load
|
||||
updated = replace(entry, request_count=entry.request_count + 1)
|
||||
self._replace_entry(entry, updated)
|
||||
self._current_id = entry.id
|
||||
return updated
|
||||
return entry
|
||||
|
||||
if self._strategy == STRATEGY_ROUND_ROBIN and len(available) > 1:
|
||||
entry = available[0]
|
||||
@@ -1148,18 +1056,6 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
"inference_base_url": state.get("inference_base_url"),
|
||||
"agent_key": state.get("agent_key"),
|
||||
"agent_key_expires_at": state.get("agent_key_expires_at"),
|
||||
# Carry the mint/refresh timestamps into the pool so
|
||||
# freshness-sensitive consumers (self-heal hooks, pool
|
||||
# pruning by age) can distinguish just-minted credentials
|
||||
# from stale ones. Without these, fresh device_code
|
||||
# entries get obtained_at=None and look older than they
|
||||
# are (#15099).
|
||||
"obtained_at": state.get("obtained_at"),
|
||||
"expires_in": state.get("expires_in"),
|
||||
"agent_key_id": state.get("agent_key_id"),
|
||||
"agent_key_expires_in": state.get("agent_key_expires_in"),
|
||||
"agent_key_reused": state.get("agent_key_reused"),
|
||||
"agent_key_obtained_at": state.get("agent_key_obtained_at"),
|
||||
"tls": state.get("tls") if isinstance(state.get("tls"), dict) else None,
|
||||
"label": seeded_label,
|
||||
},
|
||||
@@ -1170,10 +1066,9 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
# env vars (COPILOT_GITHUB_TOKEN / GH_TOKEN). They don't live in
|
||||
# the auth store or credential pool, so we resolve them here.
|
||||
try:
|
||||
from hermes_cli.copilot_auth import resolve_copilot_token, get_copilot_api_token
|
||||
from hermes_cli.copilot_auth import resolve_copilot_token
|
||||
token, source = resolve_copilot_token()
|
||||
if token:
|
||||
api_token = get_copilot_api_token(token)
|
||||
source_name = "gh_cli" if "gh" in source.lower() else f"env:{source}"
|
||||
if not _is_suppressed(provider, source_name):
|
||||
active_sources.add(source_name)
|
||||
@@ -1185,7 +1080,7 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
{
|
||||
"source": source_name,
|
||||
"auth_type": AUTH_TYPE_API_KEY,
|
||||
"access_token": api_token,
|
||||
"access_token": token,
|
||||
"base_url": pconfig.inference_base_url if pconfig else "",
|
||||
"label": source,
|
||||
},
|
||||
|
||||
@@ -343,11 +343,6 @@ def classify_api_error(
|
||||
"""
|
||||
status_code = _extract_status_code(error)
|
||||
error_type = type(error).__name__
|
||||
# Copilot/GitHub Models RateLimitError may not set .status_code; force 429
|
||||
# so downstream rate-limit handling (classifier reason, pool rotation,
|
||||
# fallback gating) fires correctly instead of misclassifying as generic.
|
||||
if status_code is None and error_type == "RateLimitError":
|
||||
status_code = 429
|
||||
body = _extract_error_body(error)
|
||||
error_code = _extract_error_code(body)
|
||||
|
||||
|
||||
@@ -44,97 +44,6 @@ def is_native_gemini_base_url(base_url: str) -> bool:
|
||||
return not normalized.endswith("/openai")
|
||||
|
||||
|
||||
def probe_gemini_tier(
    api_key: str,
    base_url: str = DEFAULT_GEMINI_BASE_URL,
    *,
    model: str = "gemini-2.5-flash",
    timeout: float = 10.0,
) -> str:
    """Classify a Google AI Studio API key as ``"free"``, ``"paid"`` or ``"unknown"``.

    A minimal ``generateContent`` request (one output token) is POSTed to
    ``{base_url}/models/{model}:generateContent`` with the key passed as the
    ``key`` query parameter. A trailing ``/openai`` on the base URL is removed
    first.

    Decision order:

    1. Blank key or any exception while sending the request -> ``"unknown"``.
    2. A parseable ``x-ratelimit-limit-requests-per-day`` response header:
       ``<= 1000`` -> ``"free"``, otherwise ``"paid"``.
    3. HTTP 429: ``"free"`` when the body mentions ``free_tier``
       (case-insensitive), otherwise ``"paid"``.
    4. Any 2xx status -> ``"paid"``.
    5. Everything else -> ``"unknown"``; callers should not block on it.

    Args:
        api_key: The key to probe; surrounding whitespace is ignored.
        base_url: API root; falls back to ``DEFAULT_GEMINI_BASE_URL`` when empty.
        model: Model name interpolated into the request URL.
        timeout: Timeout handed to ``httpx.Client``.
    """
    stripped_key = (api_key or "").strip()
    if not stripped_key:
        return "unknown"

    root = str(base_url or DEFAULT_GEMINI_BASE_URL).strip().rstrip("/") or DEFAULT_GEMINI_BASE_URL
    openai_suffix = "/openai"
    if root.lower().endswith(openai_suffix):
        root = root[: -len(openai_suffix)]

    endpoint = f"{root}/models/{model}:generateContent"
    request_body = {
        "contents": [{"role": "user", "parts": [{"text": "hi"}]}],
        "generationConfig": {"maxOutputTokens": 1},
    }

    try:
        with httpx.Client(timeout=timeout) as session:
            reply = session.post(
                endpoint,
                params={"key": stripped_key},
                json=request_body,
                headers={"Content-Type": "application/json"},
            )
    except Exception as exc:
        # Best-effort probe: a transport failure must never block the caller.
        logger.debug("probe_gemini_tier: network error: %s", exc)
        return "unknown"

    lowered_headers = {name.lower(): value for name, value in reply.headers.items()}
    daily_cap_raw = lowered_headers.get("x-ratelimit-limit-requests-per-day")
    if daily_cap_raw:
        try:
            daily_cap = int(daily_cap_raw)
        except (TypeError, ValueError):
            daily_cap = None
        # Original author's note — published free-tier daily caps (Dec 2025):
        #   gemini-2.5-pro: 100, gemini-2.5-flash: 250, flash-lite: 1000
        # Tier 1 starts at ~1500+ for Flash, so <= 1000 is treated as free.
        if daily_cap is not None:
            return "free" if daily_cap <= 1000 else "paid"

    status = reply.status_code
    if status == 429:
        try:
            body = reply.text or ""
        except Exception:
            body = ""
        return "free" if "free_tier" in body.lower() else "paid"

    return "paid" if 200 <= status < 300 else "unknown"
|
||||
|
||||
|
||||
def is_free_tier_quota_error(error_message: str) -> bool:
    """Report whether *error_message* contains the ``free_tier`` marker.

    The match is a case-insensitive substring test. An empty or ``None``
    message yields ``False``.
    """
    return bool(error_message) and "free_tier" in error_message.lower()
|
||||
|
||||
|
||||
_FREE_TIER_GUIDANCE = (
|
||||
"\n\nYour Google API key is on the free tier (<= 250 requests/day for "
|
||||
"gemini-2.5-flash). Hermes typically makes 3-10 API calls per user turn, "
|
||||
"so the free tier is exhausted in a handful of messages and cannot sustain "
|
||||
"an agent session. Enable billing on your Google Cloud project and "
|
||||
"regenerate the key in a billing-enabled project: "
|
||||
"https://aistudio.google.com/apikey"
|
||||
)
|
||||
|
||||
|
||||
class GeminiAPIError(Exception):
|
||||
"""Error shape compatible with Hermes retry/error classification."""
|
||||
|
||||
@@ -741,12 +650,6 @@ def gemini_http_error(response: httpx.Response) -> GeminiAPIError:
|
||||
else:
|
||||
message = f"Gemini returned HTTP {status}: {body_text[:500]}"
|
||||
|
||||
# Free-tier quota exhaustion -> append actionable guidance so users who
|
||||
# bypassed the setup wizard (direct GOOGLE_API_KEY in .env) still learn
|
||||
# that the free tier cannot sustain an agent session.
|
||||
if status == 429 and is_free_tier_quota_error(err_message or body_text):
|
||||
message = message + _FREE_TIER_GUIDANCE
|
||||
|
||||
return GeminiAPIError(
|
||||
message,
|
||||
code=code,
|
||||
@@ -801,13 +704,6 @@ class GeminiNativeClient:
|
||||
http_client: Optional[httpx.Client] = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
if not (api_key or "").strip():
|
||||
raise RuntimeError(
|
||||
"Gemini native client requires an API key, but none was provided. "
|
||||
"Set GOOGLE_API_KEY or GEMINI_API_KEY in your environment / ~/.hermes/.env "
|
||||
"(get one at https://aistudio.google.com/app/apikey), or run `hermes setup` "
|
||||
"to configure the Google provider."
|
||||
)
|
||||
self.api_key = api_key
|
||||
normalized_base = (base_url or DEFAULT_GEMINI_BASE_URL).rstrip("/")
|
||||
if normalized_base.endswith("/openai"):
|
||||
|
||||
@@ -73,20 +73,6 @@ def sanitize_gemini_schema(schema: Any) -> Dict[str, Any]:
|
||||
]
|
||||
continue
|
||||
cleaned[key] = value
|
||||
|
||||
# Gemini's Schema validator requires every ``enum`` entry to be a string,
|
||||
# even when the parent ``type`` is ``integer`` / ``number`` / ``boolean``.
|
||||
# OpenAI / OpenRouter / Anthropic accept typed enums (e.g. Discord's
|
||||
# ``auto_archive_duration: {type: integer, enum: [60, 1440, 4320, 10080]}``),
|
||||
# so we only drop the ``enum`` when it would collide with Gemini's rule.
|
||||
# Keeping ``type: integer`` plus the human-readable description gives the
|
||||
# model enough guidance; the tool handler still validates the value.
|
||||
enum_val = cleaned.get("enum")
|
||||
type_val = cleaned.get("type")
|
||||
if isinstance(enum_val, list) and type_val in {"integer", "number", "boolean"}:
|
||||
if any(not isinstance(item, str) for item in enum_val):
|
||||
cleaned.pop("enum", None)
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
|
||||
+6
-72
@@ -6,7 +6,6 @@ and run_agent.py for pre-flight context checks.
|
||||
|
||||
import ipaddress
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -22,25 +21,6 @@ from hermes_constants import OPENROUTER_MODELS_URL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _resolve_requests_verify() -> bool | str:
|
||||
"""Resolve SSL verify setting for `requests` calls from env vars.
|
||||
|
||||
The `requests` library only honours REQUESTS_CA_BUNDLE / CURL_CA_BUNDLE
|
||||
by default. Hermes also honours HERMES_CA_BUNDLE (its own convention)
|
||||
and SSL_CERT_FILE (used by the stdlib `ssl` module and by httpx), so
|
||||
that a single env var can cover both `requests` and `httpx` callsites
|
||||
inside the same process.
|
||||
|
||||
Returns either a filesystem path to a CA bundle, or True to defer to
|
||||
the requests default (certifi).
|
||||
"""
|
||||
for env_var in ("HERMES_CA_BUNDLE", "REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"):
|
||||
val = os.getenv(env_var)
|
||||
if val and os.path.isfile(val):
|
||||
return val
|
||||
return True
|
||||
|
||||
# Provider names that can appear as a "provider:" prefix before a model ID.
|
||||
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
|
||||
# are preserved so the full model name reaches cache lookups and server queries.
|
||||
@@ -515,7 +495,7 @@ def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any
|
||||
return _model_metadata_cache
|
||||
|
||||
try:
|
||||
response = requests.get(OPENROUTER_MODELS_URL, timeout=10, verify=_resolve_requests_verify())
|
||||
response = requests.get(OPENROUTER_MODELS_URL, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
@@ -582,7 +562,6 @@ def fetch_endpoint_model_metadata(
|
||||
server_url.rstrip("/") + "/api/v1/models",
|
||||
headers=headers,
|
||||
timeout=10,
|
||||
verify=_resolve_requests_verify(),
|
||||
)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
@@ -631,7 +610,7 @@ def fetch_endpoint_model_metadata(
|
||||
for candidate in candidates:
|
||||
url = candidate.rstrip("/") + "/models"
|
||||
try:
|
||||
response = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify())
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
cache: Dict[str, Dict[str, Any]] = {}
|
||||
@@ -662,10 +641,9 @@ def fetch_endpoint_model_metadata(
|
||||
try:
|
||||
# Try /v1/props first (current llama.cpp); fall back to /props for older builds
|
||||
base = candidate.rstrip("/").replace("/v1", "")
|
||||
_verify = _resolve_requests_verify()
|
||||
props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5, verify=_verify)
|
||||
props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5)
|
||||
if not props_resp.ok:
|
||||
props_resp = requests.get(base + "/props", headers=headers, timeout=5, verify=_verify)
|
||||
props_resp = requests.get(base + "/props", headers=headers, timeout=5)
|
||||
if props_resp.ok:
|
||||
props = props_resp.json()
|
||||
gen_settings = props.get("default_generation_settings", {})
|
||||
@@ -737,22 +715,6 @@ def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
|
||||
return cache.get(key)
|
||||
|
||||
|
||||
def _invalidate_cached_context_length(model: str, base_url: str) -> None:
    """Remove the ``{model}@{base_url}`` entry from the context-length cache.

    Does nothing when the entry is absent. Otherwise the remaining entries are
    rewritten to the cache file under the ``context_lengths`` key. Any failure
    while writing is logged at debug level and swallowed.
    """
    cache_key = f"{model}@{base_url}"
    entries = _load_context_cache()
    if cache_key not in entries:
        return

    entries.pop(cache_key)
    cache_file = _get_context_cache_path()
    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, "w") as handle:
            yaml.dump({"context_lengths": entries}, handle, default_flow_style=False)
    except Exception as err:
        logger.debug("Failed to invalidate context length cache entry %s: %s", cache_key, err)
|
||||
|
||||
|
||||
def get_next_probe_tier(current_length: int) -> Optional[int]:
|
||||
"""Return the next lower probe tier, or None if already at minimum."""
|
||||
for tier in CONTEXT_PROBE_TIERS:
|
||||
@@ -1030,7 +992,7 @@ def _query_anthropic_context_length(model: str, base_url: str, api_key: str) ->
|
||||
"x-api-key": api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
}
|
||||
resp = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify())
|
||||
resp = requests.get(url, headers=headers, timeout=10)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
data = resp.json()
|
||||
@@ -1092,7 +1054,6 @@ def _fetch_codex_oauth_context_lengths(access_token: str) -> Dict[str, int]:
|
||||
"https://chatgpt.com/backend-api/codex/models?client_version=1.0.0",
|
||||
headers={"Authorization": f"Bearer {access_token}"},
|
||||
timeout=10,
|
||||
verify=_resolve_requests_verify(),
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.debug(
|
||||
@@ -1221,21 +1182,7 @@ def get_model_context_length(
|
||||
if base_url:
|
||||
cached = get_cached_context_length(model, base_url)
|
||||
if cached is not None:
|
||||
# Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
|
||||
# resolved gpt-5.x to the direct-API value (e.g. 1.05M) via
|
||||
# models.dev and persisted it. Codex OAuth caps at 272K for every
|
||||
# slug, so any cached Codex entry at or above 400K is a leftover
|
||||
# from the old resolution path. Drop it and fall through to the
|
||||
# live /models probe in step 5 below.
|
||||
if provider == "openai-codex" and cached >= 400_000:
|
||||
logger.info(
|
||||
"Dropping stale Codex cache entry %s@%s -> %s (pre-fix value); "
|
||||
"re-resolving via live /models probe",
|
||||
model, base_url, f"{cached:,}",
|
||||
)
|
||||
_invalidate_cached_context_length(model, base_url)
|
||||
else:
|
||||
return cached
|
||||
return cached
|
||||
|
||||
# 2. Active endpoint metadata for truly custom/unknown endpoints.
|
||||
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
|
||||
@@ -1308,19 +1255,6 @@ def get_model_context_length(
|
||||
if inferred:
|
||||
effective_provider = inferred
|
||||
|
||||
# 5a. Copilot live /models API — max_prompt_tokens from the user's account.
|
||||
# This catches account-specific models (e.g. claude-opus-4.6-1m) that
|
||||
# don't exist in models.dev. For models that ARE in models.dev, this
|
||||
# returns the provider-enforced limit which is what users can actually use.
|
||||
if effective_provider in ("copilot", "copilot-acp", "github-copilot"):
|
||||
try:
|
||||
from hermes_cli.models import get_copilot_model_context
|
||||
ctx = get_copilot_model_context(model, api_key=api_key)
|
||||
if ctx:
|
||||
return ctx
|
||||
except Exception:
|
||||
pass # Fall through to models.dev
|
||||
|
||||
if effective_provider == "nous":
|
||||
ctx = _resolve_nous_context_length(model)
|
||||
if ctx:
|
||||
|
||||
+26
-2
@@ -1,13 +1,15 @@
|
||||
"""Shared slash command helpers for skills.
|
||||
"""Shared slash command helpers for skills and built-in prompt-style modes.
|
||||
|
||||
Shared between CLI (cli.py) and gateway (gateway/run.py) so both surfaces
|
||||
can invoke skills via /skill-name commands.
|
||||
can invoke skills via /skill-name commands and prompt-only built-ins like
|
||||
/plan.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
@@ -16,6 +18,7 @@ from hermes_constants import display_hermes_home
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_skill_commands: Dict[str, Dict[str, Any]] = {}
|
||||
_PLAN_SLUG_RE = re.compile(r"[^a-z0-9]+")
|
||||
# Patterns for sanitizing skill names into clean hyphen-separated slugs.
|
||||
_SKILL_INVALID_CHARS = re.compile(r"[^a-z0-9-]")
|
||||
_SKILL_MULTI_HYPHEN = re.compile(r"-{2,}")
|
||||
@@ -125,6 +128,27 @@ def _expand_inline_shell(
|
||||
return _INLINE_SHELL_RE.sub(_replace, content)
|
||||
|
||||
|
||||
def build_plan_path(
|
||||
user_instruction: str = "",
|
||||
*,
|
||||
now: datetime | None = None,
|
||||
) -> Path:
|
||||
"""Return the default workspace-relative markdown path for a /plan invocation.
|
||||
|
||||
Relative paths are intentional: file tools are task/backend-aware and resolve
|
||||
them against the active working directory for local, docker, ssh, modal,
|
||||
daytona, and similar terminal backends. That keeps the plan with the active
|
||||
workspace instead of the Hermes host's global home directory.
|
||||
"""
|
||||
slug_source = (user_instruction or "").strip().splitlines()[0] if user_instruction else ""
|
||||
slug = _PLAN_SLUG_RE.sub("-", slug_source.lower()).strip("-")
|
||||
if slug:
|
||||
slug = "-".join(part for part in slug.split("-")[:8] if part)[:48].strip("-")
|
||||
slug = slug or "conversation-plan"
|
||||
timestamp = (now or datetime.now()).strftime("%Y-%m-%d_%H%M%S")
|
||||
return Path(".hermes") / "plans" / f"{timestamp}-{slug}.md"
|
||||
|
||||
|
||||
def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None:
|
||||
"""Load a skill by name/path and return (loaded_payload, skill_dir, display_name)."""
|
||||
raw_identifier = (skill_identifier or "").strip()
|
||||
|
||||
@@ -326,16 +326,6 @@ compression:
|
||||
# To pin a specific model/provider for compression summaries, use the
|
||||
# auxiliary section below (auxiliary.compression.provider / model).
|
||||
|
||||
# =============================================================================
|
||||
# Anthropic prompt caching TTL
|
||||
# =============================================================================
|
||||
# When prompt caching is active (Claude via OpenRouter or native Anthropic),
|
||||
# Anthropic supports two TTL tiers for cached prefixes: "5m" (default) and
|
||||
# "1h". Other values are ignored and "5m" is used.
|
||||
#
|
||||
prompt_caching:
|
||||
cache_ttl: "5m" # use "1h" for long sessions with pauses between turns
|
||||
|
||||
# =============================================================================
|
||||
# Auxiliary Models (Advanced — Experimental)
|
||||
# =============================================================================
|
||||
|
||||
@@ -1688,6 +1688,7 @@ def _looks_like_slash_command(text: str) -> bool:
|
||||
from agent.skill_commands import (
|
||||
scan_skill_commands,
|
||||
build_skill_invocation_message,
|
||||
build_plan_path,
|
||||
build_preloaded_skills_prompt,
|
||||
)
|
||||
|
||||
@@ -3083,8 +3084,6 @@ class HermesCLI:
|
||||
format_runtime_provider_error,
|
||||
)
|
||||
|
||||
_primary_exc = None
|
||||
runtime = None
|
||||
try:
|
||||
runtime = resolve_runtime_provider(
|
||||
requested=self.requested_provider,
|
||||
@@ -3092,34 +3091,7 @@ class HermesCLI:
|
||||
explicit_base_url=self._explicit_base_url,
|
||||
)
|
||||
except Exception as exc:
|
||||
_primary_exc = exc
|
||||
|
||||
# Primary provider auth failed — try fallback providers before giving up.
|
||||
if runtime is None and _primary_exc is not None:
|
||||
from hermes_cli.auth import AuthError
|
||||
if isinstance(_primary_exc, AuthError):
|
||||
_fb_chain = self._fallback_model if isinstance(self._fallback_model, list) else []
|
||||
for _fb in _fb_chain:
|
||||
_fb_provider = (_fb.get("provider") or "").strip().lower()
|
||||
_fb_model = (_fb.get("model") or "").strip()
|
||||
if not _fb_provider or not _fb_model:
|
||||
continue
|
||||
try:
|
||||
runtime = resolve_runtime_provider(requested=_fb_provider)
|
||||
logger.warning(
|
||||
"Primary provider auth failed (%s). Falling through to fallback: %s/%s",
|
||||
_primary_exc, _fb_provider, _fb_model,
|
||||
)
|
||||
_cprint(f"⚠️ Primary auth failed — switching to fallback: {_fb_provider} / {_fb_model}")
|
||||
self.requested_provider = _fb_provider
|
||||
self.model = _fb_model
|
||||
_primary_exc = None
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if runtime is None:
|
||||
message = format_runtime_provider_error(_primary_exc) if _primary_exc else "Provider resolution failed."
|
||||
message = format_runtime_provider_error(exc)
|
||||
ChatConsole().print(f"[bold red]{message}[/]")
|
||||
return False
|
||||
|
||||
@@ -3282,23 +3254,6 @@ class HermesCLI:
|
||||
_cprint(f"\033[1;31mSession not found: {self.session_id}{_RST}")
|
||||
_cprint(f"{_DIM}Use a session ID from a previous CLI run (hermes sessions list).{_RST}")
|
||||
return False
|
||||
# If the requested session is the (empty) head of a compression
|
||||
# chain, walk to the descendant that actually holds the messages.
|
||||
# See #15000 and SessionDB.resolve_resume_session_id.
|
||||
try:
|
||||
resolved_id = self._session_db.resolve_resume_session_id(self.session_id)
|
||||
except Exception:
|
||||
resolved_id = self.session_id
|
||||
if resolved_id and resolved_id != self.session_id:
|
||||
ChatConsole().print(
|
||||
f"[{_DIM}]Session {_escape(self.session_id)} was compressed into "
|
||||
f"{_escape(resolved_id)}; resuming the descendant with your "
|
||||
f"transcript.[/]"
|
||||
)
|
||||
self.session_id = resolved_id
|
||||
resolved_meta = self._session_db.get_session(self.session_id)
|
||||
if resolved_meta:
|
||||
session_meta = resolved_meta
|
||||
restored = self._session_db.get_messages_as_conversation(self.session_id)
|
||||
if restored:
|
||||
restored = [m for m in restored if m.get("role") != "session_meta"]
|
||||
@@ -3517,22 +3472,6 @@ class HermesCLI:
|
||||
)
|
||||
return False
|
||||
|
||||
# If the requested session is the (empty) head of a compression chain,
|
||||
# walk to the descendant that actually holds the messages. See #15000.
|
||||
try:
|
||||
resolved_id = self._session_db.resolve_resume_session_id(self.session_id)
|
||||
except Exception:
|
||||
resolved_id = self.session_id
|
||||
if resolved_id and resolved_id != self.session_id:
|
||||
self._console_print(
|
||||
f"[dim]Session {self.session_id} was compressed into "
|
||||
f"{resolved_id}; resuming the descendant with your transcript.[/]"
|
||||
)
|
||||
self.session_id = resolved_id
|
||||
resolved_meta = self._session_db.get_session(self.session_id)
|
||||
if resolved_meta:
|
||||
session_meta = resolved_meta
|
||||
|
||||
restored = self._session_db.get_messages_as_conversation(self.session_id)
|
||||
if restored:
|
||||
restored = [m for m in restored if m.get("role") != "session_meta"]
|
||||
@@ -4747,22 +4686,6 @@ class HermesCLI:
|
||||
_cprint(" Use /history or `hermes sessions list` to see available sessions.")
|
||||
return
|
||||
|
||||
# If the target is the empty head of a compression chain, redirect to
|
||||
# the descendant that actually holds the transcript. See #15000.
|
||||
try:
|
||||
resolved_id = self._session_db.resolve_resume_session_id(target_id)
|
||||
except Exception:
|
||||
resolved_id = target_id
|
||||
if resolved_id and resolved_id != target_id:
|
||||
_cprint(
|
||||
f" Session {target_id} was compressed into {resolved_id}; "
|
||||
f"resuming the descendant with your transcript."
|
||||
)
|
||||
target_id = resolved_id
|
||||
resolved_meta = self._session_db.get_session(target_id)
|
||||
if resolved_meta:
|
||||
session_meta = resolved_meta
|
||||
|
||||
if target_id == self.session_id:
|
||||
_cprint(" Already on that session.")
|
||||
return
|
||||
@@ -5455,6 +5378,79 @@ class HermesCLI:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _show_model_and_providers(self):
    """Print the active model/provider and a listing of every known provider.

    Output goes to stdout via ``print``:

    1. A ``Current: <model> via <provider label>`` line. When the normalized
       provider is ``"auto"`` it is resolved through ``resolve_provider``;
       if that raises, ``"openrouter"`` is assumed.
    2. For each authenticated provider: its id, then either a pricing table
       (only for ``openrouter`` / ``nous`` when pricing data is returned), a
       plain list of curated model ids, or a hint to use ``hermes model``.
    3. A single line naming the providers that are not authenticated.
    """
    from hermes_cli.models import (
        curated_models_for_provider, list_available_providers,
        normalize_provider, _PROVIDER_LABELS,
        get_pricing_for_provider, format_model_pricing_table,
    )
    from hermes_cli.auth import resolve_provider as _resolve_provider

    # Work out which provider is actually in effect.
    active_id = normalize_provider(self.provider)
    if active_id == "auto":
        try:
            active_id = _resolve_provider(
                self.requested_provider,
                explicit_api_key=self._explicit_api_key,
                explicit_base_url=self._explicit_base_url,
            )
        except Exception:
            active_id = "openrouter"
    active_label = _PROVIDER_LABELS.get(active_id, active_id)

    print(f"\n Current: {self.model} via {active_label}")
    print()

    # Split the provider catalogue by authentication state.
    catalogue = list_available_providers()
    configured = [info for info in catalogue if info["authenticated"]]
    missing = [info for info in catalogue if not info["authenticated"]]

    if configured:
        print(" Authenticated providers & models:")
        for info in configured:
            pid = info["id"]
            active = pid == active_id
            print(f" [{pid}]{' ← active' if active else ''}")

            models = curated_models_for_provider(pid)
            # Pricing is only fetched for providers that support it.
            prices = get_pricing_for_provider(pid) if pid in ("openrouter", "nous") else {}

            if models and prices:
                highlighted = self.model if active else ""
                for row in format_model_pricing_table(models, prices, current_model=highlighted):
                    print(row)
            elif models:
                for model_id, _desc in models:
                    suffix = " ← current" if (active and model_id == self.model) else ""
                    print(f" {model_id}{suffix}")
            else:
                if pid == "custom":
                    from hermes_cli.models import _get_custom_base_url
                    endpoint = _get_custom_base_url()
                    if endpoint:
                        print(f" endpoint: {endpoint}")
                    if active:
                        print(f" model: {self.model} ← current")
                print(" (use hermes model to change)")
            print()

    if missing:
        labels = ", ".join(info["label"] for info in missing)
        print(f" Not configured: {labels}")
        print(" Run: hermes setup")
        print()

    print(" To change model or provider, use: hermes model")
|
||||
|
||||
def _output_console(self):
|
||||
"""Use prompt_toolkit-safe Rich rendering once the TUI is live."""
|
||||
if getattr(self, "_app", None):
|
||||
@@ -6030,12 +6026,16 @@ class HermesCLI:
|
||||
self._handle_resume_command(cmd_original)
|
||||
elif canonical == "model":
|
||||
self._handle_model_switch(cmd_original)
|
||||
elif canonical == "provider":
|
||||
self._show_model_and_providers()
|
||||
elif canonical == "gquota":
|
||||
self._handle_gquota_command(cmd_original)
|
||||
|
||||
elif canonical == "personality":
|
||||
# Use original case (handler lowercases the personality name itself)
|
||||
self._handle_personality_command(cmd_original)
|
||||
elif canonical == "plan":
|
||||
self._handle_plan_command(cmd_original)
|
||||
elif canonical == "retry":
|
||||
retry_msg = self.retry_last()
|
||||
if retry_msg and hasattr(self, '_pending_input'):
|
||||
@@ -6270,6 +6270,32 @@ class HermesCLI:
|
||||
|
||||
return True
|
||||
|
||||
def _handle_plan_command(self, cmd: str):
    """Handle ``/plan [request]`` by queueing the bundled plan skill.

    Everything after the command word is treated as the user's instruction.
    A target path is derived with ``build_plan_path`` and passed to the skill
    as a runtime note, then the skill invocation message is put on
    ``self._pending_input``.

    Failure cases (each prints one red error line and returns):
    * the skill invocation message could not be built;
    * ``self._pending_input`` does not exist.

    Args:
        cmd: The raw command text, e.g. ``"/plan migrate the db"``.
    """
    parts = cmd.strip().split(maxsplit=1)
    user_instruction = parts[1].strip() if len(parts) > 1 else ""

    plan_path = build_plan_path(user_instruction)
    msg = build_skill_invocation_message(
        "/plan",
        user_instruction,
        task_id=self.session_id,
        runtime_note=(
            "Save the markdown plan with write_file to this exact relative path "
            f"inside the active workspace/backend cwd: {plan_path}"
        ),
    )

    if not msg:
        ChatConsole().print("[bold red]Failed to load the bundled /plan skill[/]")
        return

    # Verify the queue exists BEFORE announcing success; otherwise the user
    # would see "queued" immediately followed by "unavailable".
    if not hasattr(self, '_pending_input'):
        ChatConsole().print("[bold red]Plan mode unavailable: input queue not initialized[/]")
        return

    _cprint(f" 📝 Plan mode queued via skill. Markdown plan target: {plan_path}")
    self._pending_input.put(msg)
|
||||
|
||||
def _handle_background_command(self, cmd: str):
|
||||
"""Handle /background <prompt> — run a prompt in a separate background session.
|
||||
|
||||
|
||||
@@ -371,39 +371,6 @@ def save_jobs(jobs: List[Dict[str, Any]]):
|
||||
raise
|
||||
|
||||
|
||||
def _normalize_workdir(workdir: Optional[str]) -> Optional[str]:
|
||||
"""Normalize and validate a cron job workdir.
|
||||
|
||||
Rules:
|
||||
- Empty / None → None (feature off, preserves old behaviour).
|
||||
- ``~`` is expanded. Relative paths are rejected — cron jobs run detached
|
||||
from any shell cwd, so relative paths have no stable meaning.
|
||||
- The path must exist and be a directory at create/update time. We do
|
||||
NOT re-check at run time (a user might briefly unmount the dir; the
|
||||
scheduler will just fall back to old behaviour with a logged warning).
|
||||
|
||||
Returns the absolute path string, or None when disabled.
|
||||
Raises ValueError on invalid input.
|
||||
"""
|
||||
if workdir is None:
|
||||
return None
|
||||
raw = str(workdir).strip()
|
||||
if not raw:
|
||||
return None
|
||||
expanded = Path(raw).expanduser()
|
||||
if not expanded.is_absolute():
|
||||
raise ValueError(
|
||||
f"Cron workdir must be an absolute path (got {raw!r}). "
|
||||
f"Cron jobs run detached from any shell cwd, so relative paths are ambiguous."
|
||||
)
|
||||
resolved = expanded.resolve()
|
||||
if not resolved.exists():
|
||||
raise ValueError(f"Cron workdir does not exist: {resolved}")
|
||||
if not resolved.is_dir():
|
||||
raise ValueError(f"Cron workdir is not a directory: {resolved}")
|
||||
return str(resolved)
|
||||
|
||||
|
||||
def create_job(
|
||||
prompt: str,
|
||||
schedule: str,
|
||||
@@ -418,7 +385,6 @@ def create_job(
|
||||
base_url: Optional[str] = None,
|
||||
script: Optional[str] = None,
|
||||
enabled_toolsets: Optional[List[str]] = None,
|
||||
workdir: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Create a new cron job.
|
||||
@@ -441,12 +407,6 @@ def create_job(
|
||||
enabled_toolsets: Optional list of toolset names to restrict the agent to.
|
||||
When set, only tools from these toolsets are loaded, reducing
|
||||
token overhead. When omitted, all default tools are loaded.
|
||||
workdir: Optional absolute path. When set, the job runs as if launched
|
||||
from that directory: AGENTS.md / CLAUDE.md / .cursorrules from
|
||||
that directory are injected into the system prompt, and the
|
||||
terminal/file/code_exec tools use it as their working directory
|
||||
(via TERMINAL_CWD). When unset, the old behaviour is preserved
|
||||
(no context files injected, tools use the scheduler's cwd).
|
||||
|
||||
Returns:
|
||||
The created job dict
|
||||
@@ -479,7 +439,6 @@ def create_job(
|
||||
normalized_script = normalized_script or None
|
||||
normalized_toolsets = [str(t).strip() for t in enabled_toolsets if str(t).strip()] if enabled_toolsets else None
|
||||
normalized_toolsets = normalized_toolsets or None
|
||||
normalized_workdir = _normalize_workdir(workdir)
|
||||
|
||||
label_source = (prompt or (normalized_skills[0] if normalized_skills else None)) or "cron job"
|
||||
job = {
|
||||
@@ -512,7 +471,6 @@ def create_job(
|
||||
"deliver": deliver,
|
||||
"origin": origin, # Tracks where job was created for "origin" delivery
|
||||
"enabled_toolsets": normalized_toolsets,
|
||||
"workdir": normalized_workdir,
|
||||
}
|
||||
|
||||
jobs = load_jobs()
|
||||
@@ -546,15 +504,6 @@ def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]
|
||||
if job["id"] != job_id:
|
||||
continue
|
||||
|
||||
# Validate / normalize workdir if present in updates. Empty string or
|
||||
# None both mean "clear the field" (restore old behaviour).
|
||||
if "workdir" in updates:
|
||||
_wd = updates["workdir"]
|
||||
if _wd in (None, "", False):
|
||||
updates["workdir"] = None
|
||||
else:
|
||||
updates["workdir"] = _normalize_workdir(_wd)
|
||||
|
||||
updated = _apply_skill_fields({**job, **updates})
|
||||
schedule_changed = "schedule" in updates
|
||||
|
||||
|
||||
+9
-81
@@ -795,30 +795,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
chat_name=origin.get("chat_name", "") if origin else "",
|
||||
)
|
||||
|
||||
# Per-job working directory. When set (and validated at create/update
|
||||
# time), we point TERMINAL_CWD at it so:
|
||||
# - build_context_files_prompt() picks up AGENTS.md / CLAUDE.md /
|
||||
# .cursorrules from the job's project dir, AND
|
||||
# - the terminal, file, and code-exec tools run commands from there.
|
||||
#
|
||||
# tick() serializes workdir-jobs outside the parallel pool, so mutating
|
||||
# os.environ["TERMINAL_CWD"] here is safe for those jobs. For workdir-less
|
||||
# jobs we leave TERMINAL_CWD untouched — preserves the original behaviour
|
||||
# (skip_context_files=True, tools use whatever cwd the scheduler has).
|
||||
_job_workdir = (job.get("workdir") or "").strip() or None
|
||||
if _job_workdir and not Path(_job_workdir).is_dir():
|
||||
# Directory was removed between create-time validation and now. Log
|
||||
# and drop back to old behaviour rather than crashing the job.
|
||||
logger.warning(
|
||||
"Job '%s': configured workdir %r no longer exists — running without it",
|
||||
job_id, _job_workdir,
|
||||
)
|
||||
_job_workdir = None
|
||||
_prior_terminal_cwd = os.environ.get("TERMINAL_CWD", "_UNSET_")
|
||||
if _job_workdir:
|
||||
os.environ["TERMINAL_CWD"] = _job_workdir
|
||||
logger.info("Job '%s': using workdir %s", job_id, _job_workdir)
|
||||
|
||||
try:
|
||||
# Re-read .env and config.yaml fresh every run so provider/key
|
||||
# changes take effect without a gateway restart.
|
||||
@@ -895,7 +871,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
resolve_runtime_provider,
|
||||
format_runtime_provider_error,
|
||||
)
|
||||
from hermes_cli.auth import AuthError
|
||||
try:
|
||||
runtime_kwargs = {
|
||||
"requested": job.get("provider") or os.getenv("HERMES_INFERENCE_PROVIDER"),
|
||||
@@ -903,28 +878,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
if job.get("base_url"):
|
||||
runtime_kwargs["explicit_base_url"] = job.get("base_url")
|
||||
runtime = resolve_runtime_provider(**runtime_kwargs)
|
||||
except AuthError as auth_exc:
|
||||
# Primary provider auth failed — try fallback chain before giving up.
|
||||
logger.warning("Job '%s': primary auth failed (%s), trying fallback", job_id, auth_exc)
|
||||
fb = _cfg.get("fallback_providers") or _cfg.get("fallback_model")
|
||||
fb_list = (fb if isinstance(fb, list) else [fb]) if fb else []
|
||||
runtime = None
|
||||
for entry in fb_list:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
try:
|
||||
fb_kwargs = {"requested": entry.get("provider")}
|
||||
if entry.get("base_url"):
|
||||
fb_kwargs["explicit_base_url"] = entry["base_url"]
|
||||
if entry.get("api_key"):
|
||||
fb_kwargs["explicit_api_key"] = entry["api_key"]
|
||||
runtime = resolve_runtime_provider(**fb_kwargs)
|
||||
logger.info("Job '%s': fallback resolved to %s", job_id, runtime.get("provider"))
|
||||
break
|
||||
except Exception as fb_exc:
|
||||
logger.debug("Job '%s': fallback %s failed: %s", job_id, entry.get("provider"), fb_exc)
|
||||
if runtime is None:
|
||||
raise RuntimeError(format_runtime_provider_error(auth_exc)) from auth_exc
|
||||
except Exception as exc:
|
||||
message = format_runtime_provider_error(exc)
|
||||
raise RuntimeError(message) from exc
|
||||
@@ -967,10 +920,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
enabled_toolsets=_resolve_cron_enabled_toolsets(job, _cfg),
|
||||
disabled_toolsets=["cronjob", "messaging", "clarify"],
|
||||
quiet_mode=True,
|
||||
# When a workdir is configured, inject AGENTS.md / CLAUDE.md /
|
||||
# .cursorrules from that directory; otherwise preserve the old
|
||||
# behaviour (don't inject SOUL.md/AGENTS.md from the scheduler cwd).
|
||||
skip_context_files=not bool(_job_workdir),
|
||||
skip_context_files=True, # Don't inject SOUL.md/AGENTS.md from scheduler cwd
|
||||
skip_memory=True, # Cron system prompts would corrupt user representations
|
||||
platform="cron",
|
||||
session_id=_cron_session_id,
|
||||
@@ -1109,14 +1059,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
return False, output, "", error_msg
|
||||
|
||||
finally:
|
||||
# Restore TERMINAL_CWD to whatever it was before this job ran. We
|
||||
# only ever mutate it when the job has a workdir; see the setup block
|
||||
# at the top of run_job for the serialization guarantee.
|
||||
if _job_workdir:
|
||||
if _prior_terminal_cwd == "_UNSET_":
|
||||
os.environ.pop("TERMINAL_CWD", None)
|
||||
else:
|
||||
os.environ["TERMINAL_CWD"] = _prior_terminal_cwd
|
||||
# Clean up ContextVar session/delivery state for this job.
|
||||
clear_session_vars(_ctx_tokens)
|
||||
if _session_db:
|
||||
@@ -1244,28 +1186,14 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
|
||||
mark_job_run(job["id"], False, str(e))
|
||||
return False
|
||||
|
||||
# Partition due jobs: those with a per-job workdir mutate
|
||||
# os.environ["TERMINAL_CWD"] inside run_job, which is process-global —
|
||||
# so they MUST run sequentially to avoid corrupting each other. Jobs
|
||||
# without a workdir leave env untouched and stay parallel-safe.
|
||||
workdir_jobs = [j for j in due_jobs if (j.get("workdir") or "").strip()]
|
||||
parallel_jobs = [j for j in due_jobs if not (j.get("workdir") or "").strip()]
|
||||
|
||||
_results: list = []
|
||||
|
||||
# Sequential pass for workdir jobs.
|
||||
for job in workdir_jobs:
|
||||
_ctx = contextvars.copy_context()
|
||||
_results.append(_ctx.run(_process_job, job))
|
||||
|
||||
# Parallel pass for the rest — same behaviour as before.
|
||||
if parallel_jobs:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=_max_workers) as _tick_pool:
|
||||
_futures = []
|
||||
for job in parallel_jobs:
|
||||
_ctx = contextvars.copy_context()
|
||||
_futures.append(_tick_pool.submit(_ctx.run, _process_job, job))
|
||||
_results.extend(f.result() for f in _futures)
|
||||
# Run all due jobs concurrently, each in its own ContextVar copy
|
||||
# so session/delivery state stays isolated per-thread.
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=_max_workers) as _tick_pool:
|
||||
_futures = []
|
||||
for job in due_jobs:
|
||||
_ctx = contextvars.copy_context()
|
||||
_futures.append(_tick_pool.submit(_ctx.run, _process_job, job))
|
||||
_results = [f.result() for f in _futures]
|
||||
|
||||
return sum(_results)
|
||||
finally:
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
#
|
||||
# docker-compose.yml for Hermes Agent
|
||||
#
|
||||
# Usage:
|
||||
# HERMES_UID=$(id -u) HERMES_GID=$(id -g) docker compose up -d
|
||||
#
|
||||
# Set HERMES_UID / HERMES_GID to the host user that owns ~/.hermes so
|
||||
# files created inside the container stay readable/writable on the host.
|
||||
# The entrypoint remaps the internal `hermes` user to these values via
|
||||
# usermod/groupmod + gosu.
|
||||
#
|
||||
# Security notes:
|
||||
# - The dashboard service binds to 127.0.0.1 by default. It stores API
|
||||
# keys; exposing it on LAN without auth is unsafe. If you want remote
|
||||
# access, use an SSH tunnel or put it behind a reverse proxy that
|
||||
# adds authentication — do NOT pass --insecure --host 0.0.0.0.
|
||||
# - The gateway's API server is off unless you uncomment API_SERVER_KEY
|
||||
# and API_SERVER_HOST. See docs/user-guide/api-server.md before doing
|
||||
# this on an internet-facing host.
|
||||
#
|
||||
services:
|
||||
gateway:
|
||||
build: .
|
||||
image: hermes-agent
|
||||
container_name: hermes
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
volumes:
|
||||
- ~/.hermes:/opt/data
|
||||
environment:
|
||||
- HERMES_UID=${HERMES_UID:-10000}
|
||||
- HERMES_GID=${HERMES_GID:-10000}
|
||||
# To expose the OpenAI-compatible API server beyond localhost,
|
||||
# uncomment BOTH lines (API_SERVER_KEY is mandatory for auth):
|
||||
# - API_SERVER_HOST=0.0.0.0
|
||||
# - API_SERVER_KEY=${API_SERVER_KEY}
|
||||
command: ["gateway", "run"]
|
||||
|
||||
dashboard:
|
||||
image: hermes-agent
|
||||
container_name: hermes-dashboard
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
depends_on:
|
||||
- gateway
|
||||
volumes:
|
||||
- ~/.hermes:/opt/data
|
||||
environment:
|
||||
- HERMES_UID=${HERMES_UID:-10000}
|
||||
- HERMES_GID=${HERMES_GID:-10000}
|
||||
# Localhost-only. For remote access, tunnel via `ssh -L 9119:localhost:9119`.
|
||||
command: ["dashboard", "--host", "127.0.0.1", "--no-open"]
|
||||
+2
-11
@@ -22,18 +22,9 @@ if [ "$(id -u)" = "0" ]; then
|
||||
groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Fix ownership of the data volume. When HERMES_UID remaps the hermes user,
|
||||
# files created by previous runs (under the old UID) become inaccessible.
|
||||
# Always chown -R when UID was remapped; otherwise only if top-level is wrong.
|
||||
actual_hermes_uid=$(id -u hermes)
|
||||
needs_chown=false
|
||||
if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "10000" ]; then
|
||||
needs_chown=true
|
||||
elif [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
|
||||
needs_chown=true
|
||||
fi
|
||||
if [ "$needs_chown" = true ]; then
|
||||
echo "Fixing ownership of $HERMES_HOME to hermes ($actual_hermes_uid)"
|
||||
if [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
|
||||
echo "$HERMES_HOME is not owned by $actual_hermes_uid, fixing"
|
||||
# In rootless Podman the container's "root" is mapped to an unprivileged
|
||||
# host UID — chown will fail. That's fine: the volume is already owned
|
||||
# by the mapped user on the host side.
|
||||
|
||||
@@ -2440,6 +2440,9 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt: Optional[str] = None,
|
||||
chat_id_alt: Optional[str] = None,
|
||||
is_bot: bool = False,
|
||||
guild_id: Optional[str] = None,
|
||||
parent_chat_id: Optional[str] = None,
|
||||
message_id: Optional[str] = None,
|
||||
) -> SessionSource:
|
||||
"""Helper to build a SessionSource for this platform."""
|
||||
# Normalize empty topic to None
|
||||
@@ -2457,6 +2460,9 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt=user_id_alt,
|
||||
chat_id_alt=chat_id_alt,
|
||||
is_bot=is_bot,
|
||||
guild_id=str(guild_id) if guild_id else None,
|
||||
parent_chat_id=str(parent_chat_id) if parent_chat_id else None,
|
||||
message_id=str(message_id) if message_id else None,
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -2246,6 +2246,10 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
async def slash_usage(interaction: discord.Interaction):
|
||||
await self._run_simple_slash(interaction, "/usage")
|
||||
|
||||
@tree.command(name="provider", description="Show available providers")
|
||||
async def slash_provider(interaction: discord.Interaction):
|
||||
await self._run_simple_slash(interaction, "/provider")
|
||||
|
||||
@tree.command(name="help", description="Show available commands")
|
||||
async def slash_help(interaction: discord.Interaction):
|
||||
await self._run_simple_slash(interaction, "/help")
|
||||
@@ -2715,12 +2719,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
return os.getenv("DISCORD_REQUIRE_MENTION", "true").lower() not in ("false", "0", "no", "off")
|
||||
|
||||
def _discord_free_response_channels(self) -> set:
|
||||
"""Return Discord channel IDs where no bot mention is required.
|
||||
|
||||
A single ``"*"`` entry (either from a list or a comma-separated
|
||||
string) is preserved in the returned set so callers can short-circuit
|
||||
on wildcard membership, consistent with ``allowed_channels``.
|
||||
"""
|
||||
"""Return Discord channel IDs where no bot mention is required."""
|
||||
raw = self.config.extra.get("free_response_channels")
|
||||
if raw is None:
|
||||
raw = os.getenv("DISCORD_FREE_RESPONSE_CHANNELS", "")
|
||||
@@ -3213,14 +3212,14 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
allowed_channels_raw = os.getenv("DISCORD_ALLOWED_CHANNELS", "")
|
||||
if allowed_channels_raw:
|
||||
allowed_channels = {ch.strip() for ch in allowed_channels_raw.split(",") if ch.strip()}
|
||||
if "*" not in allowed_channels and not (channel_ids & allowed_channels):
|
||||
if not (channel_ids & allowed_channels):
|
||||
logger.debug("[%s] Ignoring message in non-allowed channel: %s", self.name, channel_ids)
|
||||
return
|
||||
|
||||
# Check ignored channels - never respond even when mentioned
|
||||
ignored_channels_raw = os.getenv("DISCORD_IGNORED_CHANNELS", "")
|
||||
ignored_channels = {ch.strip() for ch in ignored_channels_raw.split(",") if ch.strip()}
|
||||
if "*" in ignored_channels or (channel_ids & ignored_channels):
|
||||
if channel_ids & ignored_channels:
|
||||
logger.debug("[%s] Ignoring message in ignored channel: %s", self.name, channel_ids)
|
||||
return
|
||||
|
||||
@@ -3234,11 +3233,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
voice_linked_ids = {str(ch_id) for ch_id in self._voice_text_channels.values()}
|
||||
current_channel_id = str(message.channel.id)
|
||||
is_voice_linked_channel = current_channel_id in voice_linked_ids
|
||||
is_free_channel = (
|
||||
"*" in free_channels
|
||||
or bool(channel_ids & free_channels)
|
||||
or is_voice_linked_channel
|
||||
)
|
||||
is_free_channel = bool(channel_ids & free_channels) or is_voice_linked_channel
|
||||
|
||||
# Skip the mention check if the message is in a thread where
|
||||
# the bot has previously participated (auto-created or replied in).
|
||||
@@ -3261,6 +3256,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
if auto_thread and not skip_thread and not is_voice_linked_channel and not is_reply_message:
|
||||
thread = await self._auto_create_thread(message)
|
||||
if thread:
|
||||
parent_channel_id = str(message.channel.id)
|
||||
is_thread = True
|
||||
thread_id = str(thread.id)
|
||||
auto_threaded_channel = thread
|
||||
@@ -3320,6 +3316,9 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
thread_id=thread_id,
|
||||
chat_topic=chat_topic,
|
||||
is_bot=getattr(message.author, "bot", False),
|
||||
guild_id=str(message.guild.id) if message.guild else None,
|
||||
parent_chat_id=parent_channel_id,
|
||||
message_id=str(message.id),
|
||||
)
|
||||
|
||||
# Build media URLs -- download image attachments to local cache so the
|
||||
@@ -3871,15 +3870,6 @@ if DISCORD_AVAILABLE:
|
||||
|
||||
self.resolved = True
|
||||
model_id = interaction.data["values"][0]
|
||||
self.clear_items()
|
||||
await interaction.response.edit_message(
|
||||
embed=discord.Embed(
|
||||
title="⚙ Switching Model",
|
||||
description=f"Switching to `{model_id}`...",
|
||||
color=discord.Color.blue(),
|
||||
),
|
||||
view=None,
|
||||
)
|
||||
|
||||
try:
|
||||
result_text = await self.on_model_selected(
|
||||
@@ -3890,13 +3880,14 @@ if DISCORD_AVAILABLE:
|
||||
except Exception as exc:
|
||||
result_text = f"Error switching model: {exc}"
|
||||
|
||||
await interaction.edit_original_response(
|
||||
self.clear_items()
|
||||
await interaction.response.edit_message(
|
||||
embed=discord.Embed(
|
||||
title="⚙ Model Switched",
|
||||
description=result_text,
|
||||
color=discord.Color.green(),
|
||||
),
|
||||
view=None,
|
||||
view=self,
|
||||
)
|
||||
|
||||
async def _on_back(self, interaction: discord.Interaction):
|
||||
|
||||
+89
-115
@@ -14,7 +14,6 @@ Usage:
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -350,30 +349,16 @@ _AGENT_PENDING_SENTINEL = object()
|
||||
|
||||
|
||||
def _resolve_runtime_agent_kwargs() -> dict:
|
||||
"""Resolve provider credentials for gateway-created AIAgent instances.
|
||||
|
||||
If the primary provider fails with an authentication error, attempt to
|
||||
resolve credentials using the fallback provider chain from config.yaml
|
||||
before giving up.
|
||||
"""
|
||||
"""Resolve provider credentials for gateway-created AIAgent instances."""
|
||||
from hermes_cli.runtime_provider import (
|
||||
resolve_runtime_provider,
|
||||
format_runtime_provider_error,
|
||||
)
|
||||
from hermes_cli.auth import AuthError
|
||||
|
||||
try:
|
||||
runtime = resolve_runtime_provider(
|
||||
requested=os.getenv("HERMES_INFERENCE_PROVIDER"),
|
||||
)
|
||||
except AuthError as auth_exc:
|
||||
# Primary provider auth failed (expired token, revoked key, etc.).
|
||||
# Try the fallback provider chain before raising.
|
||||
logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc)
|
||||
fb_config = _try_resolve_fallback_provider()
|
||||
if fb_config is not None:
|
||||
return fb_config
|
||||
raise RuntimeError(format_runtime_provider_error(auth_exc)) from auth_exc
|
||||
except Exception as exc:
|
||||
raise RuntimeError(format_runtime_provider_error(exc)) from exc
|
||||
|
||||
@@ -388,48 +373,6 @@ def _resolve_runtime_agent_kwargs() -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _try_resolve_fallback_provider() -> dict | None:
    """Resolve agent credentials from the fallback provider chain in config.yaml.

    Reads ``fallback_providers`` (or, when that is missing/empty,
    ``fallback_model``) from ``config.yaml`` in the Hermes home directory.
    A single mapping is treated as a one-element list. Entries are tried in
    order; non-dict entries are skipped, and an entry whose resolution raises
    is logged at debug level and skipped.

    Returns:
        A dict with the keys ``api_key``, ``base_url``, ``provider``,
        ``api_mode``, ``command``, ``args`` and ``credential_pool`` taken from
        the first entry that resolves, or ``None`` when the config file is
        absent, declares no fallback, cannot be read, or no entry resolves.
    """
    from hermes_cli.runtime_provider import resolve_runtime_provider

    try:
        import yaml as _y

        cfg_path = _hermes_home / "config.yaml"
        if not cfg_path.exists():
            return None
        with open(cfg_path, encoding="utf-8") as _f:
            cfg = _y.safe_load(_f) or {}

        fb = cfg.get("fallback_providers") or cfg.get("fallback_model")
        if not fb:
            return None

        # Normalize to list
        fb_list = fb if isinstance(fb, list) else [fb]
        for entry in fb_list:
            if not isinstance(entry, dict):
                continue
            try:
                runtime = resolve_runtime_provider(
                    requested=entry.get("provider"),
                    explicit_base_url=entry.get("base_url"),
                    explicit_api_key=entry.get("api_key"),
                )
                logger.info("Fallback provider resolved: %s", runtime.get("provider"))
                return {
                    "api_key": runtime.get("api_key"),
                    "base_url": runtime.get("base_url"),
                    "provider": runtime.get("provider"),
                    "api_mode": runtime.get("api_mode"),
                    "command": runtime.get("command"),
                    "args": list(runtime.get("args") or []),
                    "credential_pool": runtime.get("credential_pool"),
                }
            except Exception as fb_exc:
                logger.debug("Fallback entry %s failed: %s", entry.get("provider"), fb_exc)
                continue
    except Exception as exc:
        # Still best-effort (the caller raises the original auth error when we
        # return None), but an unreadable or malformed config.yaml should be
        # visible in the logs instead of vanishing silently.
        logger.warning("Could not resolve fallback provider from config.yaml: %s", exc)
    return None
|
||||
|
||||
|
||||
def _build_media_placeholder(event) -> str:
|
||||
"""Build a text placeholder for media-only events so they aren't dropped.
|
||||
|
||||
@@ -2366,17 +2309,6 @@ class GatewayRunner:
|
||||
for key, entry in _expired_entries:
|
||||
try:
|
||||
await self._async_flush_memories(entry.session_id, key)
|
||||
try:
|
||||
from hermes_cli.plugins import invoke_hook as _invoke_hook
|
||||
_parts = key.split(":")
|
||||
_platform = _parts[2] if len(_parts) > 2 else ""
|
||||
_invoke_hook(
|
||||
"on_session_finalize",
|
||||
session_id=entry.session_id,
|
||||
platform=_platform,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
# Shut down memory provider and close tool resources
|
||||
# on the cached agent. Idle agents live in
|
||||
# _agent_cache (not _running_agents), so look there.
|
||||
@@ -3213,50 +3145,7 @@ class GatewayRunner:
|
||||
|
||||
# Internal events (e.g. background-process completion notifications)
|
||||
# are system-generated and must skip user authorization.
|
||||
is_internal = bool(getattr(event, "internal", False))
|
||||
|
||||
# Fire pre_gateway_dispatch plugin hook for user-originated messages.
|
||||
# Plugins receive the MessageEvent and may return a dict influencing flow:
|
||||
# {"action": "skip", "reason": ...} -> drop (no reply, plugin handled)
|
||||
# {"action": "rewrite", "text": ...} -> replace event.text, continue
|
||||
# {"action": "allow"} / None -> normal dispatch
|
||||
# Hook runs BEFORE auth so plugins can handle unauthorized senders
|
||||
# (e.g. customer handover ingest) without triggering the pairing flow.
|
||||
if not is_internal:
|
||||
try:
|
||||
from hermes_cli.plugins import invoke_hook as _invoke_hook
|
||||
_hook_results = _invoke_hook(
|
||||
"pre_gateway_dispatch",
|
||||
event=event,
|
||||
gateway=self,
|
||||
session_store=self.session_store,
|
||||
)
|
||||
except Exception as _hook_exc:
|
||||
logger.warning("pre_gateway_dispatch invocation failed: %s", _hook_exc)
|
||||
_hook_results = []
|
||||
|
||||
for _result in _hook_results:
|
||||
if not isinstance(_result, dict):
|
||||
continue
|
||||
_action = _result.get("action")
|
||||
if _action == "skip":
|
||||
logger.info(
|
||||
"pre_gateway_dispatch skip: reason=%s platform=%s chat=%s",
|
||||
_result.get("reason"),
|
||||
source.platform.value if source.platform else "unknown",
|
||||
source.chat_id or "unknown",
|
||||
)
|
||||
return None
|
||||
if _action == "rewrite":
|
||||
_new_text = _result.get("text")
|
||||
if isinstance(_new_text, str):
|
||||
event = dataclasses.replace(event, text=_new_text)
|
||||
source = event.source
|
||||
break
|
||||
if _action == "allow":
|
||||
break
|
||||
|
||||
if is_internal:
|
||||
if getattr(event, "internal", False):
|
||||
pass
|
||||
elif source.user_id is None:
|
||||
# Messages with no user identity (Telegram service messages,
|
||||
@@ -3553,7 +3442,7 @@ class GatewayRunner:
|
||||
# running-agent guard. Reject gracefully rather than falling
|
||||
# through to interrupt + discard. Without this, commands
|
||||
# like /model, /reasoning, /voice, /insights, /title,
|
||||
# /resume, /retry, /undo, /compress, /usage,
|
||||
# /resume, /retry, /undo, /compress, /usage, /provider,
|
||||
# /reload-mcp, /sethome, /reset (all registered as Discord
|
||||
# slash commands) would interrupt the agent AND get
|
||||
# silently discarded by the slash-command safety net,
|
||||
@@ -3740,9 +3629,34 @@ class GatewayRunner:
|
||||
if canonical == "model":
|
||||
return await self._handle_model_command(event)
|
||||
|
||||
if canonical == "provider":
|
||||
return await self._handle_provider_command(event)
|
||||
|
||||
if canonical == "personality":
|
||||
return await self._handle_personality_command(event)
|
||||
|
||||
if canonical == "plan":
|
||||
try:
|
||||
from agent.skill_commands import build_plan_path, build_skill_invocation_message
|
||||
|
||||
user_instruction = event.get_command_args().strip()
|
||||
plan_path = build_plan_path(user_instruction)
|
||||
event.text = build_skill_invocation_message(
|
||||
"/plan",
|
||||
user_instruction,
|
||||
task_id=_quick_key,
|
||||
runtime_note=(
|
||||
"Save the markdown plan with write_file to this exact relative path "
|
||||
f"inside the active workspace/backend cwd: {plan_path}"
|
||||
),
|
||||
)
|
||||
if not event.text:
|
||||
return "Failed to load the bundled /plan skill."
|
||||
canonical = None
|
||||
except Exception as e:
|
||||
logger.exception("Failed to prepare /plan command")
|
||||
return f"Failed to enter plan mode: {e}"
|
||||
|
||||
if canonical == "retry":
|
||||
return await self._handle_retry_command(event)
|
||||
|
||||
@@ -5865,6 +5779,63 @@ class GatewayRunner:
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
async def _handle_provider_command(self, event: MessageEvent) -> str:
    """Handle /provider command - show the active and available providers.

    The active provider is read from ``model.provider`` in ``config.yaml``
    under the Hermes home directory and defaults to ``"openrouter"``. An
    ``"auto"`` value is resolved through ``hermes_cli.auth.resolve_provider``
    (falling back to ``"openrouter"`` if that raises). An ``openrouter``
    provider whose configured ``base_url`` does not mention ``openrouter.ai``
    is reported as ``"custom"``.

    Args:
        event: The triggering message event; it is not inspected here.

    Returns:
        Markdown text naming the current provider, one line per available
        provider (auth marker, id, label, aliases, active marker), and short
        hints for switching and setup.
    """
    import yaml
    from hermes_cli.models import (
        list_available_providers,
        normalize_provider,
        _PROVIDER_LABELS,
    )

    # Resolve current provider from config
    current_provider = "openrouter"
    model_cfg = {}
    config_path = _hermes_home / 'config.yaml'
    try:
        if config_path.exists():
            with open(config_path, encoding="utf-8") as f:
                cfg = yaml.safe_load(f) or {}
            model_cfg = cfg.get("model", {})
            if isinstance(model_cfg, dict):
                current_provider = model_cfg.get("provider", current_provider)
    except Exception as exc:
        # Best-effort: keep the default provider, but leave a trace so a
        # broken config.yaml is diagnosable instead of silently ignored.
        logger.debug("Could not read provider from %s: %s", config_path, exc)

    current_provider = normalize_provider(current_provider)
    if current_provider == "auto":
        try:
            from hermes_cli.auth import resolve_provider as _resolve_provider
            current_provider = _resolve_provider(current_provider)
        except Exception:
            current_provider = "openrouter"

    # Detect custom endpoint from config base_url
    if current_provider == "openrouter":
        _cfg_base = model_cfg.get("base_url", "") if isinstance(model_cfg, dict) else ""
        if _cfg_base and "openrouter.ai" not in _cfg_base:
            current_provider = "custom"

    current_label = _PROVIDER_LABELS.get(current_provider, current_provider)

    lines = [
        f"🔌 **Current provider:** {current_label} (`{current_provider}`)",
        "",
        "**Available providers:**",
    ]

    providers = list_available_providers()
    for p in providers:
        marker = " ← active" if p["id"] == current_provider else ""
        auth = "✅" if p["authenticated"] else "❌"
        aliases = f" _(also: {', '.join(p['aliases'])})_" if p["aliases"] else ""
        lines.append(f"{auth} `{p['id']}` — {p['label']}{aliases}{marker}")

    lines.append("")
    lines.append("Switch: `/model provider:model-name`")
    lines.append("Setup: `hermes setup`")
    return "\n".join(lines)
|
||||
|
||||
async def _handle_personality_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /personality command - list or set a personality."""
|
||||
import yaml
|
||||
@@ -7131,7 +7102,10 @@ class GatewayRunner:
|
||||
tmp_agent._print_fn = lambda *a, **kw: None
|
||||
|
||||
compressor = tmp_agent.context_compressor
|
||||
if not compressor.has_content_to_compress(msgs):
|
||||
compress_start = compressor.protect_first_n
|
||||
compress_start = compressor._align_boundary_forward(msgs, compress_start)
|
||||
compress_end = compressor._find_tail_cut_by_tokens(msgs, compress_start)
|
||||
if compress_start >= compress_end:
|
||||
return "Nothing to compress yet (the transcript is still all protected context)."
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
+41
-9
@@ -83,6 +83,9 @@ class SessionSource:
|
||||
user_id_alt: Optional[str] = None # Platform-specific stable alt ID (Signal UUID, Feishu union_id)
|
||||
chat_id_alt: Optional[str] = None # Signal group internal ID
|
||||
is_bot: bool = False # True when the message author is a bot/webhook (Discord)
|
||||
guild_id: Optional[str] = None # Discord guild / Slack workspace / Matrix server scope
|
||||
parent_chat_id: Optional[str] = None # Parent channel when chat_id refers to a thread
|
||||
message_id: Optional[str] = None # ID of the triggering message (for pin/reply/react)
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
@@ -120,8 +123,14 @@ class SessionSource:
|
||||
d["user_id_alt"] = self.user_id_alt
|
||||
if self.chat_id_alt:
|
||||
d["chat_id_alt"] = self.chat_id_alt
|
||||
if self.guild_id:
|
||||
d["guild_id"] = self.guild_id
|
||||
if self.parent_chat_id:
|
||||
d["parent_chat_id"] = self.parent_chat_id
|
||||
if self.message_id:
|
||||
d["message_id"] = self.message_id
|
||||
return d
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "SessionSource":
|
||||
return cls(
|
||||
@@ -135,6 +144,9 @@ class SessionSource:
|
||||
chat_topic=data.get("chat_topic"),
|
||||
user_id_alt=data.get("user_id_alt"),
|
||||
chat_id_alt=data.get("chat_id_alt"),
|
||||
guild_id=data.get("guild_id"),
|
||||
parent_chat_id=data.get("parent_chat_id"),
|
||||
message_id=data.get("message_id"),
|
||||
)
|
||||
|
||||
|
||||
@@ -273,14 +285,34 @@ def build_session_context_prompt(
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
elif context.source.platform == Platform.DISCORD:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
# The discord tool self-gates on DISCORD_BOT_TOKEN at registry
|
||||
# check time. Match that condition so the prompt stays honest:
|
||||
# with a token the agent has fetch_messages/search_members/
|
||||
# create_thread (and optionally discord_admin) and should know
|
||||
# the IDs it can call them with; without one it really is
|
||||
# limited to reading/replying via the gateway.
|
||||
if (os.environ.get("DISCORD_BOT_TOKEN") or "").strip():
|
||||
src = context.source
|
||||
id_lines = ["", "**Discord IDs (for the `discord` / `discord_admin` tools):**"]
|
||||
if src.guild_id:
|
||||
id_lines.append(f" - Guild: `{src.guild_id}`")
|
||||
if src.thread_id and src.parent_chat_id:
|
||||
id_lines.append(f" - Parent channel: `{src.parent_chat_id}`")
|
||||
id_lines.append(f" - Thread: `{src.thread_id}` (use as `channel_id` for fetch_messages etc.)")
|
||||
else:
|
||||
id_lines.append(f" - Channel: `{src.chat_id}`")
|
||||
if src.message_id:
|
||||
id_lines.append(f" - Triggering message: `{src.message_id}`")
|
||||
lines.extend(id_lines)
|
||||
else:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
|
||||
# Connected platforms
|
||||
platforms_list = ["local (files on this machine)"]
|
||||
|
||||
+98
-910
File diff suppressed because it is too large
Load Diff
@@ -110,40 +110,18 @@ def _display_source(source: str) -> str:
|
||||
return source.split(":", 1)[1] if source.startswith("manual:") else source
|
||||
|
||||
|
||||
def _classify_exhausted_status(entry) -> tuple[str, bool]:
|
||||
code = getattr(entry, "last_error_code", None)
|
||||
reason = str(getattr(entry, "last_error_reason", "") or "").strip().lower()
|
||||
message = str(getattr(entry, "last_error_message", "") or "").strip().lower()
|
||||
|
||||
if code == 429 or any(token in reason for token in ("rate_limit", "usage_limit", "quota", "exhausted")) or any(
|
||||
token in message for token in ("rate limit", "usage limit", "quota", "too many requests")
|
||||
):
|
||||
return "rate-limited", True
|
||||
|
||||
if code in {401, 403} or any(token in reason for token in ("invalid_token", "invalid_grant", "unauthorized", "forbidden", "auth")) or any(
|
||||
token in message for token in ("unauthorized", "forbidden", "expired", "revoked", "invalid token", "authentication")
|
||||
):
|
||||
return "auth failed", False
|
||||
|
||||
return "exhausted", True
|
||||
|
||||
|
||||
|
||||
def _format_exhausted_status(entry) -> str:
|
||||
if entry.last_status != STATUS_EXHAUSTED:
|
||||
return ""
|
||||
label, show_retry_window = _classify_exhausted_status(entry)
|
||||
reason = getattr(entry, "last_error_reason", None)
|
||||
reason_text = f" {reason}" if isinstance(reason, str) and reason.strip() else ""
|
||||
code = f" ({entry.last_error_code})" if entry.last_error_code else ""
|
||||
if not show_retry_window:
|
||||
return f" {label}{reason_text}{code} (re-auth may be required)"
|
||||
exhausted_until = _exhausted_until(entry)
|
||||
if exhausted_until is None:
|
||||
return f" {label}{reason_text}{code}"
|
||||
return f" exhausted{reason_text}{code}"
|
||||
remaining = max(0, int(math.ceil(exhausted_until - time.time())))
|
||||
if remaining <= 0:
|
||||
return f" {label}{reason_text}{code} (ready to retry)"
|
||||
return f" exhausted{reason_text}{code} (ready to retry)"
|
||||
minutes, seconds = divmod(remaining, 60)
|
||||
hours, minutes = divmod(minutes, 60)
|
||||
days, hours = divmod(hours, 24)
|
||||
@@ -155,7 +133,7 @@ def _format_exhausted_status(entry) -> str:
|
||||
wait = f"{minutes}m {seconds}s"
|
||||
else:
|
||||
wait = f"{seconds}s"
|
||||
return f" {label}{reason_text}{code} ({wait} left)"
|
||||
return f" exhausted{reason_text}{code} ({wait} left)"
|
||||
|
||||
|
||||
def auth_add_command(args) -> None:
|
||||
@@ -408,44 +386,6 @@ def auth_reset_command(args) -> None:
|
||||
print(f"Reset status on {count} {provider} credentials")
|
||||
|
||||
|
||||
def auth_status_command(args) -> None:
    """Print the stored auth status for the provider named in ``args.provider``.

    Exits via ``SystemExit`` when no provider is given. For a logged-out
    provider the status' ``error`` value (if any) is shown in parentheses;
    for a logged-in provider each non-empty detail field is printed on its
    own indented line.
    """
    provider = _normalize_provider(getattr(args, "provider", "") or "")
    if not provider:
        raise SystemExit("Provider is required. Example: `hermes auth status spotify`.")

    status = auth_mod.get_auth_status(provider)

    if status.get("logged_in"):
        print(f"{provider}: logged in")
        detail_keys = ("auth_type", "client_id", "redirect_uri", "scope", "expires_at", "api_base_url")
        for key in detail_keys:
            value = status.get(key)
            if value:
                print(f"  {key}: {value}")
        return

    reason = status.get("error")
    print(f"{provider}: logged out ({reason})" if reason else f"{provider}: logged out")
|
||||
|
||||
|
||||
def auth_logout_command(args) -> None:
    """Log out the provider in ``args.provider`` by delegating to ``auth_mod.logout_command``.

    Only the ``provider`` attribute is forwarded (``None`` when absent).
    """
    provider = getattr(args, "provider", None)
    auth_mod.logout_command(SimpleNamespace(provider=provider))
|
||||
|
||||
|
||||
def auth_spotify_command(args) -> None:
    """Dispatch ``hermes auth spotify [login|status|logout]``.

    ``args.spotify_action`` is trimmed and lowercased; a missing or empty
    value means ``login``. Any other unrecognised action exits via
    ``SystemExit``.
    """
    action = str(getattr(args, "spotify_action", "") or "login").strip().lower()

    if action in {"", "login"}:
        # Login receives the full args namespace.
        auth_mod.login_spotify_command(args)
    elif action == "status":
        auth_status_command(SimpleNamespace(provider="spotify"))
    elif action == "logout":
        auth_logout_command(SimpleNamespace(provider="spotify"))
    else:
        raise SystemExit(f"Unknown Spotify auth action: {action}")
|
||||
|
||||
|
||||
def _interactive_auth() -> None:
|
||||
"""Interactive credential pool management when `hermes auth` is called bare."""
|
||||
# Show current pool status first
|
||||
@@ -643,14 +583,5 @@ def auth_command(args) -> None:
|
||||
if action == "reset":
|
||||
auth_reset_command(args)
|
||||
return
|
||||
if action == "status":
|
||||
auth_status_command(args)
|
||||
return
|
||||
if action == "logout":
|
||||
auth_logout_command(args)
|
||||
return
|
||||
if action == "spotify":
|
||||
auth_spotify_command(args)
|
||||
return
|
||||
# No subcommand — launch interactive mode
|
||||
_interactive_auth()
|
||||
|
||||
@@ -77,7 +77,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
CommandDef("rollback", "List or restore filesystem checkpoints", "Session",
|
||||
args_hint="[number]"),
|
||||
CommandDef("snapshot", "Create or restore state snapshots of Hermes config/state", "Session",
|
||||
cli_only=True, aliases=("snap",), args_hint="[create|restore <id>|prune]"),
|
||||
aliases=("snap",), args_hint="[create|restore <id>|prune]"),
|
||||
CommandDef("stop", "Kill all running background processes", "Session"),
|
||||
CommandDef("approve", "Approve a pending dangerous command", "Session",
|
||||
gateway_only=True, args_hint="[session|always]"),
|
||||
@@ -104,8 +104,9 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
CommandDef("config", "Show current configuration", "Configuration",
|
||||
cli_only=True),
|
||||
CommandDef("model", "Switch model for this session", "Configuration", args_hint="[model] [--provider name] [--global]"),
|
||||
CommandDef("gquota", "Show Google Gemini Code Assist quota usage", "Info",
|
||||
cli_only=True),
|
||||
CommandDef("provider", "Show available providers and current provider",
|
||||
"Configuration"),
|
||||
CommandDef("gquota", "Show Google Gemini Code Assist quota usage", "Info"),
|
||||
|
||||
CommandDef("personality", "Set a predefined personality", "Configuration",
|
||||
args_hint="[name]"),
|
||||
@@ -123,7 +124,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
args_hint="[normal|fast|status]",
|
||||
subcommands=("normal", "fast", "status", "on", "off")),
|
||||
CommandDef("skin", "Show or change the display skin/theme", "Configuration",
|
||||
cli_only=True, args_hint="[name]"),
|
||||
args_hint="[name]"),
|
||||
CommandDef("voice", "Toggle voice mode", "Configuration",
|
||||
args_hint="[on|off|tts|status]", subcommands=("on", "off", "tts", "status")),
|
||||
|
||||
@@ -138,8 +139,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
CommandDef("cron", "Manage scheduled tasks", "Tools & Skills",
|
||||
cli_only=True, args_hint="[subcommand]",
|
||||
subcommands=("list", "add", "create", "edit", "pause", "resume", "run", "remove")),
|
||||
CommandDef("reload", "Reload .env variables into the running session", "Tools & Skills",
|
||||
cli_only=True),
|
||||
CommandDef("reload", "Reload .env variables into the running session", "Tools & Skills"),
|
||||
CommandDef("reload-mcp", "Reload MCP servers from config", "Tools & Skills",
|
||||
aliases=("reload_mcp",)),
|
||||
CommandDef("browser", "Connect browser tools to your live Chrome via CDP", "Tools & Skills",
|
||||
@@ -317,7 +317,7 @@ def should_bypass_active_session(command_name: str | None) -> bool:
|
||||
safety net in gateway.run discards any command text that reaches
|
||||
the pending queue — which meant a mid-run /model (or /reasoning,
|
||||
/voice, /insights, /title, /resume, /retry, /undo, /compress,
|
||||
/usage, /reload-mcp, /sethome, /reset) would silently
|
||||
/usage, /provider, /reload-mcp, /sethome, /reset) would silently
|
||||
interrupt the agent AND get discarded, producing a zero-char
|
||||
response. See issue #5057 / PRs #6252, #10370, #4665.
|
||||
|
||||
|
||||
@@ -521,12 +521,6 @@ DEFAULT_CONFIG = {
|
||||
|
||||
},
|
||||
|
||||
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
|
||||
# cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
|
||||
"prompt_caching": {
|
||||
"cache_ttl": "5m",
|
||||
},
|
||||
|
||||
# AWS Bedrock provider configuration.
|
||||
# Only used when model.provider is "bedrock".
|
||||
"bedrock": {
|
||||
@@ -839,7 +833,7 @@ DEFAULT_CONFIG = {
|
||||
"auto_thread": True, # Auto-create threads on @mention in channels (like Slack)
|
||||
"reactions": True, # Add 👀/✅/❌ reactions to messages during processing
|
||||
"channel_prompts": {}, # Per-channel ephemeral system prompts (forum parents apply to child threads)
|
||||
# discord_server tool: restrict which actions the agent may call.
|
||||
# discord / discord_admin tools: restrict which actions the agent may call.
|
||||
# Default (empty) = all actions allowed (subject to bot privileged intents).
|
||||
# Accepts comma-separated string ("list_guilds,list_channels,fetch_messages")
|
||||
# or YAML list. Unknown names are dropped with a warning at load time.
|
||||
|
||||
@@ -275,99 +275,6 @@ def copilot_device_code_login(
|
||||
return None
|
||||
|
||||
|
||||
# ─── Copilot Token Exchange ────────────────────────────────────────────────
|
||||
|
||||
# Module-level cache for exchanged Copilot API tokens.
|
||||
# Maps raw_token_fingerprint -> (api_token, expires_at_epoch).
|
||||
_jwt_cache: dict[str, tuple[str, float]] = {}
|
||||
_JWT_REFRESH_MARGIN_SECONDS = 120 # refresh 2 min before expiry
|
||||
|
||||
# Token exchange endpoint and headers (matching VS Code / Copilot CLI)
|
||||
_TOKEN_EXCHANGE_URL = "https://api.github.com/copilot_internal/v2/token"
|
||||
_EDITOR_VERSION = "vscode/1.104.1"
|
||||
_EXCHANGE_USER_AGENT = "GitHubCopilotChat/0.26.7"
|
||||
|
||||
|
||||
def _token_fingerprint(raw_token: str) -> str:
|
||||
"""Short fingerprint of a raw token for cache keying (avoids storing full token)."""
|
||||
import hashlib
|
||||
return hashlib.sha256(raw_token.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def exchange_copilot_token(raw_token: str, *, timeout: float = 10.0) -> tuple[str, float]:
    """Exchange a raw GitHub token for a short-lived Copilot API token.

    Calls ``GET https://api.github.com/copilot_internal/v2/token`` with
    the raw GitHub token and returns ``(api_token, expires_at)``.

    The returned token is a semicolon-separated string (not a standard JWT)
    used as ``Authorization: Bearer <token>`` for Copilot API requests.

    Results are cached in-process, keyed by a fingerprint of *raw_token*,
    and reused until ``_JWT_REFRESH_MARGIN_SECONDS`` before expiry.

    Args:
        raw_token: The raw GitHub token sent in the ``Authorization`` header.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(api_token, expires_at)`` where ``expires_at`` is an epoch
        timestamp. When the response carries no ``expires_at``, 30 minutes
        from now is assumed.

    Raises:
        ValueError: On any failure: request/decoding error, a response
            that is not a JSON object, an empty token, or an
            ``expires_at`` that cannot be converted to a number.
    """
    import urllib.request

    fp = _token_fingerprint(raw_token)

    # Check cache first
    cached = _jwt_cache.get(fp)
    if cached:
        api_token, expires_at = cached
        if time.time() < expires_at - _JWT_REFRESH_MARGIN_SECONDS:
            return api_token, expires_at

    req = urllib.request.Request(
        _TOKEN_EXCHANGE_URL,
        method="GET",
        headers={
            "Authorization": f"token {raw_token}",
            "User-Agent": _EXCHANGE_USER_AGENT,
            "Accept": "application/json",
            "Editor-Version": _EDITOR_VERSION,
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode())
    except Exception as exc:
        raise ValueError(f"Copilot token exchange failed: {exc}") from exc

    # json.loads can legitimately return a list/str/number/None; without this
    # guard the .get() calls below would raise AttributeError instead of the
    # documented ValueError.
    if not isinstance(data, dict):
        raise ValueError(
            f"Copilot token exchange returned unexpected payload type: {type(data).__name__}"
        )

    api_token = data.get("token", "")
    if not api_token:
        raise ValueError("Copilot token exchange returned empty token")

    raw_expiry = data.get("expires_at", 0)
    try:
        # Missing/zero expiry: assume a 30-minute lifetime.
        expires_at = float(raw_expiry) if raw_expiry else time.time() + 1800
    except (TypeError, ValueError) as exc:
        # float() raises TypeError for e.g. a list or dict; keep the contract.
        raise ValueError(
            f"Copilot token exchange returned invalid expires_at: {raw_expiry!r}"
        ) from exc

    _jwt_cache[fp] = (api_token, expires_at)
    logger.debug(
        "Copilot token exchanged, expires_at=%s",
        expires_at,
    )
    return api_token, expires_at
|
||||
|
||||
|
||||
def get_copilot_api_token(raw_token: str) -> str:
    """Return a Copilot API token for *raw_token*, falling back to the raw token.

    A falsy *raw_token* is returned unchanged without attempting an
    exchange. Otherwise ``exchange_copilot_token`` is tried; if it raises
    for any reason, the failure is logged at debug level and the raw token
    is returned instead.
    """
    if raw_token:
        try:
            exchanged, _expiry = exchange_copilot_token(raw_token)
        except Exception as exc:
            logger.debug("Copilot token exchange failed, using raw token: %s", exc)
        else:
            return exchanged
    return raw_token
|
||||
|
||||
|
||||
# ─── Copilot API Headers ───────────────────────────────────────────────────
|
||||
|
||||
def copilot_request_headers(
|
||||
|
||||
@@ -93,9 +93,6 @@ def cron_list(show_all: bool = False):
|
||||
script = job.get("script")
|
||||
if script:
|
||||
print(f" Script: {script}")
|
||||
workdir = job.get("workdir")
|
||||
if workdir:
|
||||
print(f" Workdir: {workdir}")
|
||||
|
||||
# Execution history
|
||||
last_status = job.get("last_status")
|
||||
@@ -171,7 +168,6 @@ def cron_create(args):
|
||||
skill=getattr(args, "skill", None),
|
||||
skills=_normalize_skills(getattr(args, "skill", None), getattr(args, "skills", None)),
|
||||
script=getattr(args, "script", None),
|
||||
workdir=getattr(args, "workdir", None),
|
||||
)
|
||||
if not result.get("success"):
|
||||
print(color(f"Failed to create job: {result.get('error', 'unknown error')}", Colors.RED))
|
||||
@@ -184,8 +180,6 @@ def cron_create(args):
|
||||
job_data = result.get("job", {})
|
||||
if job_data.get("script"):
|
||||
print(f" Script: {job_data['script']}")
|
||||
if job_data.get("workdir"):
|
||||
print(f" Workdir: {job_data['workdir']}")
|
||||
print(f" Next run: {result['next_run_at']}")
|
||||
return 0
|
||||
|
||||
@@ -224,7 +218,6 @@ def cron_edit(args):
|
||||
repeat=getattr(args, "repeat", None),
|
||||
skills=final_skills,
|
||||
script=getattr(args, "script", None),
|
||||
workdir=getattr(args, "workdir", None),
|
||||
)
|
||||
if not result.get("success"):
|
||||
print(color(f"Failed to update job: {result.get('error', 'unknown error')}", Colors.RED))
|
||||
@@ -240,8 +233,6 @@ def cron_edit(args):
|
||||
print(" Skills: none")
|
||||
if updated.get("script"):
|
||||
print(f" Script: {updated['script']}")
|
||||
if updated.get("workdir"):
|
||||
print(f" Workdir: {updated['workdir']}")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
+8
-29
@@ -29,7 +29,6 @@ if _env_path.exists():
|
||||
load_dotenv(PROJECT_ROOT / ".env", override=False, encoding="utf-8")
|
||||
|
||||
from hermes_cli.colors import Colors, color
|
||||
from hermes_cli.models import _HERMES_USER_AGENT
|
||||
from hermes_constants import OPENROUTER_MODELS_URL
|
||||
from utils import base_url_host_matches
|
||||
|
||||
@@ -296,33 +295,16 @@ def run_doctor(args):
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
from hermes_cli.config import get_compatible_custom_providers as _compatible_custom_providers
|
||||
from hermes_cli.providers import resolve_provider_full as _resolve_provider_full
|
||||
from hermes_cli.auth import resolve_provider as _resolve_provider
|
||||
except Exception:
|
||||
_compatible_custom_providers = None
|
||||
_resolve_provider_full = None
|
||||
|
||||
custom_providers = []
|
||||
if _compatible_custom_providers is not None:
|
||||
try:
|
||||
custom_providers = _compatible_custom_providers(cfg)
|
||||
except Exception:
|
||||
custom_providers = []
|
||||
|
||||
user_providers = cfg.get("providers")
|
||||
if isinstance(user_providers, dict):
|
||||
known_providers.update(str(name).strip().lower() for name in user_providers if str(name).strip())
|
||||
for entry in custom_providers:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
name = str(entry.get("name") or "").strip()
|
||||
if name:
|
||||
known_providers.add("custom:" + name.lower().replace(" ", "-"))
|
||||
_resolve_provider = None
|
||||
|
||||
canonical_provider = provider
|
||||
if provider and _resolve_provider_full is not None and provider != "auto":
|
||||
provider_def = _resolve_provider_full(provider, user_providers, custom_providers)
|
||||
canonical_provider = provider_def.id if provider_def is not None else None
|
||||
if provider and _resolve_provider is not None and provider != "auto":
|
||||
try:
|
||||
canonical_provider = _resolve_provider(provider)
|
||||
except Exception:
|
||||
canonical_provider = None
|
||||
|
||||
if provider and provider != "auto":
|
||||
if canonical_provider is None or (known_providers and canonical_provider not in known_providers):
|
||||
@@ -975,10 +957,7 @@ def run_doctor(args):
|
||||
if base_url_host_matches(_base, "api.kimi.com") and _base.rstrip("/").endswith("/coding"):
|
||||
_base = _base.rstrip("/") + "/v1"
|
||||
_url = (_base.rstrip("/") + "/models") if _base else _default_url
|
||||
_headers = {
|
||||
"Authorization": f"Bearer {_key}",
|
||||
"User-Agent": _HERMES_USER_AGENT,
|
||||
}
|
||||
_headers = {"Authorization": f"Bearer {_key}"}
|
||||
if base_url_host_matches(_base, "api.kimi.com"):
|
||||
_headers["User-Agent"] = "claude-code/0.1.0"
|
||||
_resp = httpx.get(
|
||||
|
||||
@@ -267,8 +267,6 @@ def run_dump(args):
|
||||
("ANTHROPIC_API_KEY", "anthropic"),
|
||||
("ANTHROPIC_TOKEN", "anthropic_token"),
|
||||
("NOUS_API_KEY", "nous"),
|
||||
("GOOGLE_API_KEY", "google/gemini"),
|
||||
("GEMINI_API_KEY", "gemini"),
|
||||
("GLM_API_KEY", "glm/zai"),
|
||||
("ZAI_API_KEY", "zai"),
|
||||
("KIMI_API_KEY", "kimi"),
|
||||
|
||||
+10
-177
@@ -166,27 +166,6 @@ from hermes_cli.env_loader import load_hermes_dotenv
|
||||
|
||||
load_hermes_dotenv(project_env=PROJECT_ROOT / ".env")
|
||||
|
||||
# Bridge security.redact_secrets from config.yaml → HERMES_REDACT_SECRETS env
|
||||
# var BEFORE hermes_logging imports agent.redact (which snapshots the flag at
|
||||
# module-import time). Without this, config.yaml's toggle is ignored because
|
||||
# the setup_logging() call below imports agent.redact, which reads the env var
|
||||
# exactly once. Env var in .env still wins — this is config.yaml fallback only.
|
||||
try:
|
||||
if "HERMES_REDACT_SECRETS" not in os.environ:
|
||||
import yaml as _yaml_early
|
||||
_cfg_path = get_hermes_home() / "config.yaml"
|
||||
if _cfg_path.exists():
|
||||
with open(_cfg_path, encoding="utf-8") as _f:
|
||||
_early_sec_cfg = (_yaml_early.safe_load(_f) or {}).get("security", {})
|
||||
if isinstance(_early_sec_cfg, dict):
|
||||
_early_redact = _early_sec_cfg.get("redact_secrets")
|
||||
if _early_redact is not None:
|
||||
os.environ["HERMES_REDACT_SECRETS"] = str(_early_redact).lower()
|
||||
del _early_sec_cfg
|
||||
del _cfg_path
|
||||
except Exception:
|
||||
pass # best-effort — redaction stays at default (enabled) on config errors
|
||||
|
||||
# Initialize centralized file logging early — all `hermes` subcommands
|
||||
# (chat, setup, gateway, config, etc.) write to agent.log + errors.log.
|
||||
try:
|
||||
@@ -1450,7 +1429,6 @@ def select_provider_and_model(args=None):
|
||||
load_config,
|
||||
get_env_value,
|
||||
)
|
||||
from hermes_cli.providers import resolve_provider_full
|
||||
|
||||
config = load_config()
|
||||
current_model = config.get("model")
|
||||
@@ -1468,30 +1446,14 @@ def select_provider_and_model(args=None):
|
||||
effective_provider = (
|
||||
config_provider or os.getenv("HERMES_INFERENCE_PROVIDER") or "auto"
|
||||
)
|
||||
compatible_custom_providers = get_compatible_custom_providers(config)
|
||||
active = None
|
||||
if effective_provider != "auto":
|
||||
active_def = resolve_provider_full(
|
||||
effective_provider,
|
||||
config.get("providers"),
|
||||
compatible_custom_providers,
|
||||
)
|
||||
if active_def is not None:
|
||||
active = active_def.id
|
||||
else:
|
||||
warning = (
|
||||
f"Unknown provider '{effective_provider}'. Check 'hermes model' for "
|
||||
"available providers, or run 'hermes doctor' to diagnose config "
|
||||
"issues."
|
||||
)
|
||||
print(f"Warning: {warning} Falling back to auto provider detection.")
|
||||
if active is None:
|
||||
try:
|
||||
active = resolve_provider(effective_provider)
|
||||
except AuthError as exc:
|
||||
warning = format_auth_error(exc)
|
||||
print(f"Warning: {warning} Falling back to auto provider detection.")
|
||||
try:
|
||||
active = resolve_provider("auto")
|
||||
except AuthError as exc:
|
||||
if effective_provider == "auto":
|
||||
warning = format_auth_error(exc)
|
||||
print(f"Warning: {warning} Falling back to auto provider detection.")
|
||||
except AuthError:
|
||||
active = None # no provider yet; default to first in list
|
||||
|
||||
# Detect custom endpoint
|
||||
@@ -2349,41 +2311,7 @@ def _model_flow_openai_codex(config, current_model=""):
|
||||
from hermes_cli.codex_models import get_codex_model_ids
|
||||
|
||||
status = get_codex_auth_status()
|
||||
if status.get("logged_in"):
|
||||
print(" OpenAI Codex credentials: ✓")
|
||||
print()
|
||||
print(" 1. Use existing credentials")
|
||||
print(" 2. Reauthenticate (new OAuth login)")
|
||||
print(" 3. Cancel")
|
||||
print()
|
||||
try:
|
||||
choice = input(" Choice [1/2/3]: ").strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
choice = "1"
|
||||
|
||||
if choice == "2":
|
||||
print("Starting a fresh OpenAI Codex login...")
|
||||
print()
|
||||
try:
|
||||
mock_args = argparse.Namespace()
|
||||
_login_openai_codex(
|
||||
mock_args,
|
||||
PROVIDER_REGISTRY["openai-codex"],
|
||||
force_new_login=True,
|
||||
)
|
||||
except SystemExit:
|
||||
print("Login cancelled or failed.")
|
||||
return
|
||||
except Exception as exc:
|
||||
print(f"Login failed: {exc}")
|
||||
return
|
||||
status = get_codex_auth_status()
|
||||
if not status.get("logged_in"):
|
||||
print("Login failed.")
|
||||
return
|
||||
elif choice == "3":
|
||||
return
|
||||
else:
|
||||
if not status.get("logged_in"):
|
||||
print("Not logged into OpenAI Codex. Starting login...")
|
||||
print()
|
||||
try:
|
||||
@@ -2900,16 +2828,11 @@ def _model_flow_named_custom(config, provider_info):
|
||||
|
||||
name = provider_info["name"]
|
||||
base_url = provider_info["base_url"]
|
||||
api_mode = provider_info.get("api_mode", "")
|
||||
api_key = provider_info.get("api_key", "")
|
||||
key_env = provider_info.get("key_env", "")
|
||||
saved_model = provider_info.get("model", "")
|
||||
provider_key = (provider_info.get("provider_key") or "").strip()
|
||||
|
||||
# Resolve key from env var if api_key not set directly
|
||||
if not api_key and key_env:
|
||||
api_key = os.environ.get(key_env, "")
|
||||
|
||||
print(f" Provider: {name}")
|
||||
print(f" URL: {base_url}")
|
||||
if saved_model:
|
||||
@@ -2917,10 +2840,7 @@ def _model_flow_named_custom(config, provider_info):
|
||||
print()
|
||||
|
||||
print("Fetching available models...")
|
||||
models = fetch_api_models(
|
||||
api_key, base_url, timeout=8.0,
|
||||
api_mode=api_mode or None,
|
||||
)
|
||||
models = fetch_api_models(api_key, base_url, timeout=8.0)
|
||||
|
||||
if models:
|
||||
default_idx = 0
|
||||
@@ -4010,71 +3930,12 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
||||
print("Cancelled.")
|
||||
return
|
||||
save_env_value(key_env, new_key)
|
||||
existing_key = new_key
|
||||
print("API key saved.")
|
||||
print()
|
||||
else:
|
||||
print(f" {pconfig.name} API key: {existing_key[:8]}... ✓")
|
||||
print()
|
||||
|
||||
# Gemini free-tier gate: free-tier daily quotas (<= 250 RPD for Flash)
|
||||
# are exhausted in a handful of agent turns, so refuse to wire up the
|
||||
# provider with a free-tier key. Probe is best-effort; network or auth
|
||||
# errors fall through without blocking.
|
||||
if provider_id == "gemini" and existing_key:
|
||||
try:
|
||||
from agent.gemini_native_adapter import probe_gemini_tier
|
||||
except Exception:
|
||||
probe_gemini_tier = None
|
||||
if probe_gemini_tier is not None:
|
||||
print(" Checking Gemini API tier...")
|
||||
probe_base = (
|
||||
(get_env_value(base_url_env) if base_url_env else "")
|
||||
or os.getenv(base_url_env or "", "")
|
||||
or pconfig.inference_base_url
|
||||
)
|
||||
tier = probe_gemini_tier(existing_key, probe_base)
|
||||
if tier == "free":
|
||||
print()
|
||||
print(
|
||||
"❌ This Google API key is on the free tier "
|
||||
"(<= 250 requests/day for gemini-2.5-flash)."
|
||||
)
|
||||
print(
|
||||
" Hermes typically makes 3-10 API calls per user turn "
|
||||
"(tool iterations + auxiliary tasks),"
|
||||
)
|
||||
print(
|
||||
" so the free tier is exhausted after a handful of "
|
||||
"messages and cannot sustain"
|
||||
)
|
||||
print(" an agent session.")
|
||||
print()
|
||||
print(
|
||||
" To use Gemini with Hermes, enable billing on your "
|
||||
"Google Cloud project and regenerate"
|
||||
)
|
||||
print(
|
||||
" the key in a billing-enabled project: "
|
||||
"https://aistudio.google.com/apikey"
|
||||
)
|
||||
print()
|
||||
print(
|
||||
" Alternatives with workable free usage: DeepSeek, "
|
||||
"OpenRouter (free models), Groq, Nous."
|
||||
)
|
||||
print()
|
||||
print("Not saving Gemini as the default provider.")
|
||||
return
|
||||
if tier == "paid":
|
||||
print(" Tier check: paid ✓")
|
||||
else:
|
||||
# "unknown" -- network issue, auth problem, unexpected response.
|
||||
# Don't block; the runtime 429 handler will surface free-tier
|
||||
# guidance if the key turns out to be free tier.
|
||||
print(" Tier check: could not verify (proceeding anyway).")
|
||||
print()
|
||||
|
||||
# Optional base URL override
|
||||
current_base = ""
|
||||
if base_url_env:
|
||||
@@ -4316,8 +4177,6 @@ def _model_flow_anthropic(config, current_model=""):
|
||||
from agent.anthropic_adapter import (
|
||||
read_claude_code_credentials,
|
||||
is_claude_code_token_valid,
|
||||
_is_oauth_token,
|
||||
_resolve_claude_code_token_from_credentials,
|
||||
)
|
||||
|
||||
cc_creds = read_claude_code_credentials()
|
||||
@@ -4326,14 +4185,7 @@ def _model_flow_anthropic(config, current_model=""):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Stale-OAuth guard: if the only existing cred is an expired OAuth token
|
||||
# (no valid cc_creds to fall back on), treat it as missing so the re-auth
|
||||
# path is offered instead of silently accepting a broken token.
|
||||
existing_is_stale_oauth = False
|
||||
if existing_key and _is_oauth_token(existing_key) and not cc_available:
|
||||
existing_is_stale_oauth = True
|
||||
|
||||
has_creds = (bool(existing_key) and not existing_is_stale_oauth) or cc_available
|
||||
has_creds = bool(existing_key) or cc_available
|
||||
needs_auth = not has_creds
|
||||
|
||||
if has_creds:
|
||||
@@ -7333,7 +7185,7 @@ For more help on a command:
|
||||
)
|
||||
logout_parser.add_argument(
|
||||
"--provider",
|
||||
choices=["nous", "openai-codex", "spotify"],
|
||||
choices=["nous", "openai-codex"],
|
||||
default=None,
|
||||
help="Provider to log out from (default: active provider)",
|
||||
)
|
||||
@@ -7390,17 +7242,6 @@ For more help on a command:
|
||||
"reset", help="Clear exhaustion status for all credentials for a provider"
|
||||
)
|
||||
auth_reset.add_argument("provider", help="Provider id")
|
||||
auth_status = auth_subparsers.add_parser("status", help="Show auth status for a provider")
|
||||
auth_status.add_argument("provider", help="Provider id")
|
||||
auth_logout = auth_subparsers.add_parser("logout", help="Log out a provider and clear stored auth state")
|
||||
auth_logout.add_argument("provider", help="Provider id")
|
||||
auth_spotify = auth_subparsers.add_parser("spotify", help="Authenticate Hermes with Spotify via PKCE")
|
||||
auth_spotify.add_argument("spotify_action", nargs="?", choices=["login", "status", "logout"], default="login")
|
||||
auth_spotify.add_argument("--client-id", help="Spotify app client_id (or set HERMES_SPOTIFY_CLIENT_ID)")
|
||||
auth_spotify.add_argument("--redirect-uri", help="Allow-listed localhost redirect URI for your Spotify app")
|
||||
auth_spotify.add_argument("--scope", help="Override requested Spotify scopes")
|
||||
auth_spotify.add_argument("--no-browser", action="store_true", help="Do not attempt to open the browser automatically")
|
||||
auth_spotify.add_argument("--timeout", type=float, help="Callback/token exchange timeout in seconds")
|
||||
auth_parser.set_defaults(func=cmd_auth)
|
||||
|
||||
# =========================================================================
|
||||
@@ -7457,10 +7298,6 @@ For more help on a command:
|
||||
"--script",
|
||||
help="Path to a Python script whose stdout is injected into the prompt each run",
|
||||
)
|
||||
cron_create.add_argument(
|
||||
"--workdir",
|
||||
help="Absolute path for the job to run from. Injects AGENTS.md / CLAUDE.md / .cursorrules from that directory and uses it as the cwd for terminal/file/code_exec tools. Omit to preserve old behaviour (no project context files).",
|
||||
)
|
||||
|
||||
# cron edit
|
||||
cron_edit = cron_subparsers.add_parser(
|
||||
@@ -7499,10 +7336,6 @@ For more help on a command:
|
||||
"--script",
|
||||
help="Path to a Python script whose stdout is injected into the prompt each run. Pass empty string to clear.",
|
||||
)
|
||||
cron_edit.add_argument(
|
||||
"--workdir",
|
||||
help="Absolute path for the job to run from (injects AGENTS.md etc. and sets terminal cwd). Pass empty string to clear.",
|
||||
)
|
||||
|
||||
# lifecycle actions
|
||||
cron_pause = cron_subparsers.add_parser("pause", help="Pause a scheduled job")
|
||||
|
||||
@@ -12,12 +12,8 @@ Different LLM providers expect model identifiers in different formats:
|
||||
model IDs, but Claude still uses hyphenated native names like
|
||||
``claude-sonnet-4-6``.
|
||||
- **OpenCode Go** preserves dots in model names: ``minimax-m2.7``.
|
||||
- **DeepSeek** accepts ``deepseek-chat`` (V3), ``deepseek-reasoner``
|
||||
(R1-family), and the first-class V-series IDs (``deepseek-v4-pro``,
|
||||
``deepseek-v4-flash``, and any future ``deepseek-v<N>-*``). Older
|
||||
Hermes revisions folded every non-reasoner input into
|
||||
``deepseek-chat``, which on aggregators routes to V3 — so a user
|
||||
picking V4 Pro was silently downgraded.
|
||||
- **DeepSeek** only accepts two model identifiers:
|
||||
``deepseek-chat`` and ``deepseek-reasoner``.
|
||||
- **Custom** and remaining providers pass the name through as-is.
|
||||
|
||||
This module centralises that translation so callers can simply write::
|
||||
@@ -29,7 +25,6 @@ Inspired by Clawdbot's ``normalizeAnthropicModelId`` pattern.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -105,15 +100,6 @@ _MATCHING_PREFIX_STRIP_PROVIDERS: frozenset[str] = frozenset({
|
||||
"custom",
|
||||
})
|
||||
|
||||
# Providers whose APIs require lowercase model IDs. Xiaomi's
|
||||
# ``api.xiaomimimo.com`` rejects mixed-case names like ``MiMo-V2.5-Pro``
|
||||
# that users might copy from marketing docs — it only accepts
|
||||
# ``mimo-v2.5-pro``. After stripping a matching provider prefix, these
|
||||
# providers also get ``.lower()`` applied.
|
||||
_LOWERCASE_MODEL_PROVIDERS: frozenset[str] = frozenset({
|
||||
"xiaomi",
|
||||
})
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DeepSeek special handling
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -129,30 +115,17 @@ _DEEPSEEK_REASONER_KEYWORDS: frozenset[str] = frozenset({
|
||||
})
|
||||
|
||||
_DEEPSEEK_CANONICAL_MODELS: frozenset[str] = frozenset({
|
||||
"deepseek-chat", # V3 on DeepSeek direct and most aggregators
|
||||
"deepseek-reasoner", # R1-family reasoning model
|
||||
"deepseek-v4-pro", # V4 Pro — first-class model ID
|
||||
"deepseek-v4-flash", # V4 Flash — first-class model ID
|
||||
"deepseek-chat",
|
||||
"deepseek-reasoner",
|
||||
})
|
||||
|
||||
# First-class V-series IDs (``deepseek-v4-pro``, ``deepseek-v4-flash``,
|
||||
# future ``deepseek-v5-*``, dated variants like ``deepseek-v4-flash-20260423``).
|
||||
# Verified empirically 2026-04-24: DeepSeek's Chat Completions API returns
|
||||
# ``provider: DeepSeek`` / ``model: deepseek-v4-flash-20260423`` when called
|
||||
# with ``model=deepseek/deepseek-v4-flash``, so these names are not aliases
|
||||
# of ``deepseek-chat`` and must not be folded into it.
|
||||
_DEEPSEEK_V_SERIES_RE = re.compile(r"^deepseek-v\d+([-.].+)?$")
|
||||
|
||||
|
||||
def _normalize_for_deepseek(model_name: str) -> str:
|
||||
"""Map a model input to a DeepSeek-accepted identifier.
|
||||
"""Map any model input to one of DeepSeek's two accepted identifiers.
|
||||
|
||||
Rules:
|
||||
- Already a known canonical (``deepseek-chat``/``deepseek-reasoner``/
|
||||
``deepseek-v4-pro``/``deepseek-v4-flash``) -> pass through.
|
||||
- Matches the V-series pattern ``deepseek-v<digit>...`` -> pass through
|
||||
(covers future ``deepseek-v5-*`` and dated variants without a release).
|
||||
- Contains a reasoner keyword (r1, think, reasoning, cot, reasoner)
|
||||
- Already ``deepseek-chat`` or ``deepseek-reasoner`` -> pass through.
|
||||
- Contains any reasoner keyword (r1, think, reasoning, cot, reasoner)
|
||||
-> ``deepseek-reasoner``.
|
||||
- Everything else -> ``deepseek-chat``.
|
||||
|
||||
@@ -160,17 +133,13 @@ def _normalize_for_deepseek(model_name: str) -> str:
|
||||
model_name: The bare model name (vendor prefix already stripped).
|
||||
|
||||
Returns:
|
||||
A DeepSeek-accepted model identifier.
|
||||
One of ``"deepseek-chat"`` or ``"deepseek-reasoner"``.
|
||||
"""
|
||||
bare = _strip_vendor_prefix(model_name).lower()
|
||||
|
||||
if bare in _DEEPSEEK_CANONICAL_MODELS:
|
||||
return bare
|
||||
|
||||
# V-series first-class IDs (v4-pro, v4-flash, future v5-*, dated variants)
|
||||
if _DEEPSEEK_V_SERIES_RE.match(bare):
|
||||
return bare
|
||||
|
||||
# Check for reasoner-like keywords anywhere in the name
|
||||
for keyword in _DEEPSEEK_REASONER_KEYWORDS:
|
||||
if keyword in bare:
|
||||
@@ -378,9 +347,6 @@ def normalize_model_for_provider(model_input: str, target_provider: str) -> str:
|
||||
|
||||
>>> normalize_model_for_provider("claude-sonnet-4.6", "zai")
|
||||
'claude-sonnet-4.6'
|
||||
|
||||
>>> normalize_model_for_provider("MiMo-V2.5-Pro", "xiaomi")
|
||||
'mimo-v2.5-pro'
|
||||
"""
|
||||
name = (model_input or "").strip()
|
||||
if not name:
|
||||
@@ -444,12 +410,7 @@ def normalize_model_for_provider(model_input: str, target_provider: str) -> str:
|
||||
|
||||
# --- Direct providers: repair matching provider prefixes only ---
|
||||
if provider in _MATCHING_PREFIX_STRIP_PROVIDERS:
|
||||
result = _strip_matching_provider_prefix(name, provider)
|
||||
# Some providers require lowercase model IDs (e.g. Xiaomi's API
|
||||
# rejects "MiMo-V2.5-Pro" but accepts "mimo-v2.5-pro").
|
||||
if provider in _LOWERCASE_MODEL_PROVIDERS:
|
||||
result = result.lower()
|
||||
return result
|
||||
return _strip_matching_provider_prefix(name, provider)
|
||||
|
||||
# --- Authoritative native providers: preserve user-facing slugs as-is ---
|
||||
if provider in _AUTHORITATIVE_NATIVE_PROVIDERS:
|
||||
|
||||
@@ -771,10 +771,7 @@ def switch_model(
|
||||
|
||||
if provider_changed or explicit_provider:
|
||||
try:
|
||||
runtime = resolve_runtime_provider(
|
||||
requested=target_provider,
|
||||
target_model=new_model,
|
||||
)
|
||||
runtime = resolve_runtime_provider(requested=target_provider)
|
||||
api_key = runtime.get("api_key", "")
|
||||
base_url = runtime.get("base_url", "")
|
||||
api_mode = runtime.get("api_mode", "")
|
||||
@@ -791,10 +788,7 @@ def switch_model(
|
||||
)
|
||||
else:
|
||||
try:
|
||||
runtime = resolve_runtime_provider(
|
||||
requested=current_provider,
|
||||
target_model=new_model,
|
||||
)
|
||||
runtime = resolve_runtime_provider(requested=current_provider)
|
||||
api_key = runtime.get("api_key", "")
|
||||
base_url = runtime.get("base_url", "")
|
||||
api_mode = runtime.get("api_mode", "")
|
||||
@@ -821,7 +815,6 @@ def switch_model(
|
||||
target_provider,
|
||||
api_key=api_key,
|
||||
base_url=base_url,
|
||||
api_mode=api_mode or None,
|
||||
)
|
||||
except Exception as e:
|
||||
validation = {
|
||||
@@ -943,7 +936,7 @@ def list_authenticated_providers(
|
||||
from hermes_cli.auth import PROVIDER_REGISTRY
|
||||
from hermes_cli.models import (
|
||||
OPENROUTER_MODELS, _PROVIDER_MODELS,
|
||||
_MODELS_DEV_PREFERRED, _merge_with_models_dev, provider_model_ids,
|
||||
_MODELS_DEV_PREFERRED, _merge_with_models_dev,
|
||||
)
|
||||
|
||||
results: List[dict] = []
|
||||
@@ -991,14 +984,6 @@ def list_authenticated_providers(
|
||||
|
||||
# Check if any env var is set
|
||||
has_creds = any(os.environ.get(ev) for ev in env_vars)
|
||||
if not has_creds:
|
||||
try:
|
||||
from hermes_cli.auth import _load_auth_store
|
||||
store = _load_auth_store()
|
||||
if store and hermes_id in store.get("credential_pool", {}):
|
||||
has_creds = True
|
||||
except Exception:
|
||||
pass
|
||||
if not has_creds:
|
||||
continue
|
||||
|
||||
@@ -1110,14 +1095,11 @@ def list_authenticated_providers(
|
||||
if not has_creds:
|
||||
continue
|
||||
|
||||
if hermes_slug in {"copilot", "copilot-acp"}:
|
||||
model_ids = provider_model_ids(hermes_slug)
|
||||
else:
|
||||
# Use curated list — look up by Hermes slug, fall back to overlay key
|
||||
model_ids = curated.get(hermes_slug, []) or curated.get(pid, [])
|
||||
# Merge with models.dev for preferred providers (same rationale as above).
|
||||
if hermes_slug in _MODELS_DEV_PREFERRED:
|
||||
model_ids = _merge_with_models_dev(hermes_slug, model_ids)
|
||||
# Use curated list — look up by Hermes slug, fall back to overlay key
|
||||
model_ids = curated.get(hermes_slug, []) or curated.get(pid, [])
|
||||
# Merge with models.dev for preferred providers (same rationale as above).
|
||||
if hermes_slug in _MODELS_DEV_PREFERRED:
|
||||
model_ids = _merge_with_models_dev(hermes_slug, model_ids)
|
||||
total = len(model_ids)
|
||||
top = model_ids[:max_models]
|
||||
|
||||
@@ -1240,15 +1222,6 @@ def list_authenticated_providers(
|
||||
if m and m not in models_list:
|
||||
models_list.append(m)
|
||||
|
||||
# Official OpenAI API rows in providers: often have base_url but no
|
||||
# explicit models: dict — avoid a misleading zero count in /model.
|
||||
if not models_list:
|
||||
url_lower = str(api_url).strip().lower()
|
||||
if "api.openai.com" in url_lower:
|
||||
fb = curated.get("openai") or []
|
||||
if fb:
|
||||
models_list = list(fb)
|
||||
|
||||
# Try to probe /v1/models if URL is set (but don't block on it)
|
||||
# For now just show what we know from config
|
||||
results.append({
|
||||
|
||||
+10
-201
@@ -142,18 +142,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"openai/gpt-5.4-pro",
|
||||
"openai/gpt-5.4-nano",
|
||||
],
|
||||
# Native OpenAI Chat Completions (api.openai.com). Used by /model counts and
|
||||
# provider_model_ids fallback when /v1/models is unavailable.
|
||||
"openai": [
|
||||
"gpt-5.4",
|
||||
"gpt-5.4-mini",
|
||||
"gpt-5-mini",
|
||||
"gpt-5.3-codex",
|
||||
"gpt-5.2-codex",
|
||||
"gpt-4.1",
|
||||
"gpt-4o",
|
||||
"gpt-4o-mini",
|
||||
],
|
||||
"openai-codex": _codex_curated_models(),
|
||||
"copilot-acp": [
|
||||
"copilot-acp",
|
||||
@@ -167,13 +155,10 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"gpt-4.1",
|
||||
"gpt-4o",
|
||||
"gpt-4o-mini",
|
||||
"claude-opus-4.6",
|
||||
"claude-sonnet-4.6",
|
||||
"claude-sonnet-4",
|
||||
"claude-sonnet-4.5",
|
||||
"claude-haiku-4.5",
|
||||
"gemini-3.1-pro-preview",
|
||||
"gemini-3-pro-preview",
|
||||
"gemini-3-flash-preview",
|
||||
"gemini-2.5-pro",
|
||||
"grok-code-fast-1",
|
||||
],
|
||||
@@ -697,7 +682,7 @@ def get_nous_recommended_aux_model(
|
||||
# ---------------------------------------------------------------------------
|
||||
# Canonical provider list — single source of truth for provider identity.
|
||||
# Every code path that lists, displays, or iterates providers derives from
|
||||
# this list: hermes model, /model, list_authenticated_providers.
|
||||
# this list: hermes model, /model, /provider, list_authenticated_providers.
|
||||
#
|
||||
# Fields:
|
||||
# slug — internal provider ID (used in config.yaml, --provider flag)
|
||||
@@ -1125,10 +1110,7 @@ def fetch_models_with_pricing(
|
||||
return _pricing_cache[cache_key]
|
||||
|
||||
url = cache_key.rstrip("/") + "/v1/models"
|
||||
headers: dict[str, str] = {
|
||||
"Accept": "application/json",
|
||||
"User-Agent": _HERMES_USER_AGENT,
|
||||
}
|
||||
headers: dict[str, str] = {"Accept": "application/json"}
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
@@ -1760,17 +1742,6 @@ def provider_model_ids(provider: Optional[str], *, force_refresh: bool = False)
|
||||
live = fetch_ollama_cloud_models(force_refresh=force_refresh)
|
||||
if live:
|
||||
return live
|
||||
if normalized == "openai":
|
||||
api_key = os.getenv("OPENAI_API_KEY", "").strip()
|
||||
if api_key:
|
||||
base_raw = os.getenv("OPENAI_BASE_URL", "").strip().rstrip("/")
|
||||
base = base_raw or "https://api.openai.com/v1"
|
||||
try:
|
||||
live = fetch_api_models(api_key, base)
|
||||
if live:
|
||||
return live
|
||||
except Exception:
|
||||
pass
|
||||
if normalized == "custom":
|
||||
base_url = _get_custom_base_url()
|
||||
if base_url:
|
||||
@@ -1925,51 +1896,6 @@ def fetch_github_model_catalog(
|
||||
return None
|
||||
|
||||
|
||||
# ─── Copilot catalog context-window helpers ─────────────────────────────────
|
||||
|
||||
# Module-level cache: {model_id: max_prompt_tokens}
|
||||
_copilot_context_cache: dict[str, int] = {}
|
||||
_copilot_context_cache_time: float = 0.0
|
||||
_COPILOT_CONTEXT_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
|
||||
def get_copilot_model_context(model_id: str, api_key: Optional[str] = None) -> Optional[int]:
|
||||
"""Look up max_prompt_tokens for a Copilot model from the live /models API.
|
||||
|
||||
Results are cached in-process for 1 hour to avoid repeated API calls.
|
||||
Returns the token limit or None if not found.
|
||||
"""
|
||||
global _copilot_context_cache, _copilot_context_cache_time
|
||||
|
||||
# Serve from cache if fresh
|
||||
if _copilot_context_cache and (time.time() - _copilot_context_cache_time < _COPILOT_CONTEXT_CACHE_TTL):
|
||||
if model_id in _copilot_context_cache:
|
||||
return _copilot_context_cache[model_id]
|
||||
# Cache is fresh but model not in it — don't re-fetch
|
||||
return None
|
||||
|
||||
# Fetch and populate cache
|
||||
catalog = fetch_github_model_catalog(api_key=api_key)
|
||||
if not catalog:
|
||||
return None
|
||||
|
||||
cache: dict[str, int] = {}
|
||||
for item in catalog:
|
||||
mid = str(item.get("id") or "").strip()
|
||||
if not mid:
|
||||
continue
|
||||
caps = item.get("capabilities") or {}
|
||||
limits = caps.get("limits") or {}
|
||||
max_prompt = limits.get("max_prompt_tokens")
|
||||
if isinstance(max_prompt, int) and max_prompt > 0:
|
||||
cache[mid] = max_prompt
|
||||
|
||||
_copilot_context_cache = cache
|
||||
_copilot_context_cache_time = time.time()
|
||||
|
||||
return cache.get(model_id)
|
||||
|
||||
|
||||
def _is_github_models_base_url(base_url: Optional[str]) -> bool:
|
||||
normalized = (base_url or "").strip().rstrip("/").lower()
|
||||
return (
|
||||
@@ -2003,7 +1929,6 @@ _COPILOT_MODEL_ALIASES = {
|
||||
"openai/o4-mini": "gpt-5-mini",
|
||||
"anthropic/claude-opus-4.6": "claude-opus-4.6",
|
||||
"anthropic/claude-sonnet-4.6": "claude-sonnet-4.6",
|
||||
"anthropic/claude-sonnet-4": "claude-sonnet-4",
|
||||
"anthropic/claude-sonnet-4.5": "claude-sonnet-4.5",
|
||||
"anthropic/claude-haiku-4.5": "claude-haiku-4.5",
|
||||
# Dash-notation fallbacks: Hermes' default Claude IDs elsewhere use
|
||||
@@ -2013,12 +1938,10 @@ _COPILOT_MODEL_ALIASES = {
|
||||
# "model_not_supported". See issue #6879.
|
||||
"claude-opus-4-6": "claude-opus-4.6",
|
||||
"claude-sonnet-4-6": "claude-sonnet-4.6",
|
||||
"claude-sonnet-4-0": "claude-sonnet-4",
|
||||
"claude-sonnet-4-5": "claude-sonnet-4.5",
|
||||
"claude-haiku-4-5": "claude-haiku-4.5",
|
||||
"anthropic/claude-opus-4-6": "claude-opus-4.6",
|
||||
"anthropic/claude-sonnet-4-6": "claude-sonnet-4.6",
|
||||
"anthropic/claude-sonnet-4-0": "claude-sonnet-4",
|
||||
"anthropic/claude-sonnet-4-5": "claude-sonnet-4.5",
|
||||
"anthropic/claude-haiku-4-5": "claude-haiku-4.5",
|
||||
}
|
||||
@@ -2243,15 +2166,8 @@ def probe_api_models(
|
||||
api_key: Optional[str],
|
||||
base_url: Optional[str],
|
||||
timeout: float = 5.0,
|
||||
api_mode: Optional[str] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Probe a ``/models`` endpoint with light URL heuristics.
|
||||
|
||||
For ``anthropic_messages`` mode, uses ``x-api-key`` and
|
||||
``anthropic-version`` headers (Anthropic's native auth) instead of
|
||||
``Authorization: Bearer``. The response shape (``data[].id``) is
|
||||
identical, so the same parser works for both.
|
||||
"""
|
||||
"""Probe an OpenAI-compatible ``/models`` endpoint with light URL heuristics."""
|
||||
normalized = (base_url or "").strip().rstrip("/")
|
||||
if not normalized:
|
||||
return {
|
||||
@@ -2283,10 +2199,7 @@ def probe_api_models(
|
||||
|
||||
tried: list[str] = []
|
||||
headers: dict[str, str] = {"User-Agent": _HERMES_USER_AGENT}
|
||||
if api_key and api_mode == "anthropic_messages":
|
||||
headers["x-api-key"] = api_key
|
||||
headers["anthropic-version"] = "2023-06-01"
|
||||
elif api_key:
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
if normalized.startswith(COPILOT_BASE_URL):
|
||||
headers.update(copilot_default_headers())
|
||||
@@ -2328,10 +2241,7 @@ def _fetch_ai_gateway_models(timeout: float = 5.0) -> Optional[list[str]]:
|
||||
base_url = AI_GATEWAY_BASE_URL
|
||||
|
||||
url = base_url.rstrip("/") + "/models"
|
||||
headers: dict[str, str] = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"User-Agent": _HERMES_USER_AGENT,
|
||||
}
|
||||
headers: dict[str, str] = {"Authorization": f"Bearer {api_key}"}
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
@@ -2351,14 +2261,13 @@ def fetch_api_models(
|
||||
api_key: Optional[str],
|
||||
base_url: Optional[str],
|
||||
timeout: float = 5.0,
|
||||
api_mode: Optional[str] = None,
|
||||
) -> Optional[list[str]]:
|
||||
"""Fetch the list of available model IDs from the provider's ``/models`` endpoint.
|
||||
|
||||
Returns a list of model ID strings, or ``None`` if the endpoint could not
|
||||
be reached (network error, timeout, auth failure, etc.).
|
||||
"""
|
||||
return probe_api_models(api_key, base_url, timeout=timeout, api_mode=api_mode).get("models")
|
||||
return probe_api_models(api_key, base_url, timeout=timeout).get("models")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -2486,7 +2395,6 @@ def validate_requested_model(
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
api_mode: Optional[str] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Validate a ``/model`` value for the active provider.
|
||||
@@ -2528,11 +2436,7 @@ def validate_requested_model(
|
||||
}
|
||||
|
||||
if normalized == "custom":
|
||||
# Try probing with correct auth for the api_mode.
|
||||
if api_mode == "anthropic_messages":
|
||||
probe = probe_api_models(api_key, base_url, api_mode=api_mode)
|
||||
else:
|
||||
probe = probe_api_models(api_key, base_url)
|
||||
probe = probe_api_models(api_key, base_url)
|
||||
api_models = probe.get("models")
|
||||
if api_models is not None:
|
||||
if requested_for_lookup in set(api_models):
|
||||
@@ -2581,17 +2485,12 @@ def validate_requested_model(
|
||||
f"Note: could not reach this custom endpoint's model listing at `{probe.get('probed_url')}`. "
|
||||
f"Hermes will still save `{requested}`, but the endpoint should expose `/models` for verification."
|
||||
)
|
||||
if api_mode == "anthropic_messages":
|
||||
message += (
|
||||
"\n Many Anthropic-compatible proxies do not implement the Models API "
|
||||
"(GET /v1/models). The model name has been accepted without verification."
|
||||
)
|
||||
if probe.get("suggested_base_url"):
|
||||
message += f"\n If this server expects `/v1`, try base URL: `{probe.get('suggested_base_url')}`"
|
||||
|
||||
return {
|
||||
"accepted": api_mode == "anthropic_messages",
|
||||
"persist": True,
|
||||
"accepted": False,
|
||||
"persist": False,
|
||||
"recognized": False,
|
||||
"message": message,
|
||||
}
|
||||
@@ -2679,100 +2578,10 @@ def validate_requested_model(
|
||||
),
|
||||
}
|
||||
|
||||
# Native Anthropic provider: /v1/models requires x-api-key (or Bearer for
|
||||
# OAuth) plus anthropic-version headers. The generic OpenAI-style probe
|
||||
# below uses plain Bearer auth and 401s against Anthropic, so dispatch to
|
||||
# the native fetcher which handles both API keys and Claude-Code OAuth
|
||||
# tokens. (The api_mode=="anthropic_messages" branch below handles the
|
||||
# Messages-API transport case separately.)
|
||||
if normalized == "anthropic":
|
||||
anthropic_models = _fetch_anthropic_models()
|
||||
if anthropic_models is not None:
|
||||
if requested_for_lookup in set(anthropic_models):
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": True,
|
||||
"message": None,
|
||||
}
|
||||
auto = get_close_matches(requested_for_lookup, anthropic_models, n=1, cutoff=0.9)
|
||||
if auto:
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": True,
|
||||
"corrected_model": auto[0],
|
||||
"message": f"Auto-corrected `{requested}` → `{auto[0]}`",
|
||||
}
|
||||
suggestions = get_close_matches(requested, anthropic_models, n=3, cutoff=0.5)
|
||||
suggestion_text = ""
|
||||
if suggestions:
|
||||
suggestion_text = "\n Similar models: " + ", ".join(f"`{s}`" for s in suggestions)
|
||||
# Accept anyway — Anthropic sometimes gates newer/preview models
|
||||
# (e.g. snapshot IDs, early-access releases) behind accounts
|
||||
# even though they aren't listed on /v1/models.
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": False,
|
||||
"message": (
|
||||
f"Note: `{requested}` was not found in Anthropic's /v1/models listing. "
|
||||
f"It may still work if you have early-access or snapshot IDs."
|
||||
f"{suggestion_text}"
|
||||
),
|
||||
}
|
||||
# _fetch_anthropic_models returned None — no token resolvable or
|
||||
# network failure. Fall through to the generic warning below.
|
||||
|
||||
# Anthropic Messages API: many proxies don't implement /v1/models.
|
||||
# Try probing with correct auth; if it fails, accept with a warning.
|
||||
if api_mode == "anthropic_messages":
|
||||
api_models = fetch_api_models(api_key, base_url, api_mode=api_mode)
|
||||
if api_models is not None:
|
||||
if requested_for_lookup in set(api_models):
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": True,
|
||||
"message": None,
|
||||
}
|
||||
auto = get_close_matches(requested_for_lookup, api_models, n=1, cutoff=0.9)
|
||||
if auto:
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": True,
|
||||
"corrected_model": auto[0],
|
||||
"message": f"Auto-corrected `{requested}` → `{auto[0]}`",
|
||||
}
|
||||
# Probe failed or model not found — accept anyway (proxy likely
|
||||
# doesn't implement the Anthropic Models API).
|
||||
return {
|
||||
"accepted": True,
|
||||
"persist": True,
|
||||
"recognized": False,
|
||||
"message": (
|
||||
f"Note: could not verify `{requested}` against this endpoint's "
|
||||
f"model listing. Many Anthropic-compatible proxies do not "
|
||||
f"implement GET /v1/models. The model name has been accepted "
|
||||
f"without verification."
|
||||
),
|
||||
}
|
||||
|
||||
# Probe the live API to check if the model actually exists
|
||||
api_models = fetch_api_models(api_key, base_url)
|
||||
|
||||
if api_models is not None:
|
||||
# Gemini's OpenAI-compat /v1beta/openai/models endpoint returns IDs
|
||||
# prefixed with "models/" (e.g. "models/gemini-2.5-flash") — native
|
||||
# Gemini-API convention. Our curated list and user input both use
|
||||
# the bare ID, so a direct set-membership check drops every known
|
||||
# Gemini model. Strip the prefix before comparison. See #12532.
|
||||
if normalized == "gemini":
|
||||
api_models = [
|
||||
m[len("models/"):] if isinstance(m, str) and m.startswith("models/") else m
|
||||
for m in api_models
|
||||
]
|
||||
if requested_for_lookup in set(api_models):
|
||||
# API confirmed the model exists
|
||||
return {
|
||||
|
||||
@@ -71,14 +71,6 @@ VALID_HOOKS: Set[str] = {
|
||||
"on_session_finalize",
|
||||
"on_session_reset",
|
||||
"subagent_stop",
|
||||
# Gateway pre-dispatch hook. Fired once per incoming MessageEvent
|
||||
# after the internal-event guard but BEFORE auth/pairing and agent
|
||||
# dispatch. Plugins may return a dict to influence flow:
|
||||
# {"action": "skip", "reason": "..."} -> drop message (no reply)
|
||||
# {"action": "rewrite", "text": "..."} -> replace event.text, continue
|
||||
# {"action": "allow"} / None -> normal dispatch
|
||||
# Kwargs: event: MessageEvent, gateway: GatewayRunner, session_store.
|
||||
"pre_gateway_dispatch",
|
||||
}
|
||||
|
||||
ENTRY_POINTS_GROUP = "hermes_agent.plugins"
|
||||
|
||||
@@ -116,10 +116,6 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = {
|
||||
transport="openai_chat",
|
||||
base_url_env_var="DASHSCOPE_BASE_URL",
|
||||
),
|
||||
"alibaba-coding-plan": HermesOverlay(
|
||||
transport="openai_chat",
|
||||
base_url_env_var="ALIBABA_CODING_PLAN_BASE_URL",
|
||||
),
|
||||
"vercel": HermesOverlay(
|
||||
transport="openai_chat",
|
||||
is_aggregator=True,
|
||||
@@ -263,9 +259,6 @@ ALIASES: Dict[str, str] = {
|
||||
"aliyun": "alibaba",
|
||||
"qwen": "alibaba",
|
||||
"alibaba-cloud": "alibaba",
|
||||
"alibaba_coding": "alibaba-coding-plan",
|
||||
"alibaba-coding": "alibaba-coding-plan",
|
||||
"alibaba_coding_plan": "alibaba-coding-plan",
|
||||
|
||||
# google-gemini-cli (OAuth + Code Assist)
|
||||
"gemini-cli": "google-gemini-cli",
|
||||
|
||||
@@ -36,29 +36,6 @@ def _normalize_custom_provider_name(value: str) -> str:
|
||||
return value.strip().lower().replace(" ", "-")
|
||||
|
||||
|
||||
def _loopback_hostname(host: str) -> bool:
|
||||
h = (host or "").lower().rstrip(".")
|
||||
return h in {"localhost", "127.0.0.1", "::1", "0.0.0.0"}
|
||||
|
||||
|
||||
def _config_base_url_trustworthy_for_bare_custom(cfg_base_url: str, cfg_provider: str) -> bool:
|
||||
"""Decide whether ``model.base_url`` may back bare ``custom`` runtime resolution.
|
||||
|
||||
GitHub #14676: the model picker can select Custom while ``model.provider`` still reflects a
|
||||
previous provider. Reject non-loopback URLs unless the YAML provider is already ``custom``,
|
||||
so a stale OpenRouter/Z.ai base_url cannot hijack local ``custom`` sessions.
|
||||
"""
|
||||
cfg_provider_norm = (cfg_provider or "").strip().lower()
|
||||
bu = (cfg_base_url or "").strip()
|
||||
if not bu:
|
||||
return False
|
||||
if cfg_provider_norm == "custom":
|
||||
return True
|
||||
if base_url_host_matches(bu, "openrouter.ai"):
|
||||
return False
|
||||
return _loopback_hostname(base_url_hostname(bu))
|
||||
|
||||
|
||||
def _detect_api_mode_for_url(base_url: str) -> Optional[str]:
|
||||
"""Auto-detect api_mode from the resolved base URL.
|
||||
|
||||
@@ -183,16 +160,8 @@ def _resolve_runtime_from_pool_entry(
|
||||
requested_provider: str,
|
||||
model_cfg: Optional[Dict[str, Any]] = None,
|
||||
pool: Optional[CredentialPool] = None,
|
||||
target_model: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
model_cfg = model_cfg or _get_model_config()
|
||||
# When the caller is resolving for a specific target model (e.g. a /model
|
||||
# mid-session switch), prefer that over the persisted model.default. This
|
||||
# prevents api_mode being computed from a stale config default that no
|
||||
# longer matches the model actually being used — the bug that caused
|
||||
# opencode-zen /v1 to be stripped for chat_completions requests when
|
||||
# config.default was still a Claude model.
|
||||
effective_model = (target_model or model_cfg.get("default") or "")
|
||||
base_url = (getattr(entry, "runtime_base_url", None) or getattr(entry, "base_url", None) or "").rstrip("/")
|
||||
api_key = getattr(entry, "runtime_api_key", None) or getattr(entry, "access_token", "")
|
||||
api_mode = "chat_completions"
|
||||
@@ -238,7 +207,7 @@ def _resolve_runtime_from_pool_entry(
|
||||
api_mode = configured_mode
|
||||
elif provider in ("opencode-zen", "opencode-go"):
|
||||
from hermes_cli.models import opencode_model_api_mode
|
||||
api_mode = opencode_model_api_mode(provider, effective_model)
|
||||
api_mode = opencode_model_api_mode(provider, model_cfg.get("default", ""))
|
||||
else:
|
||||
# Auto-detect Anthropic-compatible endpoints (/anthropic suffix,
|
||||
# Kimi /coding, api.openai.com → codex_responses, api.x.ai →
|
||||
@@ -354,16 +323,12 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
# Found match by provider key
|
||||
base_url = entry.get("api") or entry.get("url") or entry.get("base_url") or ""
|
||||
if base_url:
|
||||
result = {
|
||||
return {
|
||||
"name": entry.get("name", ep_name),
|
||||
"base_url": base_url.strip(),
|
||||
"api_key": resolved_api_key,
|
||||
"model": entry.get("default_model", ""),
|
||||
}
|
||||
api_mode = _parse_api_mode(entry.get("api_mode"))
|
||||
if api_mode:
|
||||
result["api_mode"] = api_mode
|
||||
return result
|
||||
# Also check the 'name' field if present
|
||||
display_name = entry.get("name", "")
|
||||
if display_name:
|
||||
@@ -372,16 +337,12 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
# Found match by display name
|
||||
base_url = entry.get("api") or entry.get("url") or entry.get("base_url") or ""
|
||||
if base_url:
|
||||
result = {
|
||||
return {
|
||||
"name": display_name,
|
||||
"base_url": base_url.strip(),
|
||||
"api_key": resolved_api_key,
|
||||
"model": entry.get("default_model", ""),
|
||||
}
|
||||
api_mode = _parse_api_mode(entry.get("api_mode"))
|
||||
if api_mode:
|
||||
result["api_mode"] = api_mode
|
||||
return result
|
||||
|
||||
# Fall back to custom_providers: list (legacy format)
|
||||
custom_providers = config.get("custom_providers")
|
||||
@@ -503,7 +464,6 @@ def _resolve_openrouter_runtime(
|
||||
cfg_provider = cfg_provider.strip().lower()
|
||||
|
||||
env_openrouter_base_url = os.getenv("OPENROUTER_BASE_URL", "").strip()
|
||||
env_custom_base_url = os.getenv("CUSTOM_BASE_URL", "").strip()
|
||||
|
||||
# Use config base_url when available and the provider context matches.
|
||||
# OPENAI_BASE_URL env var is no longer consulted — config.yaml is
|
||||
@@ -513,14 +473,11 @@ def _resolve_openrouter_runtime(
|
||||
if requested_norm == "auto":
|
||||
if not cfg_provider or cfg_provider == "auto":
|
||||
use_config_base_url = True
|
||||
elif requested_norm == "custom" and _config_base_url_trustworthy_for_bare_custom(
|
||||
cfg_base_url, cfg_provider
|
||||
):
|
||||
elif requested_norm == "custom" and cfg_provider == "custom":
|
||||
use_config_base_url = True
|
||||
|
||||
base_url = (
|
||||
(explicit_base_url or "").strip()
|
||||
or env_custom_base_url
|
||||
or (cfg_base_url.strip() if use_config_base_url else "")
|
||||
or env_openrouter_base_url
|
||||
or OPENROUTER_BASE_URL
|
||||
@@ -732,18 +689,8 @@ def resolve_runtime_provider(
|
||||
requested: Optional[str] = None,
|
||||
explicit_api_key: Optional[str] = None,
|
||||
explicit_base_url: Optional[str] = None,
|
||||
target_model: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Resolve runtime provider credentials for agent execution.
|
||||
|
||||
target_model: Optional override for model_cfg.get("default") when
|
||||
computing provider-specific api_mode (e.g. OpenCode Zen/Go where different
|
||||
models route through different API surfaces). Callers performing an
|
||||
explicit mid-session model switch should pass the new model here so
|
||||
api_mode is derived from the model they are switching TO, not the stale
|
||||
persisted default. Other callers can leave it None to preserve existing
|
||||
behavior (api_mode derived from config).
|
||||
"""
|
||||
"""Resolve runtime provider credentials for agent execution."""
|
||||
requested_provider = resolve_requested_provider(requested)
|
||||
|
||||
custom_runtime = _resolve_named_custom_runtime(
|
||||
@@ -825,7 +772,6 @@ def resolve_runtime_provider(
|
||||
requested_provider=requested_provider,
|
||||
model_cfg=model_cfg,
|
||||
pool=pool,
|
||||
target_model=target_model,
|
||||
)
|
||||
|
||||
if provider == "nous":
|
||||
@@ -1044,11 +990,7 @@ def resolve_runtime_provider(
|
||||
api_mode = configured_mode
|
||||
elif provider in ("opencode-zen", "opencode-go"):
|
||||
from hermes_cli.models import opencode_model_api_mode
|
||||
# Prefer the target_model from the caller (explicit mid-session
|
||||
# switch) over the stale model.default; see _resolve_runtime_from_pool_entry
|
||||
# for the same rationale.
|
||||
_effective = target_model or model_cfg.get("default", "")
|
||||
api_mode = opencode_model_api_mode(provider, _effective)
|
||||
api_mode = opencode_model_api_mode(provider, model_cfg.get("default", ""))
|
||||
else:
|
||||
# Auto-detect Anthropic-compatible endpoints by URL convention
|
||||
# (e.g. https://api.minimax.io/anthropic, https://dashscope.../anthropic)
|
||||
|
||||
@@ -500,15 +500,6 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
if get_env_value("HASS_TOKEN"):
|
||||
tool_status.append(("Smart Home (Home Assistant)", True, None))
|
||||
|
||||
# Spotify (OAuth via hermes auth spotify — check auth.json, not env vars)
|
||||
try:
|
||||
from hermes_cli.auth import get_provider_auth_state
|
||||
_spotify_state = get_provider_auth_state("spotify") or {}
|
||||
if _spotify_state.get("access_token") or _spotify_state.get("refresh_token"):
|
||||
tool_status.append(("Spotify (PKCE OAuth)", True, None))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Skills Hub
|
||||
if get_env_value("GITHUB_TOKEN"):
|
||||
tool_status.append(("Skills Hub (GitHub)", True, None))
|
||||
|
||||
+6
-13
@@ -164,26 +164,19 @@ def show_status(args):
|
||||
qwen_status = {}
|
||||
|
||||
nous_logged_in = bool(nous_status.get("logged_in"))
|
||||
nous_error = nous_status.get("error")
|
||||
nous_label = "logged in" if nous_logged_in else "not logged in (run: hermes auth add nous --type oauth)"
|
||||
print(
|
||||
f" {'Nous Portal':<12} {check_mark(nous_logged_in)} "
|
||||
f"{nous_label}"
|
||||
f"{'logged in' if nous_logged_in else 'not logged in (run: hermes model)'}"
|
||||
)
|
||||
portal_url = nous_status.get("portal_base_url") or "(unknown)"
|
||||
access_exp = _format_iso_timestamp(nous_status.get("access_expires_at"))
|
||||
key_exp = _format_iso_timestamp(nous_status.get("agent_key_expires_at"))
|
||||
refresh_label = "yes" if nous_status.get("has_refresh_token") else "no"
|
||||
if nous_logged_in or portal_url != "(unknown)" or nous_error:
|
||||
if nous_logged_in:
|
||||
portal_url = nous_status.get("portal_base_url") or "(unknown)"
|
||||
access_exp = _format_iso_timestamp(nous_status.get("access_expires_at"))
|
||||
key_exp = _format_iso_timestamp(nous_status.get("agent_key_expires_at"))
|
||||
refresh_label = "yes" if nous_status.get("has_refresh_token") else "no"
|
||||
print(f" Portal URL: {portal_url}")
|
||||
if nous_logged_in or nous_status.get("access_expires_at"):
|
||||
print(f" Access exp: {access_exp}")
|
||||
if nous_logged_in or nous_status.get("agent_key_expires_at"):
|
||||
print(f" Key exp: {key_exp}")
|
||||
if nous_logged_in or nous_status.get("has_refresh_token"):
|
||||
print(f" Refresh: {refresh_label}")
|
||||
if nous_error and not nous_logged_in:
|
||||
print(f" Error: {nous_error}")
|
||||
|
||||
codex_logged_in = bool(codex_status.get("logged_in"))
|
||||
print(
|
||||
|
||||
+1
-1
@@ -127,7 +127,7 @@ TIPS = [
|
||||
|
||||
# --- Tools & Capabilities ---
|
||||
"execute_code runs Python scripts that call Hermes tools programmatically — results stay out of context.",
|
||||
"delegate_task spawns up to 3 concurrent sub-agents by default (delegation.max_concurrent_children) with isolated contexts for parallel work.",
|
||||
"delegate_task spawns up to 3 concurrent sub-agents by default (configurable via delegation.max_concurrent_children) with isolated contexts for parallel work.",
|
||||
"web_extract works on PDF URLs — pass any PDF link and it converts to markdown.",
|
||||
"search_files is ripgrep-backed and faster than grep — use it instead of terminal grep.",
|
||||
"patch uses 9 fuzzy matching strategies so minor whitespace differences won't break edits.",
|
||||
|
||||
+40
-28
@@ -67,13 +67,13 @@ CONFIGURABLE_TOOLSETS = [
|
||||
("messaging", "📨 Cross-Platform Messaging", "send_message"),
|
||||
("rl", "🧪 RL Training", "Tinker-Atropos training tools"),
|
||||
("homeassistant", "🏠 Home Assistant", "smart home device control"),
|
||||
("spotify", "🎵 Spotify", "playback, search, playlists, library"),
|
||||
("discord_admin", "🛡️ Discord Server Admin", "list channels/roles, pin, assign roles"),
|
||||
]
|
||||
|
||||
# Toolsets that are OFF by default for new installs.
|
||||
# They're still in _HERMES_CORE_TOOLS (available at runtime if enabled),
|
||||
# but the setup checklist won't pre-select them for first-time users.
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl", "spotify"}
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl", "discord_admin"}
|
||||
|
||||
|
||||
def _get_effective_configurable_toolsets():
|
||||
@@ -362,22 +362,6 @@ TOOL_CATEGORIES = {
|
||||
},
|
||||
],
|
||||
},
|
||||
"spotify": {
|
||||
"name": "Spotify",
|
||||
"icon": "🎵",
|
||||
"providers": [
|
||||
{
|
||||
"name": "Spotify Web API",
|
||||
"tag": "PKCE OAuth — run `hermes auth spotify` after this",
|
||||
"env_vars": [
|
||||
{"key": "HERMES_SPOTIFY_CLIENT_ID", "prompt": "Spotify app client_id",
|
||||
"url": "https://developer.spotify.com/dashboard"},
|
||||
{"key": "HERMES_SPOTIFY_REDIRECT_URI", "prompt": "Redirect URI (must be allow-listed in your Spotify app)",
|
||||
"default": "http://127.0.0.1:43827/spotify/callback"},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
"rl": {
|
||||
"name": "RL Training",
|
||||
"icon": "🧪",
|
||||
@@ -566,7 +550,7 @@ def _get_platform_tools(
|
||||
include_default_mcp_servers: bool = True,
|
||||
) -> Set[str]:
|
||||
"""Resolve which individual toolset names are enabled for a platform."""
|
||||
from toolsets import resolve_toolset
|
||||
from toolsets import resolve_toolset, TOOLSETS
|
||||
|
||||
platform_toolsets = config.get("platform_toolsets") or {}
|
||||
toolset_names = platform_toolsets.get(platform)
|
||||
@@ -580,6 +564,8 @@ def _get_platform_tools(
|
||||
toolset_names = [str(ts) for ts in toolset_names]
|
||||
|
||||
configurable_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
|
||||
# If the saved list contains any configurable keys directly, the user
|
||||
# has explicitly configured this platform — use direct membership.
|
||||
@@ -602,19 +588,46 @@ def _get_platform_tools(
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if ts_tools and ts_tools.issubset(all_tool_names):
|
||||
enabled_toolsets.add(ts_key)
|
||||
|
||||
default_off = set(_DEFAULT_OFF_TOOLSETS)
|
||||
if platform in default_off:
|
||||
default_off.remove(platform)
|
||||
enabled_toolsets -= default_off
|
||||
|
||||
# Plugin toolsets: enabled by default unless explicitly disabled, or
|
||||
# unless the toolset is in _DEFAULT_OFF_TOOLSETS (e.g. spotify —
|
||||
# shipped as a bundled plugin but user must opt in via `hermes tools`
|
||||
# so we don't ship 7 Spotify tool schemas to users who don't use it).
|
||||
# Recover non-configurable platform toolsets (e.g. discord, feishu_doc,
|
||||
# feishu_drive). These are part of the platform's default composite but
|
||||
# absent from CONFIGURABLE_TOOLSETS, so they can't appear in the TUI
|
||||
# checklist or in a user-saved config. Must run in BOTH branches —
|
||||
# otherwise saving via `hermes tools` (which flips has_explicit_config
|
||||
# to True) silently drops them.
|
||||
platform_tool_universe = set(resolve_toolset(PLATFORMS[platform]["default_toolset"]))
|
||||
configurable_tool_universe = set()
|
||||
for ck in configurable_keys:
|
||||
configurable_tool_universe.update(resolve_toolset(ck))
|
||||
claimed = set()
|
||||
for ts_key in enabled_toolsets:
|
||||
claimed.update(resolve_toolset(ts_key))
|
||||
skip = configurable_keys | plugin_ts_keys | platform_default_keys
|
||||
skip |= {k for k in TOOLSETS if k.startswith("hermes-")}
|
||||
skip |= set(_DEFAULT_OFF_TOOLSETS) - {platform}
|
||||
for ts_key, ts_def in TOOLSETS.items():
|
||||
if ts_key in skip:
|
||||
continue
|
||||
if ts_def.get("includes"):
|
||||
continue
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if not ts_tools or not ts_tools.issubset(platform_tool_universe):
|
||||
continue
|
||||
if ts_tools.issubset(configurable_tool_universe):
|
||||
continue
|
||||
if not ts_tools.issubset(claimed):
|
||||
enabled_toolsets.add(ts_key)
|
||||
claimed.update(ts_tools)
|
||||
|
||||
# Plugin toolsets: enabled by default unless explicitly disabled.
|
||||
# A plugin toolset is "known" for a platform once `hermes tools`
|
||||
# has been saved for that platform (tracked via known_plugin_toolsets).
|
||||
# Unknown plugins default to enabled; known-but-absent = disabled.
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
if plugin_ts_keys:
|
||||
known_map = config.get("known_plugin_toolsets", {})
|
||||
known_for_platform = set(known_map.get(platform, []))
|
||||
@@ -622,9 +635,6 @@ def _get_platform_tools(
|
||||
if pts in toolset_names:
|
||||
# Explicitly listed in config — enabled
|
||||
enabled_toolsets.add(pts)
|
||||
elif pts in _DEFAULT_OFF_TOOLSETS:
|
||||
# Opt-in plugin toolset — stay off until user picks it
|
||||
continue
|
||||
elif pts not in known_for_platform:
|
||||
# New plugin not yet seen by hermes tools — default enabled
|
||||
enabled_toolsets.add(pts)
|
||||
@@ -632,7 +642,6 @@ def _get_platform_tools(
|
||||
|
||||
# Preserve any explicit non-configurable toolset entries (for example,
|
||||
# custom toolsets or MCP server names saved in platform_toolsets).
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
explicit_passthrough = {
|
||||
ts
|
||||
for ts in toolset_names
|
||||
@@ -692,6 +701,7 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
existing_toolsets = config.get("platform_toolsets", {}).get(platform, [])
|
||||
if not isinstance(existing_toolsets, list):
|
||||
existing_toolsets = []
|
||||
existing_toolsets = [str(ts) for ts in existing_toolsets]
|
||||
|
||||
# Preserve any entries that are NOT configurable toolsets and NOT platform
|
||||
# defaults (i.e. only MCP server names should be preserved)
|
||||
@@ -699,6 +709,8 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
entry for entry in existing_toolsets
|
||||
if entry not in configurable_keys and entry not in platform_default_keys
|
||||
}
|
||||
if "no_mcp" not in enabled_toolset_keys:
|
||||
preserved_entries.discard("no_mcp")
|
||||
|
||||
# Merge preserved entries with new enabled toolsets
|
||||
config["platform_toolsets"][platform] = sorted(enabled_toolset_keys | preserved_entries)
|
||||
|
||||
@@ -1039,71 +1039,6 @@ class SessionDB:
|
||||
result.append(msg)
|
||||
return result
|
||||
|
||||
def resolve_resume_session_id(self, session_id: str) -> str:
|
||||
"""Redirect a resume target to the descendant session that holds the messages.
|
||||
|
||||
Context compression ends the current session and forks a new child session
|
||||
(linked via ``parent_session_id``). The flush cursor is reset, so the
|
||||
child is where new messages actually land — the parent ends up with
|
||||
``message_count = 0`` rows unless messages had already been flushed to
|
||||
it before compression. See #15000.
|
||||
|
||||
This helper walks ``parent_session_id`` forward from ``session_id`` and
|
||||
returns the first descendant in the chain that has at least one message
|
||||
row. If the original session already has messages, or no descendant
|
||||
has any, the original ``session_id`` is returned unchanged.
|
||||
|
||||
The chain is always walked via the child whose ``started_at`` is
|
||||
latest; that matches the single-chain shape that compression creates.
|
||||
A depth cap (32) guards against accidental loops in malformed data.
|
||||
"""
|
||||
if not session_id:
|
||||
return session_id
|
||||
|
||||
with self._lock:
|
||||
# If this session already has messages, nothing to redirect.
|
||||
try:
|
||||
row = self._conn.execute(
|
||||
"SELECT 1 FROM messages WHERE session_id = ? LIMIT 1",
|
||||
(session_id,),
|
||||
).fetchone()
|
||||
except Exception:
|
||||
return session_id
|
||||
if row is not None:
|
||||
return session_id
|
||||
|
||||
# Walk descendants: at each step, pick the most-recently-started
|
||||
# child session; stop once we find one with messages.
|
||||
current = session_id
|
||||
seen = {current}
|
||||
for _ in range(32):
|
||||
try:
|
||||
child_row = self._conn.execute(
|
||||
"SELECT id FROM sessions "
|
||||
"WHERE parent_session_id = ? "
|
||||
"ORDER BY started_at DESC, id DESC LIMIT 1",
|
||||
(current,),
|
||||
).fetchone()
|
||||
except Exception:
|
||||
return session_id
|
||||
if child_row is None:
|
||||
return session_id
|
||||
child_id = child_row["id"] if hasattr(child_row, "keys") else child_row[0]
|
||||
if not child_id or child_id in seen:
|
||||
return session_id
|
||||
seen.add(child_id)
|
||||
try:
|
||||
msg_row = self._conn.execute(
|
||||
"SELECT 1 FROM messages WHERE session_id = ? LIMIT 1",
|
||||
(child_id,),
|
||||
).fetchone()
|
||||
except Exception:
|
||||
return session_id
|
||||
if msg_row is not None:
|
||||
return child_id
|
||||
current = child_id
|
||||
return session_id
|
||||
|
||||
def get_messages_as_conversation(self, session_id: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Load messages in the OpenAI conversation format (role + content dicts).
|
||||
|
||||
+27
-35
@@ -288,30 +288,34 @@ def get_tool_definitions(
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic_schema}
|
||||
break
|
||||
|
||||
# Rebuild discord_server schema based on the bot's privileged intents
|
||||
# (detected from GET /applications/@me) and the user's action allowlist
|
||||
# in config. Hides actions the bot's intents don't support so the
|
||||
# model never attempts them, and annotates fetch_messages when the
|
||||
# Rebuild discord / discord_admin schemas based on the bot's privileged
|
||||
# intents (detected from GET /applications/@me) and the user's action
|
||||
# allowlist in config. Hides actions the bot's intents don't support so
|
||||
# the model never attempts them, and annotates fetch_messages when the
|
||||
# MESSAGE_CONTENT intent is missing.
|
||||
if "discord_server" in available_tool_names:
|
||||
try:
|
||||
from tools.discord_tool import get_dynamic_schema
|
||||
dynamic = get_dynamic_schema()
|
||||
except Exception: # pragma: no cover — defensive, fall back to static
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
# Tool filtered out entirely (empty allowlist or detection disabled
|
||||
# the only remaining actions). Drop it from the schema list.
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != "discord_server"
|
||||
]
|
||||
available_tool_names.discard("discord_server")
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == "discord_server":
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
_discord_schema_fns = {
|
||||
"discord": "get_dynamic_schema_core",
|
||||
"discord_admin": "get_dynamic_schema_admin",
|
||||
}
|
||||
for discord_tool_name in _discord_schema_fns:
|
||||
if discord_tool_name in available_tool_names:
|
||||
try:
|
||||
from tools import discord_tool as _dt
|
||||
schema_fn = getattr(_dt, _discord_schema_fns[discord_tool_name])
|
||||
dynamic = schema_fn()
|
||||
except Exception:
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != discord_tool_name
|
||||
]
|
||||
available_tool_names.discard(discord_tool_name)
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == discord_tool_name:
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
|
||||
# Strip web tool cross-references from browser_navigate description when
|
||||
# web_search / web_extract are not available. The static schema says
|
||||
@@ -343,18 +347,6 @@ def get_tool_definitions(
|
||||
global _last_resolved_tool_names
|
||||
_last_resolved_tool_names = [t["function"]["name"] for t in filtered_tools]
|
||||
|
||||
# Sanitize schemas for broad backend compatibility. llama.cpp's
|
||||
# json-schema-to-grammar converter (used by its OAI server to build
|
||||
# GBNF tool-call parsers) rejects some shapes that cloud providers
|
||||
# silently accept — bare "type": "object" with no properties,
|
||||
# string-valued schema nodes from malformed MCP servers, etc. This
|
||||
# is a no-op for schemas that are already well-formed.
|
||||
try:
|
||||
from tools.schema_sanitizer import sanitize_tool_schemas
|
||||
filtered_tools = sanitize_tool_schemas(filtered_tools)
|
||||
except Exception as e: # pragma: no cover — defensive
|
||||
logger.warning("Schema sanitization skipped: %s", e)
|
||||
|
||||
return filtered_tools
|
||||
|
||||
|
||||
|
||||
@@ -59,8 +59,7 @@ Config file: `~/.hermes/hindsight/config.json`
|
||||
|
||||
| Key | Default | Description |
|
||||
|-----|---------|-------------|
|
||||
| `bank_id` | `hermes` | Memory bank name (static fallback used when `bank_id_template` is unset or resolves empty) |
|
||||
| `bank_id_template` | — | Optional template to derive the bank name dynamically. Placeholders: `{profile}`, `{workspace}`, `{platform}`, `{user}`, `{session}`. Example: `hermes-{profile}` isolates memory per active Hermes profile. Empty placeholders collapse cleanly (e.g. `hermes-{user}` with no user becomes `hermes`). |
|
||||
| `bank_id` | `hermes` | Memory bank name |
|
||||
| `bank_mission` | — | Reflect mission (identity/framing for reflect reasoning). Applied via Banks API. |
|
||||
| `bank_retain_mission` | — | Retain mission (steers what gets extracted). Applied via Banks API. |
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
Long-term memory with knowledge graph, entity resolution, and multi-strategy
|
||||
retrieval. Supports cloud (API key) and local modes.
|
||||
|
||||
Configurable timeout via HINDSIGHT_TIMEOUT env var or config.json.
|
||||
|
||||
Original PR #1811 by benfrank241, adapted to MemoryProvider ABC.
|
||||
|
||||
Config via environment variables:
|
||||
@@ -13,7 +11,6 @@ Config via environment variables:
|
||||
HINDSIGHT_BUDGET — recall budget: low/mid/high (default: mid)
|
||||
HINDSIGHT_API_URL — API endpoint
|
||||
HINDSIGHT_MODE — cloud or local (default: cloud)
|
||||
HINDSIGHT_TIMEOUT — API request timeout in seconds (default: 120)
|
||||
HINDSIGHT_RETAIN_TAGS — comma-separated tags attached to retained memories
|
||||
HINDSIGHT_RETAIN_SOURCE — metadata source value attached to retained memories
|
||||
HINDSIGHT_RETAIN_USER_PREFIX — label used before user turns in retained transcripts
|
||||
@@ -26,7 +23,6 @@ Or via $HERMES_HOME/hindsight/config.json (profile-scoped), falling back to
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -44,7 +40,6 @@ logger = logging.getLogger(__name__)
|
||||
_DEFAULT_API_URL = "https://api.hindsight.vectorize.io"
|
||||
_DEFAULT_LOCAL_URL = "http://localhost:8888"
|
||||
_MIN_CLIENT_VERSION = "0.4.22"
|
||||
_DEFAULT_TIMEOUT = 120 # seconds — cloud API can take 30-40s per request
|
||||
_VALID_BUDGETS = {"low", "mid", "high"}
|
||||
_PROVIDER_DEFAULT_MODELS = {
|
||||
"openai": "gpt-4o-mini",
|
||||
@@ -59,22 +54,6 @@ _PROVIDER_DEFAULT_MODELS = {
|
||||
}
|
||||
|
||||
|
||||
def _check_local_runtime() -> tuple[bool, str | None]:
|
||||
"""Return whether local embedded Hindsight imports cleanly.
|
||||
|
||||
On older CPUs, importing the local Hindsight stack can raise a runtime
|
||||
error from NumPy before the daemon starts. Treat that as "unavailable"
|
||||
so Hermes can degrade gracefully instead of repeatedly trying to start
|
||||
a broken local memory backend.
|
||||
"""
|
||||
try:
|
||||
importlib.import_module("hindsight")
|
||||
importlib.import_module("hindsight_embed.daemon_embed_manager")
|
||||
return True, None
|
||||
except Exception as exc:
|
||||
return False, str(exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dedicated event loop for Hindsight async calls (one per process, reused).
|
||||
# Avoids creating ephemeral loops that leak aiohttp sessions.
|
||||
@@ -102,18 +81,13 @@ def _get_loop() -> asyncio.AbstractEventLoop:
|
||||
return _loop
|
||||
|
||||
|
||||
def _run_sync(coro, timeout: float = _DEFAULT_TIMEOUT):
|
||||
def _run_sync(coro, timeout: float = 120.0):
|
||||
"""Schedule *coro* on the shared loop and block until done."""
|
||||
loop = _get_loop()
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
return future.result(timeout=timeout)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backward-compatible alias — instances use self._run_sync() instead.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -259,126 +233,6 @@ def _utc_timestamp() -> str:
|
||||
return datetime.now(timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z")
|
||||
|
||||
|
||||
def _embedded_profile_name(config: dict[str, Any]) -> str:
|
||||
"""Return the Hindsight embedded profile name for this Hermes config."""
|
||||
profile = config.get("profile", "hermes")
|
||||
return str(profile or "hermes")
|
||||
|
||||
|
||||
def _load_simple_env(path) -> dict[str, str]:
|
||||
"""Parse a simple KEY=VALUE env file, ignoring comments and blank lines."""
|
||||
if not path.exists():
|
||||
return {}
|
||||
|
||||
values: dict[str, str] = {}
|
||||
for line in path.read_text(encoding="utf-8").splitlines():
|
||||
if not line or line.startswith("#") or "=" not in line:
|
||||
continue
|
||||
key, value = line.split("=", 1)
|
||||
values[key.strip()] = value.strip()
|
||||
return values
|
||||
|
||||
|
||||
def _build_embedded_profile_env(config: dict[str, Any], *, llm_api_key: str | None = None) -> dict[str, str]:
|
||||
"""Build the profile-scoped env file that standalone hindsight-embed consumes."""
|
||||
current_key = llm_api_key
|
||||
if current_key is None:
|
||||
current_key = (
|
||||
config.get("llmApiKey")
|
||||
or config.get("llm_api_key")
|
||||
or os.environ.get("HINDSIGHT_LLM_API_KEY", "")
|
||||
)
|
||||
|
||||
current_provider = config.get("llm_provider", "")
|
||||
current_model = config.get("llm_model", "")
|
||||
current_base_url = config.get("llm_base_url") or os.environ.get("HINDSIGHT_API_LLM_BASE_URL", "")
|
||||
|
||||
# The embedded daemon expects OpenAI wire format for these providers.
|
||||
daemon_provider = "openai" if current_provider in ("openai_compatible", "openrouter") else current_provider
|
||||
|
||||
env_values = {
|
||||
"HINDSIGHT_API_LLM_PROVIDER": str(daemon_provider),
|
||||
"HINDSIGHT_API_LLM_API_KEY": str(current_key or ""),
|
||||
"HINDSIGHT_API_LLM_MODEL": str(current_model),
|
||||
"HINDSIGHT_API_LOG_LEVEL": "info",
|
||||
}
|
||||
if current_base_url:
|
||||
env_values["HINDSIGHT_API_LLM_BASE_URL"] = str(current_base_url)
|
||||
return env_values
|
||||
|
||||
|
||||
def _embedded_profile_env_path(config: dict[str, Any]):
|
||||
from pathlib import Path
|
||||
|
||||
return Path.home() / ".hindsight" / "profiles" / f"{_embedded_profile_name(config)}.env"
|
||||
|
||||
|
||||
def _materialize_embedded_profile_env(config: dict[str, Any], *, llm_api_key: str | None = None):
|
||||
"""Write the profile-scoped env file that standalone hindsight-embed uses."""
|
||||
profile_env = _embedded_profile_env_path(config)
|
||||
profile_env.parent.mkdir(parents=True, exist_ok=True)
|
||||
env_values = _build_embedded_profile_env(config, llm_api_key=llm_api_key)
|
||||
profile_env.write_text(
|
||||
"".join(f"{key}={value}\n" for key, value in env_values.items()),
|
||||
encoding="utf-8",
|
||||
)
|
||||
return profile_env
|
||||
|
||||
def _sanitize_bank_segment(value: str) -> str:
|
||||
"""Sanitize a bank_id_template placeholder value.
|
||||
|
||||
Bank IDs should be safe for URL paths and filesystem use. Replaces any
|
||||
character that isn't alphanumeric, dash, or underscore with a dash, and
|
||||
collapses runs of dashes.
|
||||
"""
|
||||
if not value:
|
||||
return ""
|
||||
out = []
|
||||
prev_dash = False
|
||||
for ch in str(value):
|
||||
if ch.isalnum() or ch == "-" or ch == "_":
|
||||
out.append(ch)
|
||||
prev_dash = False
|
||||
else:
|
||||
if not prev_dash:
|
||||
out.append("-")
|
||||
prev_dash = True
|
||||
return "".join(out).strip("-_")
|
||||
|
||||
|
||||
def _resolve_bank_id_template(template: str, fallback: str, **placeholders: str) -> str:
|
||||
"""Resolve a bank_id template string with the given placeholders.
|
||||
|
||||
Supported placeholders (each is sanitized before substitution):
|
||||
{profile} — active Hermes profile name (from agent_identity)
|
||||
{workspace} — Hermes workspace name (from agent_workspace)
|
||||
{platform} — "cli", "telegram", "discord", etc.
|
||||
{user} — platform user id (gateway sessions)
|
||||
{session} — current session id
|
||||
|
||||
Missing/empty placeholders are rendered as the empty string and then
|
||||
collapsed — e.g. ``hermes-{user}`` with no user becomes ``hermes``.
|
||||
|
||||
If the template is empty, resolution falls back to *fallback*.
|
||||
Returns the sanitized bank id.
|
||||
"""
|
||||
if not template:
|
||||
return fallback
|
||||
sanitized = {k: _sanitize_bank_segment(v) for k, v in placeholders.items()}
|
||||
try:
|
||||
rendered = template.format(**sanitized)
|
||||
except (KeyError, IndexError) as exc:
|
||||
logger.warning("Invalid bank_id_template %r: %s — using fallback %r",
|
||||
template, exc, fallback)
|
||||
return fallback
|
||||
while "--" in rendered:
|
||||
rendered = rendered.replace("--", "-")
|
||||
while "__" in rendered:
|
||||
rendered = rendered.replace("__", "_")
|
||||
rendered = rendered.strip("-_")
|
||||
return rendered or fallback
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MemoryProvider implementation
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -408,17 +262,13 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
self._chat_type = ""
|
||||
self._thread_id = ""
|
||||
self._agent_identity = ""
|
||||
self._agent_workspace = ""
|
||||
self._turn_index = 0
|
||||
self._client = None
|
||||
self._timeout = _DEFAULT_TIMEOUT
|
||||
self._prefetch_result = ""
|
||||
self._prefetch_lock = threading.Lock()
|
||||
self._prefetch_thread = None
|
||||
self._sync_thread = None
|
||||
self._session_id = ""
|
||||
self._parent_session_id = ""
|
||||
self._document_id = ""
|
||||
|
||||
# Tags
|
||||
self._tags: list[str] | None = None
|
||||
@@ -443,7 +293,6 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
# Bank
|
||||
self._bank_mission = ""
|
||||
self._bank_retain_mission: str | None = None
|
||||
self._bank_id_template = ""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
@@ -453,16 +302,9 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
try:
|
||||
cfg = _load_config()
|
||||
mode = cfg.get("mode", "cloud")
|
||||
if mode in ("local", "local_embedded"):
|
||||
available, _ = _check_local_runtime()
|
||||
return available
|
||||
if mode == "local_external":
|
||||
if mode in ("local", "local_embedded", "local_external"):
|
||||
return True
|
||||
has_key = bool(
|
||||
cfg.get("apiKey")
|
||||
or cfg.get("api_key")
|
||||
or os.environ.get("HINDSIGHT_API_KEY", "")
|
||||
)
|
||||
has_key = bool(cfg.get("apiKey") or os.environ.get("HINDSIGHT_API_KEY", ""))
|
||||
has_url = bool(cfg.get("api_url") or os.environ.get("HINDSIGHT_API_URL", ""))
|
||||
return has_key or has_url
|
||||
except Exception:
|
||||
@@ -521,7 +363,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
else:
|
||||
deps_to_install = [cloud_dep]
|
||||
|
||||
print("\n Checking dependencies...")
|
||||
print(f"\n Checking dependencies...")
|
||||
uv_path = shutil.which("uv")
|
||||
if not uv_path:
|
||||
print(" ⚠ uv not found — install it: curl -LsSf https://astral.sh/uv/install.sh | sh")
|
||||
@@ -532,14 +374,14 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
[uv_path, "pip", "install", "--python", sys.executable, "--quiet", "--upgrade"] + deps_to_install,
|
||||
check=True, timeout=120, capture_output=True,
|
||||
)
|
||||
print(" ✓ Dependencies up to date")
|
||||
print(f" ✓ Dependencies up to date")
|
||||
except Exception as e:
|
||||
print(f" ⚠ Install failed: {e}")
|
||||
print(f" Run manually: uv pip install --python {sys.executable} {' '.join(deps_to_install)}")
|
||||
|
||||
# Step 3: Mode-specific config
|
||||
if mode == "cloud":
|
||||
print("\n Get your API key at https://ui.hindsight.vectorize.io\n")
|
||||
print(f"\n Get your API key at https://ui.hindsight.vectorize.io\n")
|
||||
existing_key = os.environ.get("HINDSIGHT_API_KEY", "")
|
||||
if existing_key:
|
||||
masked = f"...{existing_key[-4:]}" if len(existing_key) > 4 else "set"
|
||||
@@ -592,19 +434,13 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
sys.stdout.write(" LLM API key: ")
|
||||
sys.stdout.flush()
|
||||
llm_key = getpass.getpass(prompt="") if sys.stdin.isatty() else sys.stdin.readline().strip()
|
||||
# Always write explicitly (including empty) so the provider sees ""
|
||||
# rather than a missing variable. The daemon reads from .env at
|
||||
# startup and fails when HINDSIGHT_LLM_API_KEY is unset.
|
||||
env_writes["HINDSIGHT_LLM_API_KEY"] = llm_key
|
||||
if llm_key:
|
||||
env_writes["HINDSIGHT_LLM_API_KEY"] = llm_key
|
||||
|
||||
# Step 4: Save everything
|
||||
provider_config["bank_id"] = "hermes"
|
||||
provider_config["recall_budget"] = "mid"
|
||||
# Read existing timeout from config if present, otherwise use default
|
||||
existing_timeout = self._config.get("timeout") if self._config else None
|
||||
timeout_val = existing_timeout if existing_timeout else _DEFAULT_TIMEOUT
|
||||
provider_config["timeout"] = timeout_val
|
||||
env_writes["HINDSIGHT_TIMEOUT"] = str(timeout_val)
|
||||
bank_id = "hermes"
|
||||
config["memory"]["provider"] = "hindsight"
|
||||
save_config(config)
|
||||
|
||||
@@ -630,32 +466,10 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
new_lines.append(f"{k}={v}")
|
||||
env_path.write_text("\n".join(new_lines) + "\n")
|
||||
|
||||
if mode == "local_embedded":
|
||||
materialized_config = dict(provider_config)
|
||||
config_path = Path(hermes_home) / "hindsight" / "config.json"
|
||||
try:
|
||||
materialized_config = json.loads(config_path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
llm_api_key = env_writes.get("HINDSIGHT_LLM_API_KEY", "")
|
||||
if not llm_api_key:
|
||||
llm_api_key = _load_simple_env(Path(hermes_home) / ".env").get("HINDSIGHT_LLM_API_KEY", "")
|
||||
if not llm_api_key:
|
||||
llm_api_key = _load_simple_env(_embedded_profile_env_path(materialized_config)).get(
|
||||
"HINDSIGHT_API_LLM_API_KEY",
|
||||
"",
|
||||
)
|
||||
|
||||
_materialize_embedded_profile_env(
|
||||
materialized_config,
|
||||
llm_api_key=llm_api_key or None,
|
||||
)
|
||||
|
||||
print(f"\n ✓ Hindsight memory configured ({mode} mode)")
|
||||
if env_writes:
|
||||
print(" API keys saved to .env")
|
||||
print("\n Start a new session to activate.\n")
|
||||
print(f" API keys saved to .env")
|
||||
print(f"\n Start a new session to activate.\n")
|
||||
|
||||
def get_config_schema(self):
|
||||
return [
|
||||
@@ -671,8 +485,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
{"key": "llm_base_url", "description": "Endpoint URL (e.g. http://192.168.1.10:8080/v1)", "default": "", "when": {"mode": "local_embedded", "llm_provider": "openai_compatible"}},
|
||||
{"key": "llm_api_key", "description": "LLM API key (optional for openai_compatible)", "secret": True, "env_var": "HINDSIGHT_LLM_API_KEY", "when": {"mode": "local_embedded"}},
|
||||
{"key": "llm_model", "description": "LLM model", "default": "gpt-4o-mini", "default_from": {"field": "llm_provider", "map": _PROVIDER_DEFAULT_MODELS}, "when": {"mode": "local_embedded"}},
|
||||
{"key": "bank_id", "description": "Memory bank name (static fallback when bank_id_template is unset)", "default": "hermes"},
|
||||
{"key": "bank_id_template", "description": "Optional template to derive bank_id dynamically. Placeholders: {profile}, {workspace}, {platform}, {user}, {session}. Example: hermes-{profile}", "default": ""},
|
||||
{"key": "bank_id", "description": "Memory bank name", "default": "hermes"},
|
||||
{"key": "bank_mission", "description": "Mission/purpose description for the memory bank"},
|
||||
{"key": "bank_retain_mission", "description": "Custom extraction prompt for memory retention"},
|
||||
{"key": "recall_budget", "description": "Recall thoroughness", "default": "mid", "choices": ["low", "mid", "high"]},
|
||||
@@ -692,19 +505,12 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
{"key": "recall_max_tokens", "description": "Maximum tokens for recall results", "default": 4096},
|
||||
{"key": "recall_max_input_chars", "description": "Maximum input query length for auto-recall", "default": 800},
|
||||
{"key": "recall_prompt_preamble", "description": "Custom preamble for recalled memories in context"},
|
||||
{"key": "timeout", "description": "API request timeout in seconds", "default": _DEFAULT_TIMEOUT},
|
||||
]
|
||||
|
||||
def _get_client(self):
|
||||
"""Return the cached Hindsight client (created once, reused)."""
|
||||
if self._client is None:
|
||||
if self._mode == "local_embedded":
|
||||
available, reason = _check_local_runtime()
|
||||
if not available:
|
||||
raise RuntimeError(
|
||||
"Hindsight local runtime is unavailable"
|
||||
+ (f": {reason}" if reason else "")
|
||||
)
|
||||
from hindsight import HindsightEmbedded
|
||||
HindsightEmbedded.__del__ = lambda self: None
|
||||
llm_provider = self._config.get("llm_provider", "")
|
||||
@@ -723,30 +529,16 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
self._client = HindsightEmbedded(**kwargs)
|
||||
else:
|
||||
from hindsight_client import Hindsight
|
||||
timeout = self._timeout or _DEFAULT_TIMEOUT
|
||||
kwargs = {"base_url": self._api_url, "timeout": float(timeout)}
|
||||
kwargs = {"base_url": self._api_url, "timeout": 30.0}
|
||||
if self._api_key:
|
||||
kwargs["api_key"] = self._api_key
|
||||
logger.debug("Creating Hindsight cloud client (url=%s, has_key=%s, timeout=%s)",
|
||||
self._api_url, bool(self._api_key), kwargs["timeout"])
|
||||
logger.debug("Creating Hindsight cloud client (url=%s, has_key=%s)",
|
||||
self._api_url, bool(self._api_key))
|
||||
self._client = Hindsight(**kwargs)
|
||||
return self._client
|
||||
|
||||
def _run_sync(self, coro):
|
||||
"""Schedule *coro* on the shared loop using the configured timeout."""
|
||||
return _run_sync(coro, timeout=self._timeout)
|
||||
|
||||
def initialize(self, session_id: str, **kwargs) -> None:
|
||||
self._session_id = str(session_id or "").strip()
|
||||
self._parent_session_id = str(kwargs.get("parent_session_id", "") or "").strip()
|
||||
|
||||
# Each process lifecycle gets its own document_id. Reusing session_id
|
||||
# alone caused overwrites on /resume — the reloaded session starts
|
||||
# with an empty _session_turns, so the next retain would replace the
|
||||
# previously stored content. session_id stays in tags so processes
|
||||
# for the same session remain filterable together.
|
||||
start_ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
||||
self._document_id = f"{self._session_id}-{start_ts}"
|
||||
|
||||
# Check client version and auto-upgrade if needed
|
||||
try:
|
||||
@@ -756,9 +548,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
if Version(installed) < Version(_MIN_CLIENT_VERSION):
|
||||
logger.warning("hindsight-client %s is outdated (need >=%s), attempting upgrade...",
|
||||
installed, _MIN_CLIENT_VERSION)
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import shutil, subprocess, sys
|
||||
uv_path = shutil.which("uv")
|
||||
if uv_path:
|
||||
try:
|
||||
@@ -785,41 +575,19 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
self._chat_type = str(kwargs.get("chat_type") or "").strip()
|
||||
self._thread_id = str(kwargs.get("thread_id") or "").strip()
|
||||
self._agent_identity = str(kwargs.get("agent_identity") or "").strip()
|
||||
self._agent_workspace = str(kwargs.get("agent_workspace") or "").strip()
|
||||
self._turn_index = 0
|
||||
self._session_turns = []
|
||||
self._mode = self._config.get("mode", "cloud")
|
||||
# Read timeout from config or env var, fall back to default
|
||||
self._timeout = self._config.get("timeout") or int(os.environ.get("HINDSIGHT_TIMEOUT", str(_DEFAULT_TIMEOUT)))
|
||||
# "local" is a legacy alias for "local_embedded"
|
||||
if self._mode == "local":
|
||||
self._mode = "local_embedded"
|
||||
if self._mode == "local_embedded":
|
||||
available, reason = _check_local_runtime()
|
||||
if not available:
|
||||
logger.warning(
|
||||
"Hindsight local mode disabled because its runtime could not be imported: %s",
|
||||
reason,
|
||||
)
|
||||
self._mode = "disabled"
|
||||
return
|
||||
self._api_key = self._config.get("apiKey") or self._config.get("api_key") or os.environ.get("HINDSIGHT_API_KEY", "")
|
||||
default_url = _DEFAULT_LOCAL_URL if self._mode in ("local_embedded", "local_external") else _DEFAULT_API_URL
|
||||
self._api_url = self._config.get("api_url") or os.environ.get("HINDSIGHT_API_URL", default_url)
|
||||
self._llm_base_url = self._config.get("llm_base_url", "")
|
||||
|
||||
banks = self._config.get("banks", {}).get("hermes", {})
|
||||
static_bank_id = self._config.get("bank_id") or banks.get("bankId", "hermes")
|
||||
self._bank_id_template = self._config.get("bank_id_template", "") or ""
|
||||
self._bank_id = _resolve_bank_id_template(
|
||||
self._bank_id_template,
|
||||
fallback=static_bank_id,
|
||||
profile=self._agent_identity,
|
||||
workspace=self._agent_workspace,
|
||||
platform=self._platform,
|
||||
user=self._user_id,
|
||||
session=self._session_id,
|
||||
)
|
||||
self._bank_id = self._config.get("bank_id") or banks.get("bankId", "hermes")
|
||||
budget = self._config.get("recall_budget") or self._config.get("budget") or banks.get("budget", "mid")
|
||||
self._budget = budget if budget in _VALID_BUDGETS else "mid"
|
||||
|
||||
@@ -872,10 +640,6 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
pass
|
||||
logger.info("Hindsight initialized: mode=%s, api_url=%s, bank=%s, budget=%s, memory_mode=%s, prefetch_method=%s, client=%s",
|
||||
self._mode, self._api_url, self._bank_id, self._budget, self._memory_mode, self._prefetch_method, _client_version)
|
||||
if self._bank_id_template:
|
||||
logger.debug("Hindsight bank resolved from template %r: profile=%s workspace=%s platform=%s user=%s -> bank=%s",
|
||||
self._bank_id_template, self._agent_identity, self._agent_workspace,
|
||||
self._platform, self._user_id, self._bank_id)
|
||||
logger.debug("Hindsight config: auto_retain=%s, auto_recall=%s, retain_every_n=%d, "
|
||||
"retain_async=%s, retain_context=%s, recall_max_tokens=%d, recall_max_input_chars=%d, tags=%s, recall_tags=%s",
|
||||
self._auto_retain, self._auto_recall, self._retain_every_n_turns,
|
||||
@@ -905,13 +669,42 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
# Update the profile .env to match our current config so
|
||||
# the daemon always starts with the right settings.
|
||||
# If the config changed and the daemon is running, stop it.
|
||||
profile_env = _embedded_profile_env_path(self._config)
|
||||
expected_env = _build_embedded_profile_env(self._config)
|
||||
saved = _load_simple_env(profile_env)
|
||||
config_changed = saved != expected_env
|
||||
from pathlib import Path as _Path
|
||||
profile_env = _Path.home() / ".hindsight" / "profiles" / f"{profile}.env"
|
||||
current_key = self._config.get("llm_api_key") or os.environ.get("HINDSIGHT_LLM_API_KEY", "")
|
||||
current_provider = self._config.get("llm_provider", "")
|
||||
current_model = self._config.get("llm_model", "")
|
||||
current_base_url = self._config.get("llm_base_url") or os.environ.get("HINDSIGHT_API_LLM_BASE_URL", "")
|
||||
# Map openai_compatible/openrouter → openai for the daemon (OpenAI wire format)
|
||||
daemon_provider = "openai" if current_provider in ("openai_compatible", "openrouter") else current_provider
|
||||
|
||||
# Read saved profile config
|
||||
saved = {}
|
||||
if profile_env.exists():
|
||||
for line in profile_env.read_text().splitlines():
|
||||
if "=" in line and not line.startswith("#"):
|
||||
k, v = line.split("=", 1)
|
||||
saved[k.strip()] = v.strip()
|
||||
|
||||
config_changed = (
|
||||
saved.get("HINDSIGHT_API_LLM_PROVIDER") != daemon_provider or
|
||||
saved.get("HINDSIGHT_API_LLM_MODEL") != current_model or
|
||||
saved.get("HINDSIGHT_API_LLM_API_KEY") != current_key or
|
||||
saved.get("HINDSIGHT_API_LLM_BASE_URL", "") != current_base_url
|
||||
)
|
||||
|
||||
if config_changed:
|
||||
profile_env = _materialize_embedded_profile_env(self._config)
|
||||
# Write updated profile .env
|
||||
profile_env.parent.mkdir(parents=True, exist_ok=True)
|
||||
env_lines = (
|
||||
f"HINDSIGHT_API_LLM_PROVIDER={daemon_provider}\n"
|
||||
f"HINDSIGHT_API_LLM_API_KEY={current_key}\n"
|
||||
f"HINDSIGHT_API_LLM_MODEL={current_model}\n"
|
||||
f"HINDSIGHT_API_LOG_LEVEL=info\n"
|
||||
)
|
||||
if current_base_url:
|
||||
env_lines += f"HINDSIGHT_API_LLM_BASE_URL={current_base_url}\n"
|
||||
profile_env.write_text(env_lines)
|
||||
if client._manager.is_running(profile):
|
||||
with open(log_path, "a") as f:
|
||||
f.write("\n=== Config changed, restarting daemon ===\n")
|
||||
@@ -984,7 +777,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
client = self._get_client()
|
||||
if self._prefetch_method == "reflect":
|
||||
logger.debug("Prefetch: calling reflect (bank=%s, query_len=%d)", self._bank_id, len(query))
|
||||
resp = self._run_sync(client.areflect(bank_id=self._bank_id, query=query, budget=self._budget))
|
||||
resp = _run_sync(client.areflect(bank_id=self._bank_id, query=query, budget=self._budget))
|
||||
text = resp.text or ""
|
||||
else:
|
||||
recall_kwargs: dict = {
|
||||
@@ -998,7 +791,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
recall_kwargs["types"] = self._recall_types
|
||||
logger.debug("Prefetch: calling recall (bank=%s, query_len=%d, budget=%s)",
|
||||
self._bank_id, len(query), self._budget)
|
||||
resp = self._run_sync(client.arecall(**recall_kwargs))
|
||||
resp = _run_sync(client.arecall(**recall_kwargs))
|
||||
num_results = len(resp.results) if resp.results else 0
|
||||
logger.debug("Prefetch: recall returned %d results", num_results)
|
||||
text = "\n".join(f"- {r.text}" for r in resp.results if r.text) if resp.results else ""
|
||||
@@ -1095,7 +888,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
if session_id:
|
||||
self._session_id = str(session_id).strip()
|
||||
|
||||
turn = json.dumps(self._build_turn_messages(user_content, assistant_content), ensure_ascii=False)
|
||||
turn = json.dumps(self._build_turn_messages(user_content, assistant_content))
|
||||
self._session_turns.append(turn)
|
||||
self._turn_counter += 1
|
||||
self._turn_index = self._turn_counter
|
||||
@@ -1109,12 +902,6 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
len(self._session_turns), sum(len(t) for t in self._session_turns))
|
||||
content = "[" + ",".join(self._session_turns) + "]"
|
||||
|
||||
lineage_tags: list[str] = []
|
||||
if self._session_id:
|
||||
lineage_tags.append(f"session:{self._session_id}")
|
||||
if self._parent_session_id:
|
||||
lineage_tags.append(f"parent:{self._parent_session_id}")
|
||||
|
||||
def _sync():
|
||||
try:
|
||||
client = self._get_client()
|
||||
@@ -1125,16 +912,15 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
message_count=len(self._session_turns) * 2,
|
||||
turn_index=self._turn_index,
|
||||
),
|
||||
tags=lineage_tags or None,
|
||||
)
|
||||
item.pop("bank_id", None)
|
||||
item.pop("retain_async", None)
|
||||
logger.debug("Hindsight retain: bank=%s, doc=%s, async=%s, content_len=%d, num_turns=%d",
|
||||
self._bank_id, self._document_id, self._retain_async, len(content), len(self._session_turns))
|
||||
self._run_sync(client.aretain_batch(
|
||||
self._bank_id, self._session_id, self._retain_async, len(content), len(self._session_turns))
|
||||
_run_sync(client.aretain_batch(
|
||||
bank_id=self._bank_id,
|
||||
items=[item],
|
||||
document_id=self._document_id,
|
||||
document_id=self._session_id,
|
||||
retain_async=self._retain_async,
|
||||
))
|
||||
logger.debug("Hindsight retain succeeded")
|
||||
@@ -1171,7 +957,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
)
|
||||
logger.debug("Tool hindsight_retain: bank=%s, content_len=%d, context=%s",
|
||||
self._bank_id, len(content), context)
|
||||
self._run_sync(client.aretain(**retain_kwargs))
|
||||
_run_sync(client.aretain(**retain_kwargs))
|
||||
logger.debug("Tool hindsight_retain: success")
|
||||
return json.dumps({"result": "Memory stored successfully."})
|
||||
except Exception as e:
|
||||
@@ -1194,7 +980,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
recall_kwargs["types"] = self._recall_types
|
||||
logger.debug("Tool hindsight_recall: bank=%s, query_len=%d, budget=%s",
|
||||
self._bank_id, len(query), self._budget)
|
||||
resp = self._run_sync(client.arecall(**recall_kwargs))
|
||||
resp = _run_sync(client.arecall(**recall_kwargs))
|
||||
num_results = len(resp.results) if resp.results else 0
|
||||
logger.debug("Tool hindsight_recall: %d results", num_results)
|
||||
if not resp.results:
|
||||
@@ -1212,7 +998,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
try:
|
||||
logger.debug("Tool hindsight_reflect: bank=%s, query_len=%d, budget=%s",
|
||||
self._bank_id, len(query), self._budget)
|
||||
resp = self._run_sync(client.areflect(
|
||||
resp = _run_sync(client.areflect(
|
||||
bank_id=self._bank_id, query=query, budget=self._budget
|
||||
))
|
||||
logger.debug("Tool hindsight_reflect: response_len=%d", len(resp.text or ""))
|
||||
@@ -1225,6 +1011,7 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
|
||||
def shutdown(self) -> None:
|
||||
logger.debug("Hindsight shutdown: waiting for background threads")
|
||||
global _loop, _loop_thread
|
||||
for t in (self._prefetch_thread, self._sync_thread):
|
||||
if t and t.is_alive():
|
||||
t.join(timeout=5.0)
|
||||
@@ -1239,21 +1026,17 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
except RuntimeError:
|
||||
pass
|
||||
else:
|
||||
self._run_sync(self._client.aclose())
|
||||
_run_sync(self._client.aclose())
|
||||
except Exception:
|
||||
pass
|
||||
self._client = None
|
||||
# The module-global background event loop (_loop / _loop_thread)
|
||||
# is intentionally NOT stopped here. It is shared across every
|
||||
# HindsightMemoryProvider instance in the process — the plugin
|
||||
# loader creates a new provider per AIAgent, and the gateway
|
||||
# creates one AIAgent per concurrent chat session. Stopping the
|
||||
# loop from one provider's shutdown() strands the aiohttp
|
||||
# ClientSession + TCPConnector owned by every sibling provider
|
||||
# on a dead loop, which surfaces as the "Unclosed client session"
|
||||
# / "Unclosed connector" warnings reported in #11923. The loop
|
||||
# runs on a daemon thread and is reclaimed on process exit;
|
||||
# per-session cleanup happens via self._client.aclose() above.
|
||||
# Stop the background event loop so no tasks are pending at exit
|
||||
if _loop is not None and _loop.is_running():
|
||||
_loop.call_soon_threadsafe(_loop.stop)
|
||||
if _loop_thread is not None:
|
||||
_loop_thread.join(timeout=5.0)
|
||||
_loop = None
|
||||
_loop_thread = None
|
||||
|
||||
|
||||
def register(ctx) -> None:
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
"""Spotify integration plugin — bundled, auto-loaded.
|
||||
|
||||
Registers 7 tools (playback, devices, queue, search, playlists, albums,
|
||||
library) into the ``spotify`` toolset. Each tool's handler is gated by
|
||||
``_check_spotify_available()`` — when the user has not run ``hermes auth
|
||||
spotify``, the tools remain registered (so they appear in ``hermes
|
||||
tools``) but the runtime check prevents dispatch.
|
||||
|
||||
Why a plugin instead of a top-level ``tools/`` file?
|
||||
|
||||
- ``plugins/`` is where third-party service integrations live (see
|
||||
``plugins/image_gen/`` for the backend-provider pattern, ``plugins/
|
||||
disk-cleanup/`` for the standalone pattern). ``tools/`` is reserved
|
||||
for foundational capabilities (terminal, read_file, web_search, etc.).
|
||||
- Mirroring the image_gen plugin layout (``plugins/<category>/<backend>/``
|
||||
for categories, flat ``plugins/<name>/`` for standalones) makes new
|
||||
service integrations a pattern contributors can copy.
|
||||
- Bundled + ``kind: backend`` auto-loads on startup just like image_gen
|
||||
backends — no user opt-in needed, no ``plugins.enabled`` config.
|
||||
|
||||
The Spotify auth flow (``hermes auth spotify``), CLI plumbing, and docs
|
||||
are unchanged. This move is purely structural.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from plugins.spotify.tools import (
|
||||
SPOTIFY_ALBUMS_SCHEMA,
|
||||
SPOTIFY_DEVICES_SCHEMA,
|
||||
SPOTIFY_LIBRARY_SCHEMA,
|
||||
SPOTIFY_PLAYBACK_SCHEMA,
|
||||
SPOTIFY_PLAYLISTS_SCHEMA,
|
||||
SPOTIFY_QUEUE_SCHEMA,
|
||||
SPOTIFY_SEARCH_SCHEMA,
|
||||
_check_spotify_available,
|
||||
_handle_spotify_albums,
|
||||
_handle_spotify_devices,
|
||||
_handle_spotify_library,
|
||||
_handle_spotify_playback,
|
||||
_handle_spotify_playlists,
|
||||
_handle_spotify_queue,
|
||||
_handle_spotify_search,
|
||||
)
|
||||
|
||||
# Registration table iterated by register(): one
# (tool name, schema, handler, emoji) tuple per Spotify tool.
_TOOLS = (
    ("spotify_playback", SPOTIFY_PLAYBACK_SCHEMA, _handle_spotify_playback, "🎵"),
    ("spotify_devices", SPOTIFY_DEVICES_SCHEMA, _handle_spotify_devices, "🔈"),
    ("spotify_queue", SPOTIFY_QUEUE_SCHEMA, _handle_spotify_queue, "📻"),
    ("spotify_search", SPOTIFY_SEARCH_SCHEMA, _handle_spotify_search, "🔎"),
    ("spotify_playlists", SPOTIFY_PLAYLISTS_SCHEMA, _handle_spotify_playlists, "📚"),
    ("spotify_albums", SPOTIFY_ALBUMS_SCHEMA, _handle_spotify_albums, "💿"),
    ("spotify_library", SPOTIFY_LIBRARY_SCHEMA, _handle_spotify_library, "❤️"),
)
|
||||
|
||||
|
||||
def register(ctx) -> None:
    """Register every entry of ``_TOOLS`` with the plugin context.

    Each tool is placed in the ``spotify`` toolset and shares the same
    availability gate, ``_check_spotify_available``.
    """
    for tool_name, tool_schema, tool_handler, tool_emoji in _TOOLS:
        registration = {
            "name": tool_name,
            "toolset": "spotify",
            "schema": tool_schema,
            "handler": tool_handler,
            "check_fn": _check_spotify_available,
            "emoji": tool_emoji,
        }
        ctx.register_tool(**registration)
|
||||
@@ -1,435 +0,0 @@
|
||||
"""Thin Spotify Web API helper used by Hermes native tools."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, Iterable, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from hermes_cli.auth import (
|
||||
AuthError,
|
||||
resolve_spotify_runtime_credentials,
|
||||
)
|
||||
|
||||
|
||||
class SpotifyError(RuntimeError):
    """Root of the Spotify tool exception hierarchy."""
|
||||
|
||||
|
||||
class SpotifyAuthRequiredError(SpotifyError):
    """Signals that Spotify credentials are missing and the user must authenticate."""
|
||||
|
||||
|
||||
class SpotifyAPIError(SpotifyError):
    """Spotify API failure carrying the HTTP status and raw response body."""

    def __init__(
        self,
        message: str,
        *,
        status_code: Optional[int] = None,
        response_body: Optional[str] = None,
    ) -> None:
        """Store *message* on the exception and keep the HTTP details as attributes."""
        super().__init__(message)
        # ``path`` starts unset; code that raises the error may assign it afterwards.
        self.path = None
        self.response_body = response_body
        self.status_code = status_code
|
||||
|
||||
|
||||
class SpotifyClient:
    """Synchronous helper for calling the Spotify Web API.

    Runtime credentials are resolved when the client is constructed and are
    re-resolved with ``force_refresh=True`` when a request is answered with
    HTTP 401. Every endpoint method is a thin wrapper around :meth:`request`.
    """

    def __init__(self) -> None:
        """Resolve runtime credentials up front (``refresh_if_expiring=True``)."""
        self._runtime = self._resolve_runtime(refresh_if_expiring=True)

    def _resolve_runtime(self, *, force_refresh: bool = False, refresh_if_expiring: bool = True) -> Dict[str, Any]:
        """Fetch the runtime credential dict.

        Raises:
            SpotifyAuthRequiredError: when credential resolution raises ``AuthError``.
        """
        try:
            return resolve_spotify_runtime_credentials(
                force_refresh=force_refresh,
                refresh_if_expiring=refresh_if_expiring,
            )
        except AuthError as exc:
            raise SpotifyAuthRequiredError(str(exc)) from exc

    @property
    def base_url(self) -> str:
        """API base URL from the runtime credentials, without a trailing slash."""
        return str(self._runtime.get("base_url") or "").rstrip("/")

    def _headers(self) -> Dict[str, str]:
        """Build the bearer-token and JSON content-type headers for a request."""
        return {
            "Authorization": f"Bearer {self._runtime['access_token']}",
            "Content-Type": "application/json",
        }

    def request(
        self,
        method: str,
        path: str,
        *,
        params: Optional[Dict[str, Any]] = None,
        json_body: Optional[Dict[str, Any]] = None,
        allow_retry_on_401: bool = True,
        empty_response: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Send one request to the Spotify API and decode the response.

        Args:
            method: HTTP verb.
            path: Path appended to :attr:`base_url`.
            params: Query parameters; ``None`` values are dropped.
            json_body: JSON payload; ``None`` values are dropped.
            allow_retry_on_401: When true, a 401 triggers one forced
                credential refresh followed by a single retry.
            empty_response: Payload returned instead of the generic marker
                when the response is 204 or has no content.

        Returns:
            The decoded JSON body, ``empty_response`` (or a generic success
            marker) for empty responses, or ``{"success": True, "text": ...}``
            for non-JSON bodies.

        Raises:
            SpotifyAuthRequiredError: if refreshing credentials fails.
            SpotifyAPIError: for any response with status >= 400.
        """
        url = f"{self.base_url}{path}"
        response = httpx.request(
            method,
            url,
            headers=self._headers(),
            params=_strip_none(params),
            json=_strip_none(json_body) if json_body is not None else None,
            timeout=30.0,
        )
        if response.status_code == 401 and allow_retry_on_401:
            self._runtime = self._resolve_runtime(force_refresh=True, refresh_if_expiring=True)
            # Forward empty_response too, otherwise a 204 after the token
            # refresh would lose the caller's custom "nothing playing" payload.
            return self.request(
                method,
                path,
                params=params,
                json_body=json_body,
                allow_retry_on_401=False,
                empty_response=empty_response,
            )
        if response.status_code >= 400:
            self._raise_api_error(response, method=method, path=path)
        if response.status_code == 204 or not response.content:
            return empty_response or {"success": True, "status_code": response.status_code, "empty": True}
        if "application/json" in response.headers.get("content-type", ""):
            return response.json()
        return {"success": True, "text": response.text}

    def _raise_api_error(self, response: httpx.Response, *, method: str, path: str) -> None:
        """Raise a ``SpotifyAPIError`` describing a failed *response*."""
        detail = response.text.strip()
        message = _friendly_spotify_error_message(
            status_code=response.status_code,
            detail=_extract_spotify_error_detail(response, fallback=detail),
            method=method,
            path=path,
            retry_after=response.headers.get("Retry-After"),
        )
        error = SpotifyAPIError(message, status_code=response.status_code, response_body=detail)
        error.path = path
        raise error

    def get_devices(self) -> Any:
        """GET /me/player/devices."""
        return self.request("GET", "/me/player/devices")

    def transfer_playback(self, *, device_id: str, play: bool = False) -> Any:
        """PUT /me/player with a single target device."""
        return self.request("PUT", "/me/player", json_body={
            "device_ids": [device_id],
            "play": play,
        })

    def get_playback_state(self, *, market: Optional[str] = None) -> Any:
        """GET /me/player; an empty response yields a "no active session" message."""
        return self.request(
            "GET",
            "/me/player",
            params={"market": market},
            empty_response={
                "status_code": 204,
                "empty": True,
                "message": "No active Spotify playback session was found. Open Spotify on a device and start playback, or transfer playback to an available device.",
            },
        )

    def get_currently_playing(self, *, market: Optional[str] = None) -> Any:
        """GET /me/player/currently-playing; an empty response yields a "not playing" message."""
        return self.request(
            "GET",
            "/me/player/currently-playing",
            params={"market": market},
            empty_response={
                "status_code": 204,
                "empty": True,
                "message": "Spotify is not currently playing anything. Start playback in Spotify and try again.",
            },
        )

    def start_playback(
        self,
        *,
        device_id: Optional[str] = None,
        context_uri: Optional[str] = None,
        uris: Optional[list[str]] = None,
        offset: Optional[Dict[str, Any]] = None,
        position_ms: Optional[int] = None,
    ) -> Any:
        """PUT /me/player/play; unset body fields are omitted from the payload."""
        return self.request(
            "PUT",
            "/me/player/play",
            params={"device_id": device_id},
            json_body={
                "context_uri": context_uri,
                "uris": uris,
                "offset": offset,
                "position_ms": position_ms,
            },
        )

    def pause_playback(self, *, device_id: Optional[str] = None) -> Any:
        """PUT /me/player/pause."""
        return self.request("PUT", "/me/player/pause", params={"device_id": device_id})

    def skip_next(self, *, device_id: Optional[str] = None) -> Any:
        """POST /me/player/next."""
        return self.request("POST", "/me/player/next", params={"device_id": device_id})

    def skip_previous(self, *, device_id: Optional[str] = None) -> Any:
        """POST /me/player/previous."""
        return self.request("POST", "/me/player/previous", params={"device_id": device_id})

    def seek(self, *, position_ms: int, device_id: Optional[str] = None) -> Any:
        """PUT /me/player/seek."""
        return self.request("PUT", "/me/player/seek", params={
            "position_ms": position_ms,
            "device_id": device_id,
        })

    def set_repeat(self, *, state: str, device_id: Optional[str] = None) -> Any:
        """PUT /me/player/repeat."""
        return self.request("PUT", "/me/player/repeat", params={"state": state, "device_id": device_id})

    def set_shuffle(self, *, state: bool, device_id: Optional[str] = None) -> Any:
        """PUT /me/player/shuffle; *state* is sent as ``"true"`` or ``"false"``."""
        return self.request("PUT", "/me/player/shuffle", params={"state": str(bool(state)).lower(), "device_id": device_id})

    def set_volume(self, *, volume_percent: int, device_id: Optional[str] = None) -> Any:
        """PUT /me/player/volume."""
        return self.request("PUT", "/me/player/volume", params={
            "volume_percent": volume_percent,
            "device_id": device_id,
        })

    def get_queue(self) -> Any:
        """GET /me/player/queue."""
        return self.request("GET", "/me/player/queue")

    def add_to_queue(self, *, uri: str, device_id: Optional[str] = None) -> Any:
        """POST /me/player/queue."""
        return self.request("POST", "/me/player/queue", params={"uri": uri, "device_id": device_id})

    def search(
        self,
        *,
        query: str,
        search_types: list[str],
        limit: int = 10,
        offset: int = 0,
        market: Optional[str] = None,
        include_external: Optional[str] = None,
    ) -> Any:
        """GET /search; *search_types* are joined with commas into ``type``."""
        return self.request("GET", "/search", params={
            "q": query,
            "type": ",".join(search_types),
            "limit": limit,
            "offset": offset,
            "market": market,
            "include_external": include_external,
        })

    def get_my_playlists(self, *, limit: int = 20, offset: int = 0) -> Any:
        """GET /me/playlists."""
        return self.request("GET", "/me/playlists", params={"limit": limit, "offset": offset})

    def get_playlist(self, *, playlist_id: str, market: Optional[str] = None) -> Any:
        """GET /playlists/{playlist_id}."""
        return self.request("GET", f"/playlists/{playlist_id}", params={"market": market})

    def create_playlist(
        self,
        *,
        name: str,
        public: bool = False,
        collaborative: bool = False,
        description: Optional[str] = None,
    ) -> Any:
        """POST /me/playlists."""
        return self.request("POST", "/me/playlists", json_body={
            "name": name,
            "public": public,
            "collaborative": collaborative,
            "description": description,
        })

    def add_playlist_items(
        self,
        *,
        playlist_id: str,
        uris: list[str],
        position: Optional[int] = None,
    ) -> Any:
        """POST /playlists/{playlist_id}/items."""
        return self.request("POST", f"/playlists/{playlist_id}/items", json_body={
            "uris": uris,
            "position": position,
        })

    def remove_playlist_items(
        self,
        *,
        playlist_id: str,
        uris: list[str],
        snapshot_id: Optional[str] = None,
    ) -> Any:
        """DELETE /playlists/{playlist_id}/items; each URI is wrapped as ``{"uri": ...}``."""
        return self.request("DELETE", f"/playlists/{playlist_id}/items", json_body={
            "items": [{"uri": uri} for uri in uris],
            "snapshot_id": snapshot_id,
        })

    def update_playlist_details(
        self,
        *,
        playlist_id: str,
        name: Optional[str] = None,
        public: Optional[bool] = None,
        collaborative: Optional[bool] = None,
        description: Optional[str] = None,
    ) -> Any:
        """PUT /playlists/{playlist_id}; unset fields are omitted from the payload."""
        return self.request("PUT", f"/playlists/{playlist_id}", json_body={
            "name": name,
            "public": public,
            "collaborative": collaborative,
            "description": description,
        })

    def get_album(self, *, album_id: str, market: Optional[str] = None) -> Any:
        """GET /albums/{album_id}."""
        return self.request("GET", f"/albums/{album_id}", params={"market": market})

    def get_album_tracks(self, *, album_id: str, limit: int = 20, offset: int = 0, market: Optional[str] = None) -> Any:
        """GET /albums/{album_id}/tracks."""
        return self.request("GET", f"/albums/{album_id}/tracks", params={
            "limit": limit,
            "offset": offset,
            "market": market,
        })

    def get_saved_tracks(self, *, limit: int = 20, offset: int = 0, market: Optional[str] = None) -> Any:
        """GET /me/tracks."""
        return self.request("GET", "/me/tracks", params={"limit": limit, "offset": offset, "market": market})

    def save_library_items(self, *, uris: list[str]) -> Any:
        """PUT /me/library with the URIs comma-joined in the query string."""
        return self.request("PUT", "/me/library", params={"uris": ",".join(uris)})

    def library_contains(self, *, uris: list[str]) -> Any:
        """GET /me/library/contains with the URIs comma-joined in the query string."""
        return self.request("GET", "/me/library/contains", params={"uris": ",".join(uris)})

    def get_saved_albums(self, *, limit: int = 20, offset: int = 0, market: Optional[str] = None) -> Any:
        """GET /me/albums."""
        return self.request("GET", "/me/albums", params={"limit": limit, "offset": offset, "market": market})

    def remove_saved_tracks(self, *, track_ids: list[str]) -> Any:
        """DELETE /me/library for the given track IDs (sent as ``spotify:track:`` URIs)."""
        uris = [f"spotify:track:{track_id}" for track_id in track_ids]
        return self.request("DELETE", "/me/library", params={"uris": ",".join(uris)})

    def remove_saved_albums(self, *, album_ids: list[str]) -> Any:
        """DELETE /me/library for the given album IDs (sent as ``spotify:album:`` URIs)."""
        uris = [f"spotify:album:{album_id}" for album_id in album_ids]
        return self.request("DELETE", "/me/library", params={"uris": ",".join(uris)})

    def get_recently_played(
        self,
        *,
        limit: int = 20,
        after: Optional[int] = None,
        before: Optional[int] = None,
    ) -> Any:
        """GET /me/player/recently-played."""
        return self.request("GET", "/me/player/recently-played", params={
            "limit": limit,
            "after": after,
            "before": before,
        })
|
||||
|
||||
|
||||
def _extract_spotify_error_detail(response: httpx.Response, *, fallback: str) -> str:
|
||||
detail = fallback
|
||||
try:
|
||||
payload = response.json()
|
||||
if isinstance(payload, dict):
|
||||
error_obj = payload.get("error")
|
||||
if isinstance(error_obj, dict):
|
||||
detail = str(error_obj.get("message") or detail)
|
||||
elif isinstance(error_obj, str):
|
||||
detail = error_obj
|
||||
except Exception:
|
||||
pass
|
||||
return detail.strip()
|
||||
|
||||
|
||||
def _friendly_spotify_error_message(
|
||||
*,
|
||||
status_code: int,
|
||||
detail: str,
|
||||
method: str,
|
||||
path: str,
|
||||
retry_after: Optional[str],
|
||||
) -> str:
|
||||
normalized_detail = detail.lower()
|
||||
is_playback_path = path.startswith("/me/player")
|
||||
|
||||
if status_code == 401:
|
||||
return "Spotify authentication failed or expired. Run `hermes auth spotify` again."
|
||||
|
||||
if status_code == 403:
|
||||
if is_playback_path:
|
||||
return (
|
||||
"Spotify rejected this playback request. Playback control usually requires a Spotify Premium account "
|
||||
"and an active Spotify Connect device."
|
||||
)
|
||||
if "scope" in normalized_detail or "permission" in normalized_detail:
|
||||
return "Spotify rejected the request because the current auth scope is insufficient. Re-run `hermes auth spotify` to refresh permissions."
|
||||
return "Spotify rejected the request. The account may not have permission for this action."
|
||||
|
||||
if status_code == 404:
|
||||
if is_playback_path:
|
||||
return "Spotify could not find an active playback device or player session for this request."
|
||||
return "Spotify resource not found."
|
||||
|
||||
if status_code == 429:
|
||||
message = "Spotify rate limit exceeded."
|
||||
if retry_after:
|
||||
message += f" Retry after {retry_after} seconds."
|
||||
return message
|
||||
|
||||
if detail:
|
||||
return detail
|
||||
return f"Spotify API request failed with status {status_code}."
|
||||
|
||||
|
||||
def _strip_none(payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
if not payload:
|
||||
return {}
|
||||
return {key: value for key, value in payload.items() if value is not None}
|
||||
|
||||
|
||||
def normalize_spotify_id(value: str, expected_type: Optional[str] = None) -> str:
    """Extract the bare Spotify ID from an ID, ``spotify:`` URI, or open.spotify.com URL.

    Args:
        value: A raw ID, a ``spotify:<type>:<id>`` URI, or an
            ``open.spotify.com/<type>/<id>`` URL. The URL may omit its scheme
            and may carry a locale prefix such as ``/intl-de/``.
        expected_type: When given, the item type found in a URI or URL must
            equal it.

    Returns:
        The ID portion of *value*, or the stripped input when it matches
        neither the URI nor the URL form.

    Raises:
        SpotifyError: if *value* is empty, or its type differs from
            *expected_type*.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        raise SpotifyError("Spotify id/uri/url is required.")

    if cleaned.startswith("spotify:"):
        parts = cleaned.split(":")
        if len(parts) >= 3:
            item_type = parts[1]
            if expected_type and item_type != expected_type:
                raise SpotifyError(f"Expected a Spotify {expected_type}, got {item_type}.")
            return parts[2]

    if "open.spotify.com" in cleaned:
        parsed = urlparse(cleaned)
        if not parsed.netloc:
            # Without a scheme urlparse puts the host into the path, which
            # would make "open.spotify.com" look like the item type.
            parsed = urlparse(f"https://{cleaned}")
        path_parts = [part for part in parsed.path.split("/") if part]
        # Share links can be prefixed with a locale segment
        # (e.g. /intl-de/track/<id>); skip it so type and id line up.
        if path_parts and path_parts[0].startswith("intl-"):
            path_parts = path_parts[1:]
        if len(path_parts) >= 2:
            item_type, item_id = path_parts[0], path_parts[1]
            if expected_type and item_type != expected_type:
                raise SpotifyError(f"Expected a Spotify {expected_type}, got {item_type}.")
            return item_id

    return cleaned
|
||||
|
||||
|
||||
def normalize_spotify_uri(value: str, expected_type: Optional[str] = None) -> str:
    """Normalize an ID, URL, or URI into ``spotify:<type>:<id>`` form where possible.

    Input that already starts with ``spotify:`` is returned stripped, after
    its type is checked against *expected_type* when one is given. Any other
    input is passed through ``normalize_spotify_id``; the result is rebuilt
    as a URI only when *expected_type* is given, otherwise the stripped
    input is returned as-is.

    Raises:
        SpotifyError: if *value* is empty or its type differs from
            *expected_type*.
    """
    text = (value or "").strip()
    if not text:
        raise SpotifyError("Spotify URI/url/id is required.")

    if not text.startswith("spotify:"):
        bare_id = normalize_spotify_id(text, expected_type)
        return f"spotify:{expected_type}:{bare_id}" if expected_type else text

    # Already a URI: only the type segment needs validating.
    segments = text.split(":") if expected_type else []
    if len(segments) >= 3 and segments[1] != expected_type:
        raise SpotifyError(f"Expected a Spotify {expected_type}, got {segments[1]}.")
    return text
|
||||
|
||||
|
||||
def normalize_spotify_uris(values: Iterable[str], expected_type: Optional[str] = None) -> list[str]:
    """Normalize each value to a Spotify URI and drop duplicates, keeping first-seen order.

    Raises:
        SpotifyError: if a value fails normalization or *values* yields
            nothing.
    """
    # dict.fromkeys keeps insertion order, so it acts as an ordered set here.
    unique = dict.fromkeys(
        normalize_spotify_uri(str(item), expected_type) for item in values
    )
    if not unique:
        raise SpotifyError("At least one Spotify item is required.")
    return list(unique)
|
||||
|
||||
|
||||
def compact_json(data: Any) -> str:
    """Serialize *data* to JSON, leaving non-ASCII characters unescaped."""
    serialized = json.dumps(data, ensure_ascii=False)
    return serialized
|
||||
@@ -1,13 +0,0 @@
|
||||
name: spotify
|
||||
version: 1.0.0
|
||||
description: "Native Spotify integration — 7 tools (playback, devices, queue, search, playlists, albums, library) using Spotify Web API + PKCE OAuth. Auth via `hermes auth spotify`. Tools gate on `providers.spotify` in ~/.hermes/auth.json."
|
||||
author: NousResearch
|
||||
kind: backend
|
||||
provides_tools:
|
||||
- spotify_playback
|
||||
- spotify_devices
|
||||
- spotify_queue
|
||||
- spotify_search
|
||||
- spotify_playlists
|
||||
- spotify_albums
|
||||
- spotify_library
|
||||
@@ -1,454 +0,0 @@
|
||||
"""Native Spotify tools for Hermes (registered via plugins/spotify)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from hermes_cli.auth import get_auth_status
|
||||
from plugins.spotify.client import (
|
||||
SpotifyAPIError,
|
||||
SpotifyAuthRequiredError,
|
||||
SpotifyClient,
|
||||
SpotifyError,
|
||||
normalize_spotify_id,
|
||||
normalize_spotify_uri,
|
||||
normalize_spotify_uris,
|
||||
)
|
||||
from tools.registry import tool_error, tool_result
|
||||
|
||||
|
||||
def _check_spotify_available() -> bool:
|
||||
try:
|
||||
return bool(get_auth_status("spotify").get("logged_in"))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _spotify_client() -> SpotifyClient:
    """Build a fresh ``SpotifyClient`` using its default constructor arguments."""
    client = SpotifyClient()
    return client
|
||||
|
||||
|
||||
def _spotify_tool_error(exc: Exception) -> str:
    """Convert an exception raised by a Spotify handler into a tool error payload.

    Args:
        exc: The exception caught by the handler.

    Returns:
        The value produced by ``tool_error``: with ``status_code`` attached for
        ``SpotifyAPIError``, with the plain message for other Spotify errors,
        and with a generic "Spotify tool failed" prefix for anything else.
    """
    # Test the most specific type first. NOTE(review): if SpotifyAPIError
    # subclasses SpotifyError (its definition is not in this module; confirm in
    # plugins.spotify.client), the broader check would match first and the
    # status_code would never be reported.
    if isinstance(exc, SpotifyAPIError):
        return tool_error(str(exc), status_code=exc.status_code)
    if isinstance(exc, (SpotifyError, SpotifyAuthRequiredError)):
        return tool_error(str(exc))
    return tool_error(f"Spotify tool failed: {type(exc).__name__}: {exc}")
|
||||
|
||||
|
||||
def _coerce_limit(raw: Any, *, default: int = 20, minimum: int = 1, maximum: int = 50) -> int:
|
||||
try:
|
||||
value = int(raw)
|
||||
except Exception:
|
||||
value = default
|
||||
return max(minimum, min(maximum, value))
|
||||
|
||||
|
||||
def _coerce_bool(raw: Any, default: bool = False) -> bool:
|
||||
if isinstance(raw, bool):
|
||||
return raw
|
||||
if isinstance(raw, str):
|
||||
cleaned = raw.strip().lower()
|
||||
if cleaned in {"1", "true", "yes", "on"}:
|
||||
return True
|
||||
if cleaned in {"0", "false", "no", "off"}:
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def _as_list(raw: Any) -> List[str]:
|
||||
if raw is None:
|
||||
return []
|
||||
if isinstance(raw, list):
|
||||
return [str(item).strip() for item in raw if str(item).strip()]
|
||||
return [str(raw).strip()] if str(raw).strip() else []
|
||||
|
||||
|
||||
def _describe_empty_playback(payload: Any, *, action: str) -> dict | None:
|
||||
if not isinstance(payload, dict) or not payload.get("empty"):
|
||||
return None
|
||||
if action == "get_currently_playing":
|
||||
return {
|
||||
"success": True,
|
||||
"action": action,
|
||||
"is_playing": False,
|
||||
"status_code": payload.get("status_code", 204),
|
||||
"message": payload.get("message") or "Spotify is not currently playing anything.",
|
||||
}
|
||||
if action == "get_state":
|
||||
return {
|
||||
"success": True,
|
||||
"action": action,
|
||||
"has_active_device": False,
|
||||
"status_code": payload.get("status_code", 204),
|
||||
"message": payload.get("message") or "No active Spotify playback session was found.",
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def _handle_spotify_playback(args: dict, **kw) -> str:
    """Dispatch a ``spotify_playback`` tool call to the Spotify client.

    ``args["action"]`` selects the operation and defaults to ``get_state``.
    Supported actions: ``get_state``, ``get_currently_playing``, ``play``,
    ``pause``, ``next``, ``previous``, ``seek``, ``set_repeat``,
    ``set_shuffle``, ``set_volume`` and ``recently_played``.

    Args:
        args: Tool arguments (``action`` plus action-specific keys such as
            ``device_id``, ``uris``, ``context_uri``, ``position_ms``).
        **kw: Accepted for handler-signature compatibility; unused here.

    Returns:
        The value produced by ``tool_result`` on success or ``tool_error`` on
        validation failure. Exceptions raised during dispatch are converted
        by ``_spotify_tool_error``.
    """
    action = str(args.get("action") or "get_state").strip().lower()
    client = _spotify_client()

    def _done(result: Any) -> str:
        # Shared success envelope for the state-changing actions.
        return tool_result({"success": True, "action": action, "result": result})

    try:
        if action == "get_state":
            payload = client.get_playback_state(market=args.get("market"))
            empty_result = _describe_empty_playback(payload, action=action)
            return tool_result(empty_result or payload)

        if action == "get_currently_playing":
            payload = client.get_currently_playing(market=args.get("market"))
            empty_result = _describe_empty_playback(payload, action=action)
            return tool_result(empty_result or payload)

        if action == "play":
            offset = args.get("offset")
            if isinstance(offset, dict):
                # Drop unset keys so only explicit offset fields are forwarded.
                payload_offset = {k: v for k, v in offset.items() if v is not None}
            else:
                payload_offset = None
            uris = normalize_spotify_uris(_as_list(args.get("uris")), "track") if args.get("uris") else None
            context_uri = None
            if args.get("context_uri"):
                raw_context = str(args.get("context_uri"))
                # Infer the context type from either a spotify: URI or a URL
                # path segment; first match wins (album, playlist, artist).
                context_type = None
                for candidate in ("album", "playlist", "artist"):
                    if raw_context.startswith(f"spotify:{candidate}:") or f"/{candidate}/" in raw_context:
                        context_type = candidate
                        break
                context_uri = normalize_spotify_uri(raw_context, context_type)
            return _done(client.start_playback(
                device_id=args.get("device_id"),
                context_uri=context_uri,
                uris=uris,
                offset=payload_offset,
                position_ms=args.get("position_ms"),
            ))

        if action == "pause":
            return _done(client.pause_playback(device_id=args.get("device_id")))

        if action == "next":
            return _done(client.skip_next(device_id=args.get("device_id")))

        if action == "previous":
            return _done(client.skip_previous(device_id=args.get("device_id")))

        if action == "seek":
            if args.get("position_ms") is None:
                return tool_error("position_ms is required for action='seek'")
            return _done(client.seek(position_ms=int(args["position_ms"]), device_id=args.get("device_id")))

        if action == "set_repeat":
            state = str(args.get("state") or "").strip().lower()
            if state not in {"track", "context", "off"}:
                return tool_error("state must be one of: track, context, off")
            return _done(client.set_repeat(state=state, device_id=args.get("device_id")))

        if action == "set_shuffle":
            return _done(client.set_shuffle(state=_coerce_bool(args.get("state")), device_id=args.get("device_id")))

        if action == "set_volume":
            if args.get("volume_percent") is None:
                return tool_error("volume_percent is required for action='set_volume'")
            # Clamp to the 0-100 range before sending.
            volume = max(0, min(100, int(args["volume_percent"])))
            return _done(client.set_volume(volume_percent=volume, device_id=args.get("device_id")))

        if action == "recently_played":
            after = args.get("after")
            before = args.get("before")
            # Use the same "is not None" test as the forwarding below, so a
            # cursor of 0 still counts as provided and cannot slip past the
            # exclusivity guard.
            if after is not None and before is not None:
                return tool_error("Provide only one of 'after' or 'before'")
            return tool_result(client.get_recently_played(
                limit=_coerce_limit(args.get("limit"), default=20),
                after=int(after) if after is not None else None,
                before=int(before) if before is not None else None,
            ))

        return tool_error(f"Unknown spotify_playback action: {action}")
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_devices(args: dict, **kw) -> str:
    """Handle the ``spotify_devices`` tool: list devices or transfer playback.

    ``action`` defaults to ``list``. ``transfer`` requires a non-empty
    ``device_id``; ``play`` is coerced with ``_coerce_bool``. Exceptions raised
    during dispatch are converted by ``_spotify_tool_error``.
    """
    action = str(args.get("action") or "list").strip().lower()
    client = _spotify_client()
    try:
        if action == "list":
            devices = client.get_devices()
            return tool_result(devices)
        if action != "transfer":
            return tool_error(f"Unknown spotify_devices action: {action}")

        target_device = str(args.get("device_id") or "").strip()
        if not target_device:
            return tool_error("device_id is required for action='transfer'")
        outcome = client.transfer_playback(
            device_id=target_device,
            play=_coerce_bool(args.get("play")),
        )
        return tool_result({"success": True, "action": action, "result": outcome})
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_queue(args: dict, **kw) -> str:
    """Handle the ``spotify_queue`` tool: read the queue or append one item.

    ``action`` defaults to ``get``. For ``add`` the ``uri`` argument is
    normalized with ``normalize_spotify_uri`` (no expected type) and echoed
    back in the result. Exceptions raised during dispatch are converted by
    ``_spotify_tool_error``.
    """
    action = str(args.get("action") or "get").strip().lower()
    client = _spotify_client()
    try:
        if action == "get":
            queue = client.get_queue()
            return tool_result(queue)
        if action != "add":
            return tool_error(f"Unknown spotify_queue action: {action}")

        raw_uri = str(args.get("uri") or "")
        uri = normalize_spotify_uri(raw_uri, None)
        outcome = client.add_to_queue(uri=uri, device_id=args.get("device_id"))
        return tool_result({"success": True, "action": action, "uri": uri, "result": outcome})
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_search(args: dict, **kw) -> str:
    """Handle the ``spotify_search`` tool.

    Requires a non-empty ``query``. Item types come from ``types`` (or
    ``type``, defaulting to ``track``) and are filtered case-insensitively
    against the supported set; an empty filtered list is an error. ``limit``
    goes through ``_coerce_limit`` (default 10) and ``offset`` is floored at 0.
    Exceptions raised by the search call are converted by
    ``_spotify_tool_error``.
    """
    client = _spotify_client()

    query = str(args.get("query") or "").strip()
    if not query:
        return tool_error("query is required")

    supported = {"album", "artist", "playlist", "track", "show", "episode", "audiobook"}
    requested = _as_list(args.get("types") or args.get("type") or ["track"])
    lowered = (entry.lower() for entry in requested)
    search_types = [kind for kind in lowered if kind in supported]
    if not search_types:
        return tool_error("types must contain one or more of: album, artist, playlist, track, show, episode, audiobook")

    try:
        found = client.search(
            query=query,
            search_types=search_types,
            limit=_coerce_limit(args.get("limit"), default=10),
            offset=max(0, int(args.get("offset") or 0)),
            market=args.get("market"),
            include_external=args.get("include_external"),
        )
        return tool_result(found)
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_playlists(args: dict, **kw) -> str:
    """Handle the ``spotify_playlists`` tool.

    ``action`` defaults to ``list``. Supported actions: ``list``, ``get``,
    ``create``, ``add_items``, ``remove_items`` and ``update_details``. Every
    action except ``list`` and ``create`` normalizes ``playlist_id`` via
    ``normalize_spotify_id``. Exceptions raised during dispatch are converted
    by ``_spotify_tool_error``.
    """
    action = str(args.get("action") or "list").strip().lower()
    client = _spotify_client()

    def _playlist_id() -> str:
        # Resolved lazily so only the actions that need it validate it.
        return normalize_spotify_id(str(args.get("playlist_id") or ""), "playlist")

    try:
        if action == "list":
            page_size = _coerce_limit(args.get("limit"), default=20)
            start = max(0, int(args.get("offset") or 0))
            return tool_result(client.get_my_playlists(limit=page_size, offset=start))

        if action == "get":
            target = _playlist_id()
            return tool_result(client.get_playlist(playlist_id=target, market=args.get("market")))

        if action == "create":
            name = str(args.get("name") or "").strip()
            if not name:
                return tool_error("name is required for action='create'")
            created = client.create_playlist(
                name=name,
                public=_coerce_bool(args.get("public")),
                collaborative=_coerce_bool(args.get("collaborative")),
                description=args.get("description"),
            )
            return tool_result(created)

        if action == "add_items":
            target = _playlist_id()
            item_uris = normalize_spotify_uris(_as_list(args.get("uris")))
            added = client.add_playlist_items(
                playlist_id=target,
                uris=item_uris,
                position=args.get("position"),
            )
            return tool_result(added)

        if action == "remove_items":
            target = _playlist_id()
            item_uris = normalize_spotify_uris(_as_list(args.get("uris")))
            removed = client.remove_playlist_items(
                playlist_id=target,
                uris=item_uris,
                snapshot_id=args.get("snapshot_id"),
            )
            return tool_result(removed)

        if action == "update_details":
            target = _playlist_id()
            # Values are forwarded as given (not coerced).
            updated = client.update_playlist_details(
                playlist_id=target,
                name=args.get("name"),
                public=args.get("public"),
                collaborative=args.get("collaborative"),
                description=args.get("description"),
            )
            return tool_result(updated)

        return tool_error(f"Unknown spotify_playlists action: {action}")
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_albums(args: dict, **kw) -> str:
    """Handle the ``spotify_albums`` tool: album metadata or album tracks.

    ``action`` defaults to ``get``. The album id is read from ``album_id`` (or
    ``id``) and normalized before the action is examined, so an invalid id is
    reported even for an unknown action. Exceptions raised during dispatch are
    converted by ``_spotify_tool_error``.
    """
    action = str(args.get("action") or "get").strip().lower()
    client = _spotify_client()
    try:
        raw_id = str(args.get("album_id") or args.get("id") or "")
        album_id = normalize_spotify_id(raw_id, "album")

        if action == "get":
            album = client.get_album(album_id=album_id, market=args.get("market"))
            return tool_result(album)

        if action == "tracks":
            tracks = client.get_album_tracks(
                album_id=album_id,
                limit=_coerce_limit(args.get("limit"), default=20),
                offset=max(0, int(args.get("offset") or 0)),
                market=args.get("market"),
            )
            return tool_result(tracks)

        return tool_error(f"Unknown spotify_albums action: {action}")
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
def _handle_spotify_library(args: dict, **kw) -> str:
    """Handle the ``spotify_library`` tool for saved tracks and saved albums.

    ``kind`` must be ``tracks`` or ``albums``. ``action`` defaults to ``list``
    and may also be ``save`` (URIs from ``uris`` or ``items``) or ``remove``
    (ids from ``ids`` or ``items``). Exceptions raised during dispatch are
    converted by ``_spotify_tool_error``.
    """
    kind = str(args.get("kind") or "").strip().lower()
    if kind not in {"tracks", "albums"}:
        return tool_error("kind must be one of: tracks, albums")

    action = str(args.get("action") or "list").strip().lower()
    is_tracks = kind == "tracks"
    item_type = "track" if is_tracks else "album"
    client = _spotify_client()

    try:
        if action == "list":
            paging = {
                "limit": _coerce_limit(args.get("limit"), default=20),
                "offset": max(0, int(args.get("offset") or 0)),
                "market": args.get("market"),
            }
            fetch = client.get_saved_tracks if is_tracks else client.get_saved_albums
            return tool_result(fetch(**paging))

        if action == "save":
            raw_uris = _as_list(args.get("uris") or args.get("items"))
            uris = normalize_spotify_uris(raw_uris, item_type)
            return tool_result(client.save_library_items(uris=uris))

        if action == "remove":
            raw_ids = _as_list(args.get("ids") or args.get("items"))
            ids = [normalize_spotify_id(entry, item_type) for entry in raw_ids]
            if not ids:
                return tool_error("ids/items is required for action='remove'")
            if is_tracks:
                return tool_result(client.remove_saved_tracks(track_ids=ids))
            return tool_result(client.remove_saved_albums(album_ids=ids))

        return tool_error(f"Unknown spotify_library action: {action}")
    except Exception as exc:
        return _spotify_tool_error(exc)
|
||||
|
||||
|
||||
# Shared JSON-schema fragment for plain string parameters. The same dict object
# is referenced by every schema below.
COMMON_STRING = {"type": "string"}

# Parameter schema for the spotify_playback tool. The "action" enum lists the
# actions dispatched by _handle_spotify_playback.
SPOTIFY_PLAYBACK_SCHEMA = {
    "name": "spotify_playback",
    "description": "Control Spotify playback, inspect the active playback state, or fetch recently played tracks.",
    "parameters": {
        "type": "object",
        "properties": {
            "action": {"type": "string", "enum": ["get_state", "get_currently_playing", "play", "pause", "next", "previous", "seek", "set_repeat", "set_shuffle", "set_volume", "recently_played"]},
            "device_id": COMMON_STRING,
            "market": COMMON_STRING,
            "context_uri": COMMON_STRING,
            "uris": {"type": "array", "items": COMMON_STRING},
            "offset": {"type": "object"},
            "position_ms": {"type": "integer"},
            # "state" is shared by set_repeat (string) and set_shuffle (boolean-like).
            "state": {"description": "For set_repeat use track/context/off. For set_shuffle use boolean-like true/false.", "oneOf": [{"type": "string"}, {"type": "boolean"}]},
            "volume_percent": {"type": "integer"},
            "limit": {"type": "integer", "description": "For recently_played: number of tracks (max 50)"},
            "after": {"type": "integer", "description": "For recently_played: Unix ms cursor (after this timestamp)"},
            "before": {"type": "integer", "description": "For recently_played: Unix ms cursor (before this timestamp)"},
        },
        "required": ["action"],
    },
}

# Parameter schema for the spotify_devices tool (see _handle_spotify_devices).
SPOTIFY_DEVICES_SCHEMA = {
    "name": "spotify_devices",
    "description": "List Spotify Connect devices or transfer playback to a different device.",
    "parameters": {
        "type": "object",
        "properties": {
            "action": {"type": "string", "enum": ["list", "transfer"]},
            "device_id": COMMON_STRING,
            "play": {"type": "boolean"},
        },
        "required": ["action"],
    },
}

# Parameter schema for the spotify_queue tool (see _handle_spotify_queue).
SPOTIFY_QUEUE_SCHEMA = {
    "name": "spotify_queue",
    "description": "Inspect the user's Spotify queue or add an item to it.",
    "parameters": {
        "type": "object",
        "properties": {
            "action": {"type": "string", "enum": ["get", "add"]},
            "uri": COMMON_STRING,
            "device_id": COMMON_STRING,
        },
        "required": ["action"],
    },
}

# Parameter schema for the spotify_search tool. Both "types" (array) and
# "type" (single string) are read by _handle_spotify_search.
# NOTE(review): the handler also accepts "audiobook", which the description
# below does not mention.
SPOTIFY_SEARCH_SCHEMA = {
    "name": "spotify_search",
    "description": "Search the Spotify catalog for tracks, albums, artists, playlists, shows, or episodes.",
    "parameters": {
        "type": "object",
        "properties": {
            "query": COMMON_STRING,
            "types": {"type": "array", "items": COMMON_STRING},
            "type": COMMON_STRING,
            "limit": {"type": "integer"},
            "offset": {"type": "integer"},
            "market": COMMON_STRING,
            "include_external": COMMON_STRING,
        },
        "required": ["query"],
    },
}

# Parameter schema for the spotify_playlists tool (see _handle_spotify_playlists).
SPOTIFY_PLAYLISTS_SCHEMA = {
    "name": "spotify_playlists",
    "description": "List, inspect, create, update, and modify Spotify playlists.",
    "parameters": {
        "type": "object",
        "properties": {
            "action": {"type": "string", "enum": ["list", "get", "create", "add_items", "remove_items", "update_details"]},
            "playlist_id": COMMON_STRING,
            "market": COMMON_STRING,
            "limit": {"type": "integer"},
            "offset": {"type": "integer"},
            "name": COMMON_STRING,
            "description": COMMON_STRING,
            "public": {"type": "boolean"},
            "collaborative": {"type": "boolean"},
            "uris": {"type": "array", "items": COMMON_STRING},
            "position": {"type": "integer"},
            "snapshot_id": COMMON_STRING,
        },
        "required": ["action"],
    },
}

# Parameter schema for the spotify_albums tool. "album_id" and "id" are
# alternatives; _handle_spotify_albums reads "album_id" first.
SPOTIFY_ALBUMS_SCHEMA = {
    "name": "spotify_albums",
    "description": "Fetch Spotify album metadata or album tracks.",
    "parameters": {
        "type": "object",
        "properties": {
            "action": {"type": "string", "enum": ["get", "tracks"]},
            "album_id": COMMON_STRING,
            "id": COMMON_STRING,
            "market": COMMON_STRING,
            "limit": {"type": "integer"},
            "offset": {"type": "integer"},
        },
        "required": ["action"],
    },
}

# Parameter schema for the spotify_library tool. _handle_spotify_library reads
# "uris" (or "items") for save and "ids" (or "items") for remove.
SPOTIFY_LIBRARY_SCHEMA = {
    "name": "spotify_library",
    "description": "List, save, or remove the user's saved Spotify tracks or albums. Use `kind` to select which.",
    "parameters": {
        "type": "object",
        "properties": {
            "kind": {"type": "string", "enum": ["tracks", "albums"], "description": "Which library to operate on"},
            "action": {"type": "string", "enum": ["list", "save", "remove"]},
            "limit": {"type": "integer"},
            "offset": {"type": "integer"},
            "market": COMMON_STRING,
            "uris": {"type": "array", "items": COMMON_STRING},
            "ids": {"type": "array", "items": COMMON_STRING},
            "items": {"type": "array", "items": COMMON_STRING},
        },
        "required": ["kind", "action"],
    },
}
|
||||
+73
-340
@@ -31,13 +31,11 @@ logger = logging.getLogger(__name__)
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import threading
|
||||
from types import SimpleNamespace
|
||||
import urllib.request
|
||||
import uuid
|
||||
from typing import List, Dict, Any, Optional
|
||||
from openai import OpenAI
|
||||
@@ -183,25 +181,6 @@ def _get_proxy_from_env() -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    """Return the env-configured proxy, or ``None`` when this host is bypassed.

    The proxy from ``_get_proxy_from_env`` is returned unchanged when there is
    no proxy, no ``base_url``, or no hostname can be extracted from it.
    Otherwise ``urllib.request.proxy_bypass_environment`` decides whether the
    host is excluded; if that check raises, the proxy is kept.
    """
    proxy = _get_proxy_from_env()
    host = base_url_hostname(base_url) if (proxy and base_url) else None
    if not host:
        return proxy

    try:
        bypassed = urllib.request.proxy_bypass_environment(host)
    except Exception:
        # Best effort: a failing bypass lookup must not disable the proxy.
        bypassed = False
    return None if bypassed else proxy
|
||||
|
||||
|
||||
def _install_safe_stdio() -> None:
|
||||
"""Wrap stdout/stderr so best-effort console output cannot crash the agent."""
|
||||
for stream_name in ("stdout", "stderr"):
|
||||
@@ -685,40 +664,6 @@ def _sanitize_structure_non_ascii(payload: Any) -> bool:
|
||||
_QWEN_CODE_VERSION = "0.14.1"
|
||||
|
||||
|
||||
def _routermint_headers() -> dict:
    """Build the User-Agent header RouterMint needs to avoid Cloudflare 1010 blocks."""
    from hermes_cli import __version__ as hermes_version

    user_agent = f"HermesAgent/{hermes_version}"
    return {"User-Agent": user_agent}
|
||||
|
||||
|
||||
def _pool_may_recover_from_rate_limit(pool) -> bool:
|
||||
"""Decide whether to wait for credential-pool rotation instead of falling back.
|
||||
|
||||
The existing pool-rotation path requires the pool to (1) exist and (2) have
|
||||
at least one entry not currently in exhaustion cooldown. But rotation is
|
||||
only meaningful when the pool has more than one entry.
|
||||
|
||||
With a single-credential pool (common for Gemini OAuth, Vertex service
|
||||
accounts, and any "one personal key" configuration), the primary entry
|
||||
just 429'd and there is nothing to rotate to. Waiting for the pool
|
||||
cooldown to expire means retrying against the same exhausted quota — the
|
||||
daily-quota 429 will recur immediately, and the retry budget is burned.
|
||||
|
||||
In that case we must fall back to the configured ``fallback_model``
|
||||
instead. Returns True only when rotation has somewhere to go.
|
||||
|
||||
See issue #11314.
|
||||
"""
|
||||
if pool is None:
|
||||
return False
|
||||
if not pool.has_available():
|
||||
return False
|
||||
return len(pool.entries()) > 1
|
||||
|
||||
|
||||
def _qwen_portal_headers() -> dict:
|
||||
"""Return default HTTP headers required by Qwen Portal API."""
|
||||
import platform as _plat
|
||||
@@ -1062,21 +1007,8 @@ class AIAgent:
|
||||
self._use_prompt_caching, self._use_native_cache_layout = (
|
||||
self._anthropic_prompt_cache_policy()
|
||||
)
|
||||
# Anthropic supports "5m" (default) and "1h" cache TTL tiers. Read from
|
||||
# config.yaml under prompt_caching.cache_ttl; unknown values keep "5m".
|
||||
# 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long
|
||||
# sessions with >5-minute pauses between turns (#14971).
|
||||
self._cache_ttl = "5m"
|
||||
try:
|
||||
from hermes_cli.config import load_config as _load_pc_cfg
|
||||
|
||||
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
|
||||
_ttl = _pc_cfg.get("cache_ttl", "5m")
|
||||
if _ttl in ("5m", "1h"):
|
||||
self._cache_ttl = _ttl
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)
|
||||
|
||||
# Iteration budget: the LLM is only notified when it actually exhausts
|
||||
# the iteration budget (api_call_count >= max_iterations). At that
|
||||
# point we inject ONE message, allow one final API call, and if the
|
||||
@@ -1248,8 +1180,6 @@ class AIAgent:
|
||||
"X-OpenRouter-Title": "Hermes Agent",
|
||||
"X-OpenRouter-Categories": "productivity,cli-agent",
|
||||
}
|
||||
elif base_url_host_matches(effective_base, "api.routermint.com"):
|
||||
client_kwargs["default_headers"] = _routermint_headers()
|
||||
elif base_url_host_matches(effective_base, "api.githubcopilot.com"):
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
@@ -2119,14 +2049,12 @@ class AIAgent:
|
||||
# ("switched to anthropic, tui keeps trying openrouter").
|
||||
old_norm = (old_provider or "").strip().lower()
|
||||
new_norm = (new_provider or "").strip().lower()
|
||||
fallback_chain = list(getattr(self, "_fallback_chain", []) or [])
|
||||
if old_norm and new_norm and old_norm != new_norm:
|
||||
fallback_chain = [
|
||||
entry for entry in fallback_chain
|
||||
self._fallback_chain = [
|
||||
entry for entry in self._fallback_chain
|
||||
if (entry.get("provider") or "").strip().lower() not in {old_norm, new_norm}
|
||||
]
|
||||
self._fallback_chain = fallback_chain
|
||||
self._fallback_model = fallback_chain[0] if fallback_chain else None
|
||||
self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None
|
||||
|
||||
logging.info(
|
||||
"Model switched in-place: %s (%s) -> %s (%s)",
|
||||
@@ -2948,69 +2876,6 @@ class AIAgent:
|
||||
"If nothing stands out, just say 'Nothing to save.' and stop."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _summarize_background_review_actions(
|
||||
review_messages: List[Dict],
|
||||
prior_snapshot: List[Dict],
|
||||
) -> List[str]:
|
||||
"""Build the human-facing action summary for a background review pass.
|
||||
|
||||
Walks the review agent's session messages and collects "successful tool
|
||||
action" descriptions to surface to the user (e.g. "Memory updated").
|
||||
Tool messages already present in ``prior_snapshot`` are skipped so we
|
||||
don't re-surface stale results from the prior conversation that the
|
||||
review agent inherited via ``conversation_history`` (issue #14944).
|
||||
|
||||
Matching is by ``tool_call_id`` when available, with a content-equality
|
||||
fallback for tool messages that lack one.
|
||||
"""
|
||||
existing_tool_call_ids = set()
|
||||
existing_tool_contents = set()
|
||||
for prior in prior_snapshot or []:
|
||||
if not isinstance(prior, dict) or prior.get("role") != "tool":
|
||||
continue
|
||||
tcid = prior.get("tool_call_id")
|
||||
if tcid:
|
||||
existing_tool_call_ids.add(tcid)
|
||||
else:
|
||||
content = prior.get("content")
|
||||
if isinstance(content, str):
|
||||
existing_tool_contents.add(content)
|
||||
|
||||
actions: List[str] = []
|
||||
for msg in review_messages or []:
|
||||
if not isinstance(msg, dict) or msg.get("role") != "tool":
|
||||
continue
|
||||
tcid = msg.get("tool_call_id")
|
||||
if tcid and tcid in existing_tool_call_ids:
|
||||
continue
|
||||
if not tcid:
|
||||
content_str = msg.get("content")
|
||||
if isinstance(content_str, str) and content_str in existing_tool_contents:
|
||||
continue
|
||||
try:
|
||||
data = json.loads(msg.get("content", "{}"))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
continue
|
||||
if not isinstance(data, dict) or not data.get("success"):
|
||||
continue
|
||||
message = data.get("message", "")
|
||||
target = data.get("target", "")
|
||||
if "created" in message.lower():
|
||||
actions.append(message)
|
||||
elif "updated" in message.lower():
|
||||
actions.append(message)
|
||||
elif "added" in message.lower() or (target and "add" in message.lower()):
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
elif "Entry added" in message:
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
elif "removed" in message.lower() or "replaced" in message.lower():
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
return actions
|
||||
|
||||
def _spawn_background_review(
|
||||
self,
|
||||
messages_snapshot: List[Dict],
|
||||
@@ -3060,15 +2925,32 @@ class AIAgent:
|
||||
)
|
||||
|
||||
# Scan the review agent's messages for successful tool actions
|
||||
# and surface a compact summary to the user. Tool messages
|
||||
# already present in messages_snapshot must be skipped, since
|
||||
# the review agent inherits that history and would otherwise
|
||||
# re-surface stale "created"/"updated" messages from the prior
|
||||
# conversation as if they just happened (issue #14944).
|
||||
actions = self._summarize_background_review_actions(
|
||||
getattr(review_agent, "_session_messages", []),
|
||||
messages_snapshot,
|
||||
)
|
||||
# and surface a compact summary to the user.
|
||||
actions = []
|
||||
for msg in getattr(review_agent, "_session_messages", []):
|
||||
if not isinstance(msg, dict) or msg.get("role") != "tool":
|
||||
continue
|
||||
try:
|
||||
data = json.loads(msg.get("content", "{}"))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
continue
|
||||
if not data.get("success"):
|
||||
continue
|
||||
message = data.get("message", "")
|
||||
target = data.get("target", "")
|
||||
if "created" in message.lower():
|
||||
actions.append(message)
|
||||
elif "updated" in message.lower():
|
||||
actions.append(message)
|
||||
elif "added" in message.lower() or (target and "add" in message.lower()):
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
elif "Entry added" in message:
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
elif "removed" in message.lower() or "replaced" in message.lower():
|
||||
label = "Memory" if target == "memory" else "User profile" if target == "user" else target
|
||||
actions.append(f"{label} updated")
|
||||
|
||||
if actions:
|
||||
summary = " · ".join(dict.fromkeys(actions))
|
||||
@@ -4474,69 +4356,25 @@ class AIAgent:
|
||||
def _repair_tool_call(self, tool_name: str) -> str | None:
|
||||
"""Attempt to repair a mismatched tool name before aborting.
|
||||
|
||||
Models sometimes emit variants of a tool name that differ only
|
||||
in casing, separators, or class-like suffixes. Normalize
|
||||
aggressively before falling back to fuzzy match:
|
||||
|
||||
1. Lowercase direct match.
|
||||
2. Lowercase + hyphens/spaces -> underscores.
|
||||
3. CamelCase -> snake_case (TodoTool -> todo_tool).
|
||||
4. Strip trailing ``_tool`` / ``-tool`` / ``tool`` suffix that
|
||||
Claude-style models sometimes tack on (TodoTool_tool ->
|
||||
TodoTool -> Todo -> todo). Applied twice so double-tacked
|
||||
suffixes like ``TodoTool_tool`` reduce all the way.
|
||||
5. Fuzzy match (difflib, cutoff=0.7).
|
||||
|
||||
See #14784 for the original reports (TodoTool_tool, Patch_tool,
|
||||
BrowserClick_tool were all returning "Unknown tool" before).
|
||||
1. Try lowercase
|
||||
2. Try normalized (lowercase + hyphens/spaces -> underscores)
|
||||
3. Try fuzzy match (difflib, cutoff=0.7)
|
||||
|
||||
Returns the repaired name if found in valid_tool_names, else None.
|
||||
"""
|
||||
import re
|
||||
from difflib import get_close_matches
|
||||
|
||||
if not tool_name:
|
||||
return None
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
return s.lower().replace("-", "_").replace(" ", "_")
|
||||
|
||||
def _camel_snake(s: str) -> str:
|
||||
return re.sub(r"(?<!^)(?=[A-Z])", "_", s).lower()
|
||||
|
||||
def _strip_tool_suffix(s: str) -> str | None:
|
||||
lc = s.lower()
|
||||
for suffix in ("_tool", "-tool", "tool"):
|
||||
if lc.endswith(suffix):
|
||||
return s[: -len(suffix)].rstrip("_-")
|
||||
return None
|
||||
|
||||
# Cheap fast-paths first — these cover the common case.
|
||||
# 1. Lowercase
|
||||
lowered = tool_name.lower()
|
||||
if lowered in self.valid_tool_names:
|
||||
return lowered
|
||||
normalized = _norm(tool_name)
|
||||
|
||||
# 2. Normalize
|
||||
normalized = lowered.replace("-", "_").replace(" ", "_")
|
||||
if normalized in self.valid_tool_names:
|
||||
return normalized
|
||||
|
||||
# Build the full candidate set for class-like emissions.
|
||||
cands: set[str] = {tool_name, lowered, normalized, _camel_snake(tool_name)}
|
||||
# Strip trailing tool-suffix up to twice — TodoTool_tool needs it.
|
||||
for _ in range(2):
|
||||
extra: set[str] = set()
|
||||
for c in cands:
|
||||
stripped = _strip_tool_suffix(c)
|
||||
if stripped:
|
||||
extra.add(stripped)
|
||||
extra.add(_norm(stripped))
|
||||
extra.add(_camel_snake(stripped))
|
||||
cands |= extra
|
||||
|
||||
for c in cands:
|
||||
if c and c in self.valid_tool_names:
|
||||
return c
|
||||
|
||||
# Fuzzy match as last resort.
|
||||
# 3. Fuzzy match
|
||||
matches = get_close_matches(lowered, self.valid_tool_names, n=1, cutoff=0.7)
|
||||
if matches:
|
||||
return matches[0]
|
||||
@@ -4628,7 +4466,7 @@ class AIAgent:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _build_keepalive_http_client(base_url: str = "") -> Any:
|
||||
def _build_keepalive_http_client() -> Any:
|
||||
try:
|
||||
import httpx as _httpx
|
||||
import socket as _socket
|
||||
@@ -4642,9 +4480,8 @@ class AIAgent:
|
||||
_sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPALIVE, 30))
|
||||
# When a custom transport is provided, httpx won't auto-read proxy
|
||||
# from env vars (allow_env_proxies = trust_env and transport is None).
|
||||
# Explicitly read proxy settings while still honoring NO_PROXY for
|
||||
# loopback / local endpoints such as a locally hosted sub2api.
|
||||
_proxy = _get_proxy_for_base_url(base_url)
|
||||
# Explicitly read proxy settings to ensure HTTP_PROXY/HTTPS_PROXY work.
|
||||
_proxy = _get_proxy_from_env()
|
||||
return _httpx.Client(
|
||||
transport=_httpx.HTTPTransport(socket_options=_sock_opts),
|
||||
proxy=_proxy,
|
||||
@@ -4702,7 +4539,7 @@ class AIAgent:
|
||||
if k in {"api_key", "base_url", "default_headers", "timeout", "http_client"}
|
||||
}
|
||||
if "http_client" not in safe_kwargs:
|
||||
keepalive_http = self._build_keepalive_http_client(base_url)
|
||||
keepalive_http = self._build_keepalive_http_client()
|
||||
if keepalive_http is not None:
|
||||
safe_kwargs["http_client"] = keepalive_http
|
||||
client = GeminiNativeClient(**safe_kwargs)
|
||||
@@ -4731,7 +4568,7 @@ class AIAgent:
|
||||
# Tests in ``tests/run_agent/test_create_openai_client_reuse.py`` and
|
||||
# ``tests/run_agent/test_sequential_chats_live.py`` pin this invariant.
|
||||
if "http_client" not in client_kwargs:
|
||||
keepalive_http = self._build_keepalive_http_client(client_kwargs.get("base_url", ""))
|
||||
keepalive_http = self._build_keepalive_http_client()
|
||||
if keepalive_http is not None:
|
||||
client_kwargs["http_client"] = keepalive_http
|
||||
client = OpenAI(**client_kwargs)
|
||||
@@ -5207,41 +5044,6 @@ class AIAgent:
|
||||
|
||||
return True
|
||||
|
||||
def _try_refresh_copilot_client_credentials(self) -> bool:
    """Re-resolve the Copilot token and rebuild the shared OpenAI client.

    The resolved token is often the very same string as before
    (`gh auth token` returns a stable OAuth token in many setups), so the
    client is rebuilt unconditionally: a retry after a 401 then starts
    from fresh auth/client state without needing a session restart.

    Returns:
        True when a non-empty token was resolved and the primary client
        was replaced; False when the provider is not ``copilot``, token
        resolution raised, the token was empty or not a string, or the
        client replacement reported failure.
    """
    if self.provider != "copilot":
        return False

    try:
        from hermes_cli.copilot_auth import resolve_copilot_token

        fresh_token, origin = resolve_copilot_token()
    except Exception as err:
        # Best-effort refresh: any failure here just means "no refresh".
        logger.debug("Copilot credential refresh failed: %s", err)
        return False

    # Non-string and whitespace-only results are both treated as "no token".
    cleaned = fresh_token.strip() if isinstance(fresh_token, str) else ""
    if not cleaned:
        return False

    self.api_key = cleaned
    self._client_kwargs["api_key"] = self.api_key
    self._client_kwargs["base_url"] = self.base_url
    self._apply_client_headers_for_base_url(str(self.base_url or ""))

    swapped = self._replace_primary_openai_client(reason="copilot_credential_refresh")
    if swapped:
        logger.info("Copilot credentials refreshed from %s", origin)
        return True
    return False
|
||||
|
||||
def _try_refresh_anthropic_client_credentials(self) -> bool:
|
||||
if self.api_mode != "anthropic_messages" or not hasattr(self, "_anthropic_api_key"):
|
||||
return False
|
||||
@@ -5295,8 +5097,6 @@ class AIAgent:
|
||||
self._client_kwargs["default_headers"] = dict(_OR_HEADERS)
|
||||
elif base_url_host_matches(base_url, "ai-gateway.vercel.sh"):
|
||||
self._client_kwargs["default_headers"] = dict(_AI_GATEWAY_HEADERS)
|
||||
elif base_url_host_matches(base_url, "api.routermint.com"):
|
||||
self._client_kwargs["default_headers"] = _routermint_headers()
|
||||
elif base_url_host_matches(base_url, "api.githubcopilot.com"):
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
@@ -5375,7 +5175,7 @@ class AIAgent:
|
||||
effective_reason = FailoverReason.billing
|
||||
elif status_code == 429:
|
||||
effective_reason = FailoverReason.rate_limit
|
||||
elif status_code in (401, 403):
|
||||
elif status_code == 401:
|
||||
effective_reason = FailoverReason.auth
|
||||
|
||||
if effective_reason == FailoverReason.billing:
|
||||
@@ -6494,7 +6294,7 @@ class AIAgent:
|
||||
|
||||
# ── Provider fallback ──────────────────────────────────────────────────
|
||||
|
||||
def _try_activate_fallback(self, reason: "FailoverReason | None" = None) -> bool:
|
||||
def _try_activate_fallback(self) -> bool:
|
||||
"""Switch to the next fallback model/provider in the chain.
|
||||
|
||||
Called when the current model is failing after retries. Swaps the
|
||||
@@ -6506,15 +6306,6 @@ class AIAgent:
|
||||
auth resolution and client construction — no duplicated provider→key
|
||||
mappings.
|
||||
"""
|
||||
if reason in (FailoverReason.rate_limit, FailoverReason.billing):
|
||||
# Only start cooldown when leaving the primary provider. If we're
|
||||
# already on a fallback and chain-switching, the primary wasn't the
|
||||
# source of the 429 so the cooldown should not be reset/extended.
|
||||
fallback_already_active = bool(getattr(self, "_fallback_activated", False))
|
||||
current_provider = (getattr(self, "provider", "") or "").strip().lower()
|
||||
primary_provider = ((self._primary_runtime or {}).get("provider") or "").strip().lower()
|
||||
if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
|
||||
self._rate_limited_until = time.monotonic() + 60
|
||||
if self._fallback_index >= len(self._fallback_chain):
|
||||
return False
|
||||
|
||||
@@ -6651,15 +6442,11 @@ class AIAgent:
|
||||
# Without this, compression decisions use the primary model's
|
||||
# context window (e.g. 200K) instead of the fallback's (e.g. 32K),
|
||||
# causing oversized sessions to overflow the fallback.
|
||||
# Also pass _config_context_length so the explicit config override
|
||||
# (model.context_length in config.yaml) is respected — without this,
|
||||
# the fallback activation drops to 128K even when config says 204800.
|
||||
if hasattr(self, 'context_compressor') and self.context_compressor:
|
||||
from agent.model_metadata import get_model_context_length
|
||||
fb_context_length = get_model_context_length(
|
||||
self.model, base_url=self.base_url,
|
||||
api_key=self.api_key, provider=self.provider,
|
||||
config_context_length=getattr(self, "_config_context_length", None),
|
||||
)
|
||||
self.context_compressor.update_model(
|
||||
model=self.model,
|
||||
@@ -6698,9 +6485,6 @@ class AIAgent:
|
||||
if not self._fallback_activated:
|
||||
return False
|
||||
|
||||
if getattr(self, "_rate_limited_until", 0) > time.monotonic():
|
||||
return False # primary still in rate-limit cooldown, stay on fallback
|
||||
|
||||
rt = self._primary_runtime
|
||||
try:
|
||||
# ── Core runtime state ──
|
||||
@@ -7772,12 +7556,7 @@ class AIAgent:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic)
|
||||
except TypeError:
|
||||
# Plugin context engine with strict signature that doesn't accept
|
||||
# focus_topic — fall back to calling without it.
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens)
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic)
|
||||
|
||||
todo_snapshot = self._todo_store.format_for_injection()
|
||||
if todo_snapshot:
|
||||
@@ -9498,7 +9277,6 @@ class AIAgent:
|
||||
codex_auth_retry_attempted=False
|
||||
anthropic_auth_retry_attempted=False
|
||||
nous_auth_retry_attempted=False
|
||||
copilot_auth_retry_attempted=False
|
||||
thinking_sig_retry_attempted = False
|
||||
has_retried_429 = False
|
||||
restart_with_compressed_messages = False
|
||||
@@ -9656,47 +9434,28 @@ class AIAgent:
|
||||
response_invalid = True
|
||||
error_details.append("response is None")
|
||||
else:
|
||||
# Provider returned a terminal failure (e.g. quota exhaustion).
|
||||
# Treat as invalid so the fallback chain is triggered instead of
|
||||
# letting the error bubble up outside the retry/fallback loop.
|
||||
_codex_resp_status = str(getattr(response, "status", "") or "").strip().lower()
|
||||
if _codex_resp_status in {"failed", "cancelled"}:
|
||||
_codex_error_obj = getattr(response, "error", None)
|
||||
_codex_error_msg = (
|
||||
_codex_error_obj.get("message") if isinstance(_codex_error_obj, dict)
|
||||
else str(_codex_error_obj) if _codex_error_obj
|
||||
else f"Responses API returned status '{_codex_resp_status}'"
|
||||
# output_text fallback: stream backfill may have failed
|
||||
# but normalize can still recover from output_text
|
||||
_out_text = getattr(response, "output_text", None)
|
||||
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
|
||||
if _out_text_stripped:
|
||||
logger.debug(
|
||||
"Codex response.output is empty but output_text is present "
|
||||
"(%d chars); deferring to normalization.",
|
||||
len(_out_text_stripped),
|
||||
)
|
||||
logging.warning(
|
||||
"Codex response status='%s' (error=%s). Routing to fallback. %s",
|
||||
_codex_resp_status, _codex_error_msg,
|
||||
self._client_log_context(),
|
||||
else:
|
||||
_resp_status = getattr(response, "status", None)
|
||||
_resp_incomplete = getattr(response, "incomplete_details", None)
|
||||
logger.warning(
|
||||
"Codex response.output is empty after stream backfill "
|
||||
"(status=%s, incomplete_details=%s, model=%s). %s",
|
||||
_resp_status, _resp_incomplete,
|
||||
getattr(response, "model", None),
|
||||
f"api_mode={self.api_mode} provider={self.provider}",
|
||||
)
|
||||
response_invalid = True
|
||||
error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}")
|
||||
else:
|
||||
# output_text fallback: stream backfill may have failed
|
||||
# but normalize can still recover from output_text
|
||||
_out_text = getattr(response, "output_text", None)
|
||||
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
|
||||
if _out_text_stripped:
|
||||
logger.debug(
|
||||
"Codex response.output is empty but output_text is present "
|
||||
"(%d chars); deferring to normalization.",
|
||||
len(_out_text_stripped),
|
||||
)
|
||||
else:
|
||||
_resp_status = getattr(response, "status", None)
|
||||
_resp_incomplete = getattr(response, "incomplete_details", None)
|
||||
logger.warning(
|
||||
"Codex response.output is empty after stream backfill "
|
||||
"(status=%s, incomplete_details=%s, model=%s). %s",
|
||||
_resp_status, _resp_incomplete,
|
||||
getattr(response, "model", None),
|
||||
f"api_mode={self.api_mode} provider={self.provider}",
|
||||
)
|
||||
response_invalid = True
|
||||
error_details.append("response.output is empty")
|
||||
error_details.append("response.output is empty")
|
||||
elif self.api_mode == "anthropic_messages":
|
||||
_tv = self._get_transport()
|
||||
if not _tv.validate_response(response):
|
||||
@@ -10462,15 +10221,6 @@ class AIAgent:
|
||||
print(f"{self.log_prefix} • Check credits / billing: https://portal.nousresearch.com")
|
||||
print(f"{self.log_prefix} • Verify stored credentials: {_dhh}/auth.json")
|
||||
print(f"{self.log_prefix} • Switch providers temporarily: /model <model> --provider openrouter")
|
||||
if (
|
||||
self.provider == "copilot"
|
||||
and status_code == 401
|
||||
and not copilot_auth_retry_attempted
|
||||
):
|
||||
copilot_auth_retry_attempted = True
|
||||
if self._try_refresh_copilot_client_credentials():
|
||||
self._vprint(f"{self.log_prefix}🔐 Copilot credentials refreshed after 401. Retrying request...")
|
||||
continue
|
||||
if (
|
||||
self.api_mode == "anthropic_messages"
|
||||
and status_code == 401
|
||||
@@ -10670,14 +10420,14 @@ class AIAgent:
|
||||
)
|
||||
if is_rate_limited and self._fallback_index < len(self._fallback_chain):
|
||||
# Don't eagerly fallback if credential pool rotation may
|
||||
# still recover. See _pool_may_recover_from_rate_limit
|
||||
# for the single-credential-pool exception. Fixes #11314.
|
||||
pool_may_recover = _pool_may_recover_from_rate_limit(
|
||||
self._credential_pool
|
||||
)
|
||||
# still recover. The pool's retry-then-rotate cycle needs
|
||||
# at least one more attempt to fire — jumping to a fallback
|
||||
# provider here short-circuits it.
|
||||
pool = self._credential_pool
|
||||
pool_may_recover = pool is not None and pool.has_available()
|
||||
if not pool_may_recover:
|
||||
self._emit_status("⚠️ Rate limited — switching to fallback provider...")
|
||||
if self._try_activate_fallback(reason=classified.reason):
|
||||
if self._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
primary_recovery_attempted = False
|
||||
@@ -10930,26 +10680,9 @@ class AIAgent:
|
||||
# already accounts for 413, 429, 529 (transient), context
|
||||
# overflow, and generic-400 heuristics. Local validation
|
||||
# errors (ValueError, TypeError) are programming bugs.
|
||||
# Exclude UnicodeEncodeError — it's a ValueError subclass
|
||||
# but is handled separately by the surrogate sanitization
|
||||
# path above. Exclude json.JSONDecodeError — also a
|
||||
# ValueError subclass, but it indicates a transient
|
||||
# provider/network failure (malformed response body,
|
||||
# truncated stream, routing layer corruption), not a
|
||||
# local programming bug, and should be retried (#14782).
|
||||
is_local_validation_error = (
|
||||
isinstance(api_error, (ValueError, TypeError))
|
||||
and not isinstance(
|
||||
api_error, (UnicodeEncodeError, json.JSONDecodeError)
|
||||
)
|
||||
# ssl.SSLError (and its subclass SSLCertVerificationError)
|
||||
# inherits from OSError *and* ValueError via Python MRO,
|
||||
# so the isinstance(ValueError) check above would
|
||||
# misclassify a TLS transport failure as a local
|
||||
# programming bug and abort without retrying. Exclude
|
||||
# ssl.SSLError explicitly so the error classifier's
|
||||
# retryable=True mapping takes effect instead.
|
||||
and not isinstance(api_error, ssl.SSLError)
|
||||
and not isinstance(api_error, UnicodeEncodeError)
|
||||
)
|
||||
is_client_error = (
|
||||
is_local_validation_error
|
||||
|
||||
@@ -1,377 +0,0 @@
|
||||
# Compression Eval — Design
|
||||
|
||||
Status: proposal. Nothing under `scripts/compression_eval/` runs in CI.
|
||||
This is an offline tool authors run before merging prompt or algorithm
|
||||
changes to `agent/context_compressor.py`.
|
||||
|
||||
## Why
|
||||
|
||||
We tune the compressor prompt and the `_template_sections` checklist by
|
||||
hand, ship, and wait for the next real session to notice regressions.
|
||||
There is no automated check that a prompt edit still preserves file
|
||||
paths, error messages, or the active task across a compression.
|
||||
|
||||
Factory.ai's December 2025 write-up
|
||||
(https://factory.ai/news/evaluating-compression) describes a
|
||||
probe-based eval that scores compressed state on six dimensions. The
|
||||
methodology is the valuable part — the benchmarks in the post are a
|
||||
marketing piece. We adopt the methodology and discard the scoreboard.
|
||||
|
||||
## Goal
|
||||
|
||||
Given a real session transcript and a bank of probe questions that
|
||||
exercise what the transcript contained, answer:
|
||||
|
||||
1. After `ContextCompressor.compress()` runs, can the agent still
|
||||
answer each probe correctly from the compressed state?
|
||||
2. Which of the six dimensions (accuracy, context awareness, artifact
|
||||
trail, completeness, continuity, instruction following) is the
|
||||
prompt weakest on?
|
||||
3. Does a prompt change improve or regress any dimension vs. the
|
||||
previous run?
|
||||
|
||||
That is the full scope. No "compare against OpenAI and Anthropic"
|
||||
benchmarking, no public scoreboard, no marketing claims.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Not a pytest. Requires API credentials, costs money, takes minutes
|
||||
per fixture, and output is LLM-graded and non-deterministic.
|
||||
- Not part of `scripts/run_tests.sh`. Not invoked by CI.
|
||||
- Not a replacement for the existing compressor unit tests in
|
||||
`tests/agent/test_context_compressor.py` — those stay as the
|
||||
structural / boundary / tool-pair-sanitization guard.
|
||||
- Not a general trajectory eval. Scoped to context compaction only.
|
||||
|
||||
## Where it lives
|
||||
|
||||
```
|
||||
scripts/compression_eval/
|
||||
├── DESIGN.md # this file
|
||||
├── README.md # how to run, cost expectations, caveats
|
||||
├── run_eval.py # entry point (fire CLI, like sample_and_compress.py)
|
||||
├── scrub_fixtures.py # regenerate fixtures from ~/.hermes/sessions/*.jsonl
|
||||
├── fixtures/ # checked-in scrubbed session snapshots
|
||||
│ ├── feature-impl-context-priority.json
|
||||
│ ├── debug-session-feishu-id-model.json
|
||||
│ └── config-build-competitive-scouts.json
|
||||
├── probes/ # probe banks paired with fixtures
|
||||
│ └── <fixture>.probes.json
|
||||
├── rubric.py # grading prompt + dimension definitions
|
||||
├── grader.py # judge-model call + score parsing
|
||||
├── compressor_driver.py # thin wrapper over ContextCompressor
|
||||
└── results/ # gitignored; timestamped output per run
|
||||
└── .gitkeep
|
||||
```
|
||||
|
||||
`scripts/` is the right home: offline tooling, no CI involvement,
|
||||
precedent already set by `sample_and_compress.py`,
|
||||
`contributor_audit.py`, `discord-voice-doctor.py`.
|
||||
|
||||
`environments/` is for Atropos RL training environments — wrong shape.
|
||||
`tests/` is hermetic and credential-free — incompatible with a
|
||||
probe-based eval that needs a judge model.
|
||||
|
||||
## Fixture format
|
||||
|
||||
A fixture is a single conversation, long enough to be worth compressing, captured from a
|
||||
real session. Stored as JSON (pretty-printed, reviewable in PRs):
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "401-debug",
|
||||
"description": "178-turn session debugging a 401 on /api/auth/login",
|
||||
"model": "anthropic/claude-sonnet-4.6",
|
||||
"context_length": 200000,
|
||||
"messages": [
|
||||
{"role": "system", "content": "..."},
|
||||
{"role": "user", "content": "..."},
|
||||
{"role": "assistant", "content": "...", "tool_calls": [...]},
|
||||
{"role": "tool", "tool_call_id": "...", "content": "..."}
|
||||
],
|
||||
"notes": "Captured 2026-04-24 from session 20260424_*.jsonl; \
|
||||
PII scrubbed; secrets redacted via redact_sensitive_text."
|
||||
}
|
||||
```
|
||||
|
||||
### Sourcing fixtures
|
||||
|
||||
Fixtures are scrubbed snapshots of real sessions from the
|
||||
maintainer's `~/.hermes/sessions/*.jsonl` store, generated
|
||||
reproducibly by `scrub_fixtures.py` in this directory. Re-run the
|
||||
scrubber with `python3 scripts/compression_eval/scrub_fixtures.py`
|
||||
to regenerate them after a scrubber change.
|
||||
|
||||
Three shipped fixtures cover three different session shapes:
|
||||
|
||||
| Fixture | Source shape | Messages | Tokens (rough) | Tests |
|
||||
|---|---|---|---|---|
|
||||
| `feature-impl-context-priority` | investigate → patch → test → PR → merge | 75 | ~45k | continuation, artifact trail (2 files modified, 1 PR, ~16k skill_view in head) |
|
||||
| `debug-session-feishu-id-model` | PR triage + upstream docs + decision | 59 | ~28k | recall (PR #, error shape), decision (outcome + reason), large PR diff blocks |
|
||||
| `config-build-competitive-scouts` | iterative config: 11 cron jobs across 7 weekdays | 61 | ~26k | artifact trail (which jobs, which days), iterative-merge |
|
||||
|
||||
The `~26k-45k` token range is below the default 50%-of-200k
|
||||
compression threshold, so the eval will always **force** a
|
||||
`compress()` call rather than wait for the natural trigger. That is
|
||||
the intended shape — we want a controlled single-shot compression so
|
||||
score deltas are attributable to the prompt change, not to whether
|
||||
the threshold happened to fire at the same boundary twice.
|
||||
|
||||
### Scrubber pipeline
|
||||
|
||||
`scrub_fixtures.py` applies, per message:
|
||||
|
||||
1. `agent.redact.redact_sensitive_text` — API keys, tokens,
|
||||
connection strings
|
||||
2. Username paths: `/home/teknium` → `/home/user`
|
||||
3. Personal handles: all case variants of the maintainer name → `user`
|
||||
4. Email addresses → `contributor@example.com`; git
|
||||
`Author: Name <addr>` header lines normalised
|
||||
5. `<REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>` and
|
||||
`<think>...</think>` stripped from assistant content
|
||||
6. Messaging-platform user mentions (`<@123456>`, `<@***>`) →
|
||||
`<@user>`
|
||||
7. First user message paraphrased to remove personal voice;
|
||||
subsequent user turns kept verbatim after the redactions above
|
||||
8. System prompt replaced with a generic public-safe placeholder so
|
||||
we don't check in the maintainer's tuned soul/skills/memory system
|
||||
block
|
||||
9. Orphan empty-assistant messages (artifact of scratchpad-only
|
||||
turns) and trailing tool messages with no matching assistant are
|
||||
dropped
|
||||
10. Tool outputs preserved verbatim. An earlier iteration truncated
|
||||
tool bodies over 2KB to keep fixture JSON small, but that defeats
|
||||
the purpose: real sessions have 30KB `skill_view` dumps, 10KB
|
||||
`read_file` outputs, 5KB `web_extract` bodies — compression has
|
||||
to handle them. Truncation is now a no-op; the pipeline note
|
||||
remains in `scrubbing_passes` for audit trail clarity.
|
||||
|
||||
Before every fixture PR: grep the fixture for PII patterns. An
|
||||
audit is embedded at the bottom of the scrubber as comments.
|
||||
|
||||
**Fixtures must stay small.** Target <200 KB per fixture, <500 KB
|
||||
total for the directory. Current total: ~410 KB across three
|
||||
fixtures. Larger sessions are truncated with a
|
||||
`truncated_to: <index>` field in the fixture header so the cut is
|
||||
reviewable.
|
||||
|
||||
## Probe format
|
||||
|
||||
One probe file per fixture, so reviewers can see the question bank
|
||||
evolve alongside the fixture:
|
||||
|
||||
```json
|
||||
{
|
||||
"fixture": "401-debug",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-error-code",
|
||||
"type": "recall",
|
||||
"question": "What was the original error code and endpoint?",
|
||||
"expected_facts": ["401", "/api/auth/login"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-files-modified",
|
||||
"type": "artifact",
|
||||
"question": "Which files have been modified in this session?",
|
||||
"expected_facts": ["session_store.py", "redis_client.py"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-step",
|
||||
"type": "continuation",
|
||||
"question": "What should we do next?",
|
||||
"expected_facts": ["re-run the integration tests", "restart the worker"]
|
||||
},
|
||||
{
|
||||
"id": "decision-redis-approach",
|
||||
"type": "decision",
|
||||
"question": "What did we decide about the Redis issue?",
|
||||
"expected_facts": ["switch to redis-py 5.x", "pooled connection"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The four probe types come directly from Factory's methodology:
|
||||
**recall, artifact, continuation, decision**. `expected_facts` gives
|
||||
the grader concrete anchors instead of relying purely on LLM taste.
|
||||
|
||||
Authoring a probe bank is a one-time cost per fixture. 8-12 probes per
|
||||
fixture is the target — enough to cover all four types, few enough to
|
||||
grade in under a minute at reasonable cost.
|
||||
|
||||
## Grading
|
||||
|
||||
Each probe gets scored 0-5 on **six dimensions** (Factory's six):
|
||||
|
||||
| Dimension | What it measures |
|
||||
|-----------------------|-----------------------------------------------------|
|
||||
| accuracy | File paths, function names, error codes are correct |
|
||||
| context_awareness | Reflects current state, not a mid-session snapshot |
|
||||
| artifact_trail | Knows which files were read / modified / created |
|
||||
| completeness | Addresses all parts of the probe |
|
||||
| continuity | Agent can continue without re-fetching |
|
||||
| instruction_following | Probe answered in the requested form |
|
||||
|
||||
Grading is done by a single judge-model call per probe with a
|
||||
deterministic rubric prompt (see `rubric.py`). The rubric includes the
|
||||
`expected_facts` list so the judge has a concrete anchor. Default
|
||||
judge model: whatever the user has configured as their main model at
|
||||
run time (same resolution path as `auxiliary_client.call_llm`). A
|
||||
`--judge-model` flag allows overriding for consistency across runs.
|
||||
|
||||
Non-determinism caveat: two runs of the same fixture will produce
|
||||
different scores. A single run means nothing. Report medians over
|
||||
N=3 runs by default, and require an improvement of >=0.3 on any
|
||||
dimension before claiming a prompt change is a win.
|
||||
|
||||
## Run flow
|
||||
|
||||
```
|
||||
python scripts/compression_eval/run_eval.py [OPTIONS]
|
||||
```
|
||||
|
||||
Options (fire-style, mirroring `sample_and_compress.py`):
|
||||
|
||||
| Flag | Default | Purpose |
|
||||
|------------------------|------------|-------------------------------------------|
|
||||
| `--fixtures` | all | Comma-separated fixture names |
|
||||
| `--runs` | 3 | Runs per fixture (for median) |
|
||||
| `--judge-model` | auto | Override judge model |
|
||||
| `--compressor-model` | auto | Override model used *inside* the compressor |
|
||||
| `--label` | timestamp | Subdirectory under `results/` |
|
||||
| `--focus-topic` | none | Pass-through to `compress(focus_topic=)` |
|
||||
| `--compare-to` | none | Path to a previous run for diff output |
|
||||
|
||||
Steps per fixture per run:
|
||||
|
||||
1. Load fixture JSON and probe bank.
|
||||
2. Construct a `ContextCompressor` against the fixture's model.
|
||||
3. Call `compressor.compress(messages)` — capture the compressed
|
||||
message list.
|
||||
4. For each probe: ask the judge model to role-play as the continuing
|
||||
agent with only the compressed state, then grade the answer on the
|
||||
six dimensions using `rubric.py`.
|
||||
5. Write a per-run JSON to `results/<label>/<fixture>-run-N.json`.
|
||||
6. After all runs, emit a markdown summary to
|
||||
`results/<label>/report.md`.
|
||||
|
||||
## Report format
|
||||
|
||||
Pasted verbatim into PR descriptions that touch the compressor:
|
||||
|
||||
```
|
||||
## Compression eval — label 2026-04-25_13-40-02
|
||||
|
||||
Main model: anthropic/claude-sonnet-4.6 Judge: same
|
||||
3 runs per fixture, medians reported.
|
||||
|
||||
| Fixture | Accuracy | Context | Artifact | Complete | Continuity | Instruction | Overall |
|
||||
|----------------|----------|---------|----------|----------|------------|-------------|---------|
|
||||
| 401-debug | 4.1 | 4.0 | 2.5 | 4.3 | 3.8 | 5.0 | 3.95 |
|
||||
| pr-review | 3.9 | 3.8 | 3.1 | 4.2 | 3.9 | 5.0 | 3.98 |
|
||||
| feature-impl | 4.0 | 3.9 | 2.9 | 4.1 | 4.0 | 5.0 | 3.98 |
|
||||
|
||||
Per-probe misses (score < 3.0):
|
||||
- 401-debug / artifact-files-modified: 1.7 — summary dropped redis_client.py
|
||||
- pr-review / decision-auth-rewrite: 2.3 — outcome captured, reasoning dropped
|
||||
```
|
||||
|
||||
## Cost expectations
|
||||
|
||||
Dominated by the judge calls: 3 fixtures × 10 probes × 3 runs =
|
||||
90 judge calls per eval run. On Claude Sonnet 4.6 that is roughly
|
||||
$0.50-$1.50 per full eval depending on probe length. The compressor
|
||||
itself makes 1 call per fixture × 3 runs = 9 additional calls.
|
||||
|
||||
**This is not a check to run after every commit.** It is a
|
||||
before-merge check for PRs that touch:
|
||||
|
||||
- `agent/context_compressor.py` — any change to `_template_sections`,
|
||||
`_generate_summary`, or `compress()`.
|
||||
- `agent/auxiliary_client.py` — when changing how compression tasks
|
||||
are routed.
|
||||
- `agent/prompt_builder.py` — when the compression-note phrasing
|
||||
changes.
|
||||
|
||||
## Open questions (to resolve before implementing)
|
||||
|
||||
1. **Fixture scrubbing: manual or scripted?** A scripted scrub that
|
||||
also replaces project names / hostnames would lower the cost of
|
||||
contributing a new fixture. Risk: over-aggressive replacement
|
||||
destroys the signal the probe depends on. Propose: start manual,
|
||||
add scripted helpers once we have 3 fixtures and know the common
|
||||
PII shapes.
|
||||
|
||||
2. **Judge model selection.** Factory uses GPT-5.2. We can't pin one
|
||||
— user's main model changes. Options: (a) grade with main model
|
||||
(cheap, inconsistent across users), (b) require a specific judge
|
||||
model (e.g. `claude-sonnet-4.6`), which shuts out users without
|
||||
access. Propose (a) with a `--judge-model` override, and make the
|
||||
model name prominent in the report so comparisons across machines
|
||||
are legible.
|
||||
|
||||
3. **Noise floor.** Before landing prompt changes, run the current
|
||||
prompt N=10 times to measure per-dimension stddev. That tells us
|
||||
the minimum delta to call a change significant. Suspect 0.2-0.3 on
|
||||
a 0-5 scale. Decision deferred until after the first fixture is
|
||||
landed.
|
||||
|
||||
4. **Iterative-merge coverage.** The real Factory-vs-Anthropic
|
||||
difference is incremental merge vs. regenerate. A fixture that
|
||||
only compresses once doesn't exercise our iterative path. Add a
|
||||
fourth fixture that forces two compressions (manually chained),
|
||||
with probes that test whether information from the first
|
||||
compression survives the second. Deferred to a follow-up PR.
|
||||
|
||||
## Implementation status
|
||||
|
||||
This PR ships the full eval end-to-end:
|
||||
|
||||
- `scrub_fixtures.py` — reproducible scrubber
|
||||
- `fixtures/` — three scrubbed session fixtures
|
||||
- `probes/` — three probe banks (10-11 probes each, all four types)
|
||||
- `rubric.py` — six-dimension grading rubric + judge-prompt builder + response parser
|
||||
- `compressor_driver.py` — thin wrapper around `ContextCompressor` for forced single-shot compression
|
||||
- `grader.py` — two-phase continuation + grading calls via OpenAI SDK
|
||||
- `report.py` — markdown report renderer + `--compare-to` delta mode + per-run JSON dumper
|
||||
- `run_eval.py` — entry point (`fire`-style CLI)
|
||||
- `tests/scripts/test_compression_eval.py` — 33 unit tests covering rubric parsing, report rendering, fixture/probe loading, and a PII smoke test on the fixtures (LLM paths not tested — they require credentials and are exercised by the eval itself)
|
||||
|
||||
### Noise floor — one empirical data point
|
||||
|
||||
A single same-inputs re-run of `debug-session-feishu-id-model`
|
||||
(compressor + judge = `openai/gpt-5.4-mini` via Nous Portal,
|
||||
runs=1) produced:
|
||||
|
||||
- Run A overall: 3.25
|
||||
- Run B overall: 3.17 (delta -0.08)
|
||||
|
||||
Individual dimensions varied by up to ±0.5 between the two runs on
|
||||
single-run medians. This confirms DESIGN.md's "< 0.3 is noise"
|
||||
guidance is the right order of magnitude for a single-run
|
||||
comparison. With `runs=3` default, per-dimension variance should
|
||||
tighten; noise-floor measurement at N=10 is still a useful
|
||||
follow-up to calibrate precisely.
|
||||
|
||||
## Open follow-ups (not blocking this PR)
|
||||
|
||||
1. **Iterative-merge fixture** — our actual compression win over
|
||||
"regenerate from scratch" approaches is only exercised when
|
||||
`_previous_summary` is re-used on a second compression. None of
|
||||
the three shipped fixtures force two compressions. The natural
|
||||
basis is `config-build-competitive-scouts` (already iterative by
|
||||
shape); splitting it at the Monday/Tuesday boundary would force
|
||||
the second compression to merge rather than regenerate.
|
||||
2. **Noise-floor precision** — run the current prompt N=10 times
|
||||
against one fixture to pin down per-dimension stddev and publish
|
||||
the numbers in README.
|
||||
3. **Scripted scrubber helpers** — the current scrubber is manual
|
||||
per-fixture. A helper that identifies candidate sessions to
|
||||
scrub (by shape or by keyword) would lower the cost of adding
|
||||
fixture #4+.
|
||||
4. **Judge model selection policy** — current code uses whatever
|
||||
the user passes as `--judge-model` (default: same as compressor).
|
||||
Pinning the judge across users would stabilise cross-machine
|
||||
comparisons, at the cost of blocking users without access to
|
||||
the pinned model.
|
||||
@@ -1,110 +0,0 @@
|
||||
# compression_eval
|
||||
|
||||
Offline eval harness for `agent/context_compressor.py`. Runs a real
|
||||
conversation transcript through the compressor, then probes the
|
||||
compressed state with targeted questions graded on six dimensions.
|
||||
|
||||
## When to run
|
||||
|
||||
Before merging changes to:
|
||||
|
||||
- `agent/context_compressor.py` — any change to `_template_sections`,
|
||||
`_generate_summary`, `compress()`, or its boundary logic
|
||||
- `agent/auxiliary_client.py` — when changing how compression tasks
|
||||
are routed
|
||||
- `agent/prompt_builder.py` — when the compression-note phrasing
|
||||
changes
|
||||
|
||||
## Not for CI
|
||||
|
||||
This harness makes real model calls (compressor + continuation +
|
||||
judge = ~3 calls per probe × probes per fixture × runs). It costs ~$0.50
|
||||
to ~$1.50 per full run depending on models, takes minutes, is
|
||||
LLM-graded (non-deterministic). It lives in `scripts/` and is
|
||||
invoked by hand. `tests/` and `scripts/run_tests.sh` do not touch it.
|
||||
|
||||
`tests/scripts/test_compression_eval.py` covers the non-LLM code
|
||||
paths (rubric parsing, report rendering, fixture/probe loading, PII
|
||||
smoke check on the checked-in fixtures) and DOES run in CI.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run all three fixtures, 3 runs each, with your configured provider
|
||||
python3 scripts/compression_eval/run_eval.py
|
||||
|
||||
# Faster iteration — one fixture, one run
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--fixtures=debug-session-feishu-id-model --runs=1
|
||||
|
||||
# Pin a cheap model for both compression + judge (recommended)
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--compressor-provider=nous --compressor-model=openai/gpt-5.4-mini \
|
||||
--judge-provider=nous --judge-model=openai/gpt-5.4-mini \
|
||||
--runs=3 --label=baseline
|
||||
|
||||
# After editing context_compressor.py, rerun with a new label and diff
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--compressor-provider=nous --compressor-model=openai/gpt-5.4-mini \
|
||||
--judge-provider=nous --judge-model=openai/gpt-5.4-mini \
|
||||
--runs=3 --label=my-prompt-tweak \
|
||||
--compare-to=results/baseline
|
||||
```
|
||||
|
||||
Results land in `results/<label>/report.md` and are intended to be
|
||||
pasted verbatim into PR descriptions. `--compare-to` renders a delta
|
||||
column per dimension so reviewers can see "did this actually help?"
|
||||
at a glance.
|
||||
|
||||
Rule of thumb: dimension deltas below ±0.3 are within run-to-run
|
||||
noise on `runs=3`. Publish a bigger N if you want tighter bounds.
|
||||
|
||||
## Fixtures
|
||||
|
||||
Three scrubbed session snapshots live under `fixtures/`:
|
||||
|
||||
- `feature-impl-context-priority.json` — 75 msgs, investigate →
|
||||
patch → test → PR → merge
|
||||
- `debug-session-feishu-id-model.json` — 59 msgs, PR triage +
|
||||
upstream docs + decision
|
||||
- `config-build-competitive-scouts.json` — 61 msgs, iterative
|
||||
config accumulation (11 cron jobs)
|
||||
|
||||
Regenerate them from the maintainer's `~/.hermes/sessions/*.jsonl`
|
||||
with `python3 scripts/compression_eval/scrub_fixtures.py`. The
|
||||
scrubber pipeline and PII-audit checklist are documented in
|
||||
`DESIGN.md` under **Scrubber pipeline**.
|
||||
|
||||
## Probes
|
||||
|
||||
One probe bank per fixture under `probes/`, 10-11 probes each,
|
||||
covering all four types: **recall**, **artifact**, **continuation**,
|
||||
**decision**. Each probe carries an `expected_facts` list of concrete
|
||||
anchors (PR numbers, file paths, error codes, commands run) that the
|
||||
judge sees alongside the assistant's answer.
|
||||
|
||||
## How it scores
|
||||
|
||||
Six dimensions, 0-5 per probe:
|
||||
|
||||
| Dimension | What it measures |
|
||||
|-----------------------|------------------------------------------------------|
|
||||
| accuracy | File paths, function names, PR/issue numbers correct |
|
||||
| context_awareness | Reflects current session state, not a snapshot |
|
||||
| artifact_trail | Correctly enumerates files / commands / PRs |
|
||||
| completeness | Addresses ALL parts of the probe |
|
||||
| continuity | Next assistant could continue without re-fetching |
|
||||
| instruction_following | Answer in the requested form |
|
||||
|
||||
The report renders medians across N runs; probes scoring below 3.0
|
||||
overall surface in a separate section with the judge's specific
|
||||
complaint noted inline.
|
||||
|
||||
## Related
|
||||
|
||||
- `agent/context_compressor.py` — the thing under test
|
||||
- `tests/agent/test_context_compressor.py` — structural unit tests
|
||||
that do run in CI
|
||||
- `scripts/sample_and_compress.py` — the closest existing script in
|
||||
shape (offline, credential-requiring, not in CI)
|
||||
- `DESIGN.md` — full architecture + methodology + open follow-ups
|
||||
@@ -1,114 +0,0 @@
|
||||
"""Wraps ContextCompressor to run a single forced compression on a fixture.
|
||||
|
||||
The real agent loop checks ``should_compress()`` before calling ``compress()``.
|
||||
Fixtures are intentionally sized below the 100k threshold so ``compress()``
|
||||
runs in a controlled, single-shot mode — score deltas attribute to the
|
||||
prompt change, not to whether the threshold happened to fire at the same
|
||||
boundary twice.
|
||||
|
||||
Resolves the provider for the compression call via the same path the real
|
||||
agent uses (``hermes_cli.runtime_provider.resolve_runtime_provider``) so
|
||||
behaviour matches production aside from being a single call.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Make sibling imports work whether invoked as a script or as a module.
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
|
||||
from agent.context_compressor import ( # noqa: E402
|
||||
ContextCompressor,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
|
||||
|
||||
def run_compression(
    *,
    messages: List[Dict[str, Any]],
    compressor_model: str,
    compressor_provider: str,
    compressor_base_url: str,
    compressor_api_key: str,
    compressor_api_mode: str,
    context_length: int,
    focus_topic: Optional[str] = None,
    summary_model_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Compress the fixture messages exactly once and report size metrics.

    A fresh ``ContextCompressor`` is built from the supplied model/provider
    settings and ``compress()`` is called directly on ``messages``.

    Returns:
        A dict with these keys:

        - ``compressed_messages``: message list returned by ``compress()``
        - ``summary_text``: summary block located in the compressed list
          ("" when no summary marker is found)
        - ``pre_tokens`` / ``post_tokens``: rough token estimates before
          and after compression
        - ``compression_ratio``: ``1 - post/pre`` (0.0 when ``pre_tokens``
          is not positive)
        - ``pre_message_count`` / ``post_message_count``: list lengths
    """
    engine = ContextCompressor(
        model=compressor_model,
        threshold_percent=0.50,
        protect_first_n=3,
        protect_last_n=20,
        summary_target_ratio=0.20,
        quiet_mode=True,
        # A missing override is passed on as an empty string.
        summary_model_override=summary_model_override or "",
        base_url=compressor_base_url,
        api_key=compressor_api_key,
        config_context_length=context_length,
        provider=compressor_provider,
        api_mode=compressor_api_mode,
    )

    tokens_before = estimate_messages_tokens_rough(messages)
    compacted = engine.compress(
        messages,
        current_tokens=tokens_before,
        focus_topic=focus_topic,
    )
    tokens_after = estimate_messages_tokens_rough(compacted)

    summary = _extract_summary_from_messages(compacted)

    # Guard the division so an empty fixture reports a 0.0 ratio.
    if tokens_before > 0:
        saved_fraction = 1.0 - (tokens_after / tokens_before)
    else:
        saved_fraction = 0.0

    return {
        "compressed_messages": compacted,
        "summary_text": summary,
        "pre_tokens": tokens_before,
        "post_tokens": tokens_after,
        "compression_ratio": saved_fraction,
        "pre_message_count": len(messages),
        "post_message_count": len(compacted),
    }
|
||||
|
||||
|
||||
# Section headers whose presence identifies the structured summary message.
_SUMMARY_MARKERS = (
    "## Active Task",
    "## Goal",
    "## Completed Actions",
)


def _extract_summary_from_messages(messages: List[Dict[str, Any]]) -> str:
    """Return the content of the first message that holds the summary block.

    Messages are scanned in order. String content is checked as-is; list
    content is flattened by joining the ``text`` field of each dict part
    with newlines. Any other content type is skipped. The first content
    containing one of ``_SUMMARY_MARKERS`` is returned; if none matches the
    result is an empty string.
    """
    for message in messages:
        body = message.get("content")
        if isinstance(body, list):
            # Multi-part content: keep only the textual parts.
            body = "\n".join(
                part.get("text", "") for part in body if isinstance(part, dict)
            )
        elif not isinstance(body, str):
            continue
        for marker in _SUMMARY_MARKERS:
            if marker in body:
                return body
    return ""
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,181 +0,0 @@
|
||||
"""Two-phase probe grading.
|
||||
|
||||
Phase 1 — **Continuation**: simulate the next assistant turn. Feed the
|
||||
compressed message list plus the probe question and ask the continuing
|
||||
model to answer using only the compressed context. This is exactly what
|
||||
a real next-turn call would look like.
|
||||
|
||||
Phase 2 — **Grading**: a separate judge-model call scores the answer on
|
||||
the six rubric dimensions using ``rubric.build_judge_prompt``.
|
||||
|
||||
Both phases use the OpenAI SDK directly against the resolved provider
|
||||
endpoint, so the explicit api_key + base_url we pass always reaches the
|
||||
wire. (``agent.auxiliary_client.call_llm`` is designed for task-tagged
|
||||
auxiliary calls backed by config lookups; for eval we need the explicit
|
||||
credentials to win unconditionally.)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
|
||||
from openai import OpenAI # noqa: E402
|
||||
|
||||
from rubric import build_judge_prompt, parse_judge_response # noqa: E402
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System prompt that frames the model as the assistant picking up the
# session after compaction; it replaces any system message in the fixture.
_CONTINUATION_SYSTEM = (
    "You are the continuing assistant in a long session. Earlier turns have "
    "been compacted into a handoff summary that is now part of the "
    "conversation history. The user has just asked you a question. "
    "Answer using ONLY what you can determine from the conversation history "
    "you see (including the handoff summary). Do NOT invent details. If the "
    "summary does not contain a specific fact, say so explicitly rather "
    "than guessing. Be direct and concrete — cite file paths, PR numbers, "
    "error codes, and exact values when they are present in the summary."
)


def answer_probe(
    *,
    compressed_messages: List[Dict[str, Any]],
    probe_question: str,
    model: str,
    provider: str,
    base_url: str,
    api_key: str,
    max_tokens: int = 1024,
    timeout: Optional[float] = 120.0,
) -> str:
    """Ask the continuation model to answer a probe from compressed context.

    The request is ``[continuation system prompt, *sanitized history,
    probe question as a user turn]``. System messages already present in
    ``compressed_messages`` are dropped first, because the continuation
    prompt takes their place.

    Returns:
        The stripped answer text; "" when the response carries no content.
    """
    non_system = [
        msg for msg in compressed_messages if msg.get("role") != "system"
    ]

    conversation: List[Dict[str, Any]] = [
        {"role": "system", "content": _CONTINUATION_SYSTEM}
    ]
    conversation.extend(_sanitize_for_chat_api(non_system))
    conversation.append({"role": "user", "content": probe_question})

    completion = OpenAI(
        api_key=api_key, base_url=base_url, timeout=timeout
    ).chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
    )

    answer = completion.choices[0].message.content
    if answer is None:
        return ""
    if not isinstance(answer, str):
        answer = str(answer)
    return answer.strip()
|
||||
|
||||
|
||||
def grade_probe(
    *,
    probe_question: str,
    probe_type: str,
    expected_facts: List[str],
    assistant_answer: str,
    judge_model: str,
    judge_provider: str,
    judge_base_url: str,
    judge_api_key: str,
    max_tokens: int = 512,
    timeout: Optional[float] = 120.0,
) -> Dict[str, Any]:
    """Have the judge model score one probe answer and parse its verdict.

    The judge prompt comes from ``build_judge_prompt`` and is sent as a
    single user message. The reply is handed to ``parse_judge_response``.

    Returns:
        The parsed verdict with two extra keys: ``raw`` (the judge's reply
        text) and ``parse_error`` (``None`` on success). When parsing
        raises ``ValueError`` the result is a fallback dict with every
        dimension scored 0, empty ``notes``, ``overall`` 0.0, the raw text,
        and the error message; the caller decides whether to retry.
    """
    judge_prompt = build_judge_prompt(
        probe_question=probe_question,
        probe_type=probe_type,
        expected_facts=expected_facts,
        assistant_answer=assistant_answer,
    )

    judge_client = OpenAI(
        api_key=judge_api_key, base_url=judge_base_url, timeout=timeout
    )
    completion = judge_client.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=max_tokens,
    )

    # Normalise the reply to a string before parsing.
    reply = completion.choices[0].message.content or ""
    if not isinstance(reply, str):
        reply = str(reply)

    try:
        verdict = parse_judge_response(reply)
        verdict["raw"] = reply
        verdict["parse_error"] = None
        return verdict
    except ValueError as exc:
        logger.warning("Judge response parse failed: %s | raw=%r", exc, reply[:200])
        from rubric import DIMENSIONS

        return {
            "scores": dict.fromkeys(DIMENSIONS, 0),
            "notes": "",
            "overall": 0.0,
            "raw": reply,
            "parse_error": str(exc),
        }
|
||||
|
||||
|
||||
def _sanitize_for_chat_api(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Reduce a compressed history to plain role/content chat turns.

    A compressed message list may contain tool_call references whose
    matching ``tool`` result was summarized away, which strict-validator
    providers reject. For the eval the simplest correct behaviour is to
    drop the tool bookkeeping entirely:

    - ``tool`` messages become ``user`` notes prefixed with
      ``[earlier tool result]`` (list content is flattened to its text
      parts).
    - Every other message keeps only ``role`` and ``content``;
      ``tool_calls`` and any other keys are not copied.
    - ``None`` content is replaced by ``""``. An assistant turn that only
      carried ``tool_calls`` has ``content: None``, and once the tool calls
      are stripped a null content is no longer a valid chat message.
    - Consecutive turns with the same role are merged into one, separated
      by a blank line. Empty turns are absorbed without adding separators.

    The input list and its dicts are not modified.

    Returns:
        A new list of ``{"role", "content"}`` dicts.
    """
    clean: List[Dict[str, Any]] = []
    for m in messages:
        role = m.get("role")
        content = m.get("content")
        if role == "tool":
            # Convert the tool result to a plain user note so the
            # continuation model still sees the content without needing
            # the structured tool_call_id pairing.
            if isinstance(content, list):
                content = "\n".join(
                    p.get("text", "") for p in content if isinstance(p, dict)
                )
            clean.append({
                "role": "user",
                "content": f"[earlier tool result]\n{content or ''}",
            })
            continue
        # tool_calls are intentionally not copied; None content is coerced
        # to "" because it is only valid alongside tool_calls.
        clean.append({"role": role, "content": "" if content is None else content})

    # Collapse consecutive same-role turns into one (alternation rule).
    merged: List[Dict[str, Any]] = []
    for m in clean:
        if merged and merged[-1]["role"] == m["role"]:
            prev = merged[-1]
            prev_c = prev.get("content") or ""
            new_c = m.get("content") or ""
            if not new_c:
                # Nothing to add; avoid leaving a dangling separator.
                continue
            prev["content"] = f"{prev_c}\n\n{new_c}" if prev_c else new_c
        else:
            merged.append(m)
    return merged
|
||||
@@ -1,96 +0,0 @@
|
||||
{
|
||||
"fixture": "config-build-competitive-scouts",
|
||||
"description": "Probes for the competitive-scout cron-job setup session. Anchors are which agents were configured, which day of the week each runs, and the full final schedule. This fixture most directly tests artifact-trail and iterative-merge because the job list grows by one per user turn.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-first-repo",
|
||||
"type": "recall",
|
||||
"question": "What was the first repository the user asked to create a scout cron for, and on what day of the week?",
|
||||
"expected_facts": ["openclaw", "Sunday"]
|
||||
},
|
||||
{
|
||||
"id": "recall-closed-source-target",
|
||||
"type": "recall",
|
||||
"question": "One of the scout targets does not have an open-source repository and had to be configured as a web scan instead. Which one, and on what day?",
|
||||
"expected_facts": ["claude code", "Friday", "web scan"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-all-jobs",
|
||||
"type": "artifact",
|
||||
"question": "List every scout cron job created in this session.",
|
||||
"expected_facts": [
|
||||
"openclaw-pr-scout",
|
||||
"nanoclaw-pr-scout",
|
||||
"ironclaw-pr-scout",
|
||||
"kilocode-pr-scout",
|
||||
"codex-pr-scout",
|
||||
"gemini-cli-pr-scout",
|
||||
"cline-pr-scout",
|
||||
"opencode-pr-scout",
|
||||
"claude-code-scout",
|
||||
"aider-pr-scout",
|
||||
"roocode-pr-scout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-final-schedule",
|
||||
"type": "artifact",
|
||||
"question": "What is the final weekly schedule? Give the day and the agents scanned on each day.",
|
||||
"expected_facts": [
|
||||
"Sun: openclaw, nanoclaw, ironclaw",
|
||||
"Mon: kilo code",
|
||||
"Tue: codex",
|
||||
"Wed: gemini cli, cline",
|
||||
"Thu: opencode",
|
||||
"Fri: claude code",
|
||||
"Sat: aider, roo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-sunday-count",
|
||||
"type": "artifact",
|
||||
"question": "How many cron jobs run on Sunday?",
|
||||
"expected_facts": ["3", "three", "openclaw, nanoclaw, ironclaw"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-total-count",
|
||||
"type": "artifact",
|
||||
"question": "How many scout cron jobs were created in total by the end of the session?",
|
||||
"expected_facts": ["11", "eleven"]
|
||||
},
|
||||
{
|
||||
"id": "decision-kilo-open-source",
|
||||
"type": "decision",
|
||||
"question": "The user asked whether Kilo Code is open source. What was the answer, and what did the user decide to do with it?",
|
||||
"expected_facts": [
|
||||
"yes, open source",
|
||||
"Kilo-Org/kilocode",
|
||||
"added as Monday scout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "decision-saturday-fill",
|
||||
"type": "decision",
|
||||
"question": "Saturday was the last open day at one point. Which scout(s) were placed on Saturday, and why were those chosen?",
|
||||
"expected_facts": ["aider", "roo", "filled in last based on openrouter popularity / cli comparison rankings"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-execution-time",
|
||||
"type": "continuation",
|
||||
"question": "At what local time of day do these scout cron jobs run?",
|
||||
"expected_facts": ["10 AM Pacific", "17:00 UTC", "0 17 * * *"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-skill-used",
|
||||
"type": "continuation",
|
||||
"question": "Each scout job runs with a specific skill preloaded. Which one?",
|
||||
"expected_facts": ["hermes-agent-dev"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-weekday-coverage",
|
||||
"type": "continuation",
|
||||
"question": "After the session ended, are there any weekdays still uncovered by a scout job?",
|
||||
"expected_facts": ["no", "all 7 days covered", "full week loaded"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
{
|
||||
"fixture": "debug-session-feishu-id-model",
|
||||
"description": "Probes for the Feishu identity-model PR #8388 triage session. Anchors are the PR number, what the PR actually contained, what upstream docs confirmed, and the final decision + reasoning.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-pr-number",
|
||||
"type": "recall",
|
||||
"question": "What is the PR number under review in this session, and what repository is it against?",
|
||||
"expected_facts": ["PR #8388", "NousResearch/hermes-agent", "hermes-agent"]
|
||||
},
|
||||
{
|
||||
"id": "recall-bug-claim",
|
||||
"type": "recall",
|
||||
"question": "What is the core bug the PR claims to fix? Be specific about the identifier involved.",
|
||||
"expected_facts": ["open_id", "app-scoped", "not canonical", "Feishu identity model"]
|
||||
},
|
||||
{
|
||||
"id": "recall-upstream-confirmation",
|
||||
"type": "recall",
|
||||
"question": "Do upstream Feishu/Lark docs confirm that open_id is app-scoped rather than a canonical cross-app identity?",
|
||||
"expected_facts": ["yes", "confirmed", "open.feishu.cn", "same user has different Open IDs in different apps"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-pr-scope",
|
||||
"type": "artifact",
|
||||
"question": "Roughly how large is PR #8388, and which gateway subsystems does it touch beyond the Feishu adapter?",
|
||||
"expected_facts": ["4647 lines", "gateway/run.py", "cron/scheduler.py", "gateway/config.py", "multi-account", "bind"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-new-tool",
|
||||
"type": "artifact",
|
||||
"question": "Does the PR add a new tool file? If so, what is its path?",
|
||||
"expected_facts": ["tools/feishu_id_tool.py", "new file"]
|
||||
},
|
||||
{
|
||||
"id": "decision-pr-assessment",
|
||||
"type": "decision",
|
||||
"question": "What is the reviewer's overall assessment of PR #8388 — approve, reject, or something more nuanced? Explain in one sentence.",
|
||||
"expected_facts": [
|
||||
"core claim is correct",
|
||||
"scope is wrong",
|
||||
"bait-and-switch",
|
||||
"overbuilt",
|
||||
"implement cleaner ourselves"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "decision-core-claim-validity",
|
||||
"type": "decision",
|
||||
"question": "Setting aside the PR's size, is the underlying identity-model concern technically valid or not?",
|
||||
"expected_facts": ["technically valid", "correct", "open_id is app-scoped"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-action",
|
||||
"type": "continuation",
|
||||
"question": "Based on the review outcome, what is the next action the agent has been asked to take regarding this PR?",
|
||||
"expected_facts": ["close the PR", "implement ourselves", "cleaner", "less complex"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-implementation-scope",
|
||||
"type": "continuation",
|
||||
"question": "If implementing the Feishu fix cleanly ourselves, which specific behaviour needs to change — what should replace the current use of open_id?",
|
||||
"expected_facts": ["use union_id", "or user_id", "canonical identity", "cross-app stable ID"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-sources-to-reference",
|
||||
"type": "continuation",
|
||||
"question": "Which upstream documentation sources were fetched during review that should be referenced when writing the clean implementation?",
|
||||
"expected_facts": ["open.feishu.cn", "open.larkoffice.com", "user-identity-introduction"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,74 +0,0 @@
|
||||
{
|
||||
"fixture": "feature-impl-context-priority",
|
||||
"description": "Probes for the .hermes.md / AGENTS.md / CLAUDE.md / .cursorrules priority feature session. Anchors are the concrete facts the next assistant would need to continue: user's priority order, files modified, helper-function structure, live-test scenarios, and PR number.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-priority-order",
|
||||
"type": "recall",
|
||||
"question": "What is the priority order the user asked for when multiple project-context files are present? List them from highest to lowest priority.",
|
||||
"expected_facts": [".hermes.md", "AGENTS.md", "CLAUDE.md", ".cursorrules", "highest to lowest"]
|
||||
},
|
||||
{
|
||||
"id": "recall-selection-mode",
|
||||
"type": "recall",
|
||||
"question": "When multiple context files exist in the same directory, does the agent now load all of them or pick only one?",
|
||||
"expected_facts": ["only one", "priority-based selection", "highest-priority winner"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-files-modified",
|
||||
"type": "artifact",
|
||||
"question": "Which files in the hermes-agent repository were modified during this session? List them.",
|
||||
"expected_facts": [
|
||||
"agent/prompt_builder.py",
|
||||
"tests/agent/test_prompt_builder.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-helper-functions",
|
||||
"type": "artifact",
|
||||
"question": "The session introduced separate helper functions for each context-file type. What are their names?",
|
||||
"expected_facts": [
|
||||
"_load_hermes_md",
|
||||
"_load_agents_md",
|
||||
"_load_claude_md",
|
||||
"_load_cursorrules"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-test-scenarios",
|
||||
"type": "artifact",
|
||||
"question": "A scratch directory was created with scenario subdirectories to live-test the priority chain. Roughly how many scenarios, and what directory was it created under?",
|
||||
"expected_facts": ["10 scenarios", "/tmp/context-priority-test"]
|
||||
},
|
||||
{
|
||||
"id": "decision-claude-md-was-unsupported",
|
||||
"type": "decision",
|
||||
"question": "What was the finding about CLAUDE.md support in the existing loader before this session's changes?",
|
||||
"expected_facts": ["CLAUDE.md was not handled", "not supported", "new handler added"]
|
||||
},
|
||||
{
|
||||
"id": "decision-load-all-or-one",
|
||||
"type": "decision",
|
||||
"question": "Was the decision to load multiple context files when present, or to load only the highest-priority one? Explain the reasoning in one sentence.",
|
||||
"expected_facts": ["load only one", "highest priority", "user preference", "do not want to load multiple"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-pr-number-and-status",
|
||||
"type": "continuation",
|
||||
"question": "A pull request was opened for this feature. What is the PR number and what is its merge status?",
|
||||
"expected_facts": ["PR #2301", "merged", "squash"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-test-suite-result",
|
||||
"type": "continuation",
|
||||
"question": "What was the result of the full test suite run after the implementation changes?",
|
||||
"expected_facts": ["5680 passed", "0 failures", "clean"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-step",
|
||||
"type": "continuation",
|
||||
"question": "If asked to pick up this session, what is the current state of main? Anything left to do?",
|
||||
"expected_facts": ["merged to main", "main is current", "nothing outstanding", "pulled"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,235 +0,0 @@
|
||||
"""Markdown report rendering + diff-against-baseline for compression-eval runs.
|
||||
|
||||
Report format is optimised for pasting directly into a PR description.
|
||||
Top-of-report table is the per-fixture medians; below that is the
|
||||
probe-by-probe miss list (scores < 3.0 on overall).
|
||||
|
||||
Diff mode (``compare_to``) emits a second table with deltas per fixture
|
||||
per dimension against a previous run directory.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from rubric import DIMENSIONS
|
||||
|
||||
|
||||
def write_run_json(
    *,
    results_dir: Path,
    fixture_name: str,
    run_index: int,
    payload: Dict[str, Any],
) -> Path:
    """Dump one fixture's per-run results as JSON for later diffing.

    Creates ``results_dir`` (and any missing parents) if needed and writes
    ``<fixture_name>-run-<run_index>.json`` inside it, overwriting an
    existing file of the same name.

    Returns:
        The path of the file that was written.
    """
    results_dir.mkdir(parents=True, exist_ok=True)
    path = results_dir / f"{fixture_name}-run-{run_index}.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned to UTF-8 rather than left to the locale
    # default, which may be unable to encode them.
    with path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    return path
|
||||
|
||||
|
||||
def _median(values: List[float]) -> float:
    """Return the median of ``values``, or 0.0 when the list is empty."""
    if not values:
        return 0.0
    return statistics.median(values)
|
||||
|
||||
|
||||
def _format_score(value: float) -> str:
    """Format a score with exactly two decimal places."""
    return format(value, ".2f")
|
||||
|
||||
|
||||
def _format_delta(baseline: float, current: float) -> str:
    """Format ``current`` followed by its signed change from ``baseline``.

    A change smaller than 0.01 in magnitude is shown as ``(±0)``; otherwise
    the change is printed with two decimals, with an explicit ``+`` for
    increases.
    """
    change = current - baseline
    shown = f"{current:.2f}"
    if abs(change) < 0.01:
        return shown + " (±0)"
    prefix = "+" if change > 0 else ""
    return f"{shown} ({prefix}{change:.2f})"
|
||||
|
||||
|
||||
def summarize_fixture_runs(
    fixture_runs: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Aggregate the runs of one fixture into medians and a miss list.

    Each run payload has the shape
    ``{fixture_name, probes: [{id, type, scores: {...}, overall, ...}]}``.
    The probe ids are taken from the first run. Scores are first reduced to
    a per-probe median across runs, and those per-probe medians are then
    reduced to a single median across probes.

    Returns:
        ``{}`` for an empty input; otherwise a dict with ``fixture_name``,
        ``runs`` (number of runs), ``dimension_medians``,
        ``overall_median``, ``misses`` (probes whose median overall score
        is below 3.0, with the judge notes from the last run) and
        ``compression`` (copied from the first run, ``{}`` if absent).
    """
    if not fixture_runs:
        return {}

    first_run = fixture_runs[0]
    fixture_name = first_run["fixture_name"]
    run_count = len(fixture_runs)

    # Collect every run's samples per probe, keyed by the first run's ids.
    probe_ids = [probe["id"] for probe in first_run["probes"]]
    dim_samples: Dict[str, Dict[str, List[float]]] = {
        pid: {dim: [] for dim in DIMENSIONS} for pid in probe_ids
    }
    overall_samples: Dict[str, List[float]] = {pid: [] for pid in probe_ids}

    for run in fixture_runs:
        for probe in run["probes"]:
            pid = probe["id"]
            for dim in DIMENSIONS:
                dim_samples[pid][dim].append(probe["scores"].get(dim, 0))
            overall_samples[pid].append(probe["overall"])

    # Median per probe across runs, then median of those across probes.
    dim_medians: Dict[str, float] = {
        dim: _median([_median(dim_samples[pid][dim]) for pid in probe_ids])
        for dim in DIMENSIONS
    }
    probe_overall = {pid: _median(overall_samples[pid]) for pid in probe_ids}
    overall_median = _median([probe_overall[pid] for pid in probe_ids])

    # Index the last run by probe id (first occurrence wins). Notes vary
    # between runs, so a single run's notes serve as an illustrative clue.
    last_by_id: Dict[str, Dict[str, Any]] = {}
    for probe in fixture_runs[-1]["probes"]:
        last_by_id.setdefault(probe["id"], probe)

    misses: List[Dict[str, Any]] = []
    for pid in probe_ids:
        med = probe_overall[pid]
        if not med < 3.0:
            continue
        source = last_by_id.get(pid)
        misses.append({
            "id": pid,
            "type": source.get("type", "") if source is not None else "",
            "overall_median": med,
            "notes": source.get("notes", "") if source is not None else "",
        })

    return {
        "fixture_name": fixture_name,
        "runs": run_count,
        "dimension_medians": dim_medians,
        "overall_median": overall_median,
        "misses": misses,
        "compression": first_run.get("compression", {}),
    }
|
||||
|
||||
|
||||
def render_report(
    *,
    label: str,
    compressor_model: str,
    judge_model: str,
    runs_per_fixture: int,
    summaries: List[Dict[str, Any]],
    baseline_summaries: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Render the full markdown report.

    The report has four parts: a header with run metadata, the main
    per-fixture score table (one column per entry in ``DIMENSIONS`` plus
    ``overall``), a compression summary table, and a list of probes whose
    median overall score fell below 3.0, followed by a fixed methodology
    paragraph.

    Args:
        label: Run label shown in the report title.
        compressor_model: Name of the compressor model, shown verbatim.
        judge_model: Name of the judge model, shown verbatim.
        runs_per_fixture: Number of runs per fixture, shown verbatim.
        summaries: One summary dict per fixture with the keys
            ``fixture_name``, ``dimension_medians``, ``overall_median``,
            ``misses`` and (optionally) ``compression``.
        baseline_summaries: Same shape as ``summaries``, sourced from a
            previous run (via --compare-to). When present, scores in the
            main table are rendered through ``_format_delta`` for every
            value the baseline also carries.

    Returns:
        The markdown document as a single string ending in a newline.
    """
    lines: List[str] = [
        f"## Compression eval — label `{label}`",
        "",
        f"- Compressor model: `{compressor_model}`",
        f"- Judge model: `{judge_model}`",
        f"- Runs per fixture: {runs_per_fixture}",
        "- Medians over runs reported",
    ]
    if baseline_summaries:
        lines.append("- Deltas shown against baseline run")
    lines.append("")

    # Index the baseline by fixture name so each row can find its match.
    baseline_by_name: Dict[str, Dict[str, Any]] = {}
    if baseline_summaries:
        baseline_by_name = {s["fixture_name"]: s for s in baseline_summaries}

    # Main table
    header = ["Fixture"] + DIMENSIONS + ["overall"]
    lines.append("| " + " | ".join(header) + " |")
    lines.append("|" + "|".join(["---"] * len(header)) + "|")
    for s in summaries:
        row = [s["fixture_name"]]
        baseline = baseline_by_name.get(s["fixture_name"])
        baseline_dims = baseline.get("dimension_medians", {}) if baseline else {}
        for d in DIMENSIONS:
            cur = s["dimension_medians"][d]
            if d in baseline_dims:
                row.append(_format_delta(baseline_dims[d], cur))
            else:
                row.append(_format_score(cur))
        # Guard the overall lookup the same way the per-dimension lookups
        # are guarded, so a baseline lacking the key degrades to a plain
        # score instead of raising KeyError.
        if baseline and "overall_median" in baseline:
            row.append(_format_delta(baseline["overall_median"], s["overall_median"]))
        else:
            row.append(_format_score(s["overall_median"]))
        lines.append("| " + " | ".join(row) + " |")
    lines.append("")

    # Compression metadata
    lines.append("### Compression summary")
    lines.append("")
    lines.append("| Fixture | Pre tokens | Post tokens | Ratio | Pre msgs | Post msgs |")
    lines.append("|---|---|---|---|---|---|")
    for s in summaries:
        c = s.get("compression", {})
        lines.append(
            "| {name} | {pre} | {post} | {ratio:.1%} | {pm} | {pom} |".format(
                name=s["fixture_name"],
                pre=c.get("pre_tokens", 0),
                post=c.get("post_tokens", 0),
                ratio=c.get("compression_ratio", 0.0),
                pm=c.get("pre_message_count", 0),
                pom=c.get("post_message_count", 0),
            )
        )
    lines.append("")

    # Per-probe misses
    any_misses = any(s["misses"] for s in summaries)
    if any_misses:
        lines.append("### Probes scoring below 3.0 overall (median)")
        lines.append("")
        for s in summaries:
            if not s["misses"]:
                continue
            lines.append(f"**{s['fixture_name']}**")
            for m in s["misses"]:
                note_part = f" — {m['notes']}" if m["notes"] else ""
                lines.append(
                    f"- `{m['id']}` ({m['type']}): "
                    f"{m['overall_median']:.2f}{note_part}"
                )
            # Blank line closes this fixture's bullet list.
            lines.append("")

    lines.append("### Methodology")
    lines.append("")
    lines.append(
        "Probe-based eval adapted from "
        "https://factory.ai/news/evaluating-compression. Each fixture is "
        "compressed in a single forced `ContextCompressor.compress()` call, "
        "then a continuation call asks the compressor model to answer each "
        "probe from the compressed state, then the judge model scores the "
        "answer 0-5 on six dimensions. A single run is noisy; medians "
        "across multiple runs are the meaningful signal. Changes below "
        "~0.3 on any dimension are likely within run-to-run noise."
    )
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def load_baseline_summaries(baseline_dir: Path) -> List[Dict[str, Any]]:
    """Load summaries from a previous eval run for --compare-to.

    Reads the dumped per-run JSONs (files matching ``*-run-*.json``) and
    re-summarises them so the aggregation matches whatever summariser was
    current at the time of the new run (forward-compatible with schema
    additions).

    Args:
        baseline_dir: Directory holding the per-run JSON files.

    Returns:
        One summary per fixture, in the order fixtures are first seen
        while walking the files in sorted filename order. Empty when the
        directory holds no matching files.

    Raises:
        FileNotFoundError: If ``baseline_dir`` does not exist.
    """
    if not baseline_dir.exists():
        raise FileNotFoundError(f"baseline dir not found: {baseline_dir}")

    by_fixture: Dict[str, List[Dict[str, Any]]] = {}
    for path in sorted(baseline_dir.glob("*-run-*.json")):
        # JSON interchange is UTF-8; do not depend on the locale default,
        # which differs across platforms.
        with path.open(encoding="utf-8") as fh:
            payload = json.load(fh)
        by_fixture.setdefault(payload["fixture_name"], []).append(payload)

    return [summarize_fixture_runs(runs) for runs in by_fixture.values()]
|
||||
@@ -1,198 +0,0 @@
|
||||
"""Rubric for probe-based compression eval grading.
|
||||
|
||||
Six dimensions scored 0-5 by a judge model. The scoring anchors are spelled
|
||||
out so the judge interpretation is stable across runs and across judge
|
||||
models.
|
||||
|
||||
Adapted from the methodology in
|
||||
https://factory.ai/news/evaluating-compression. Their scoreboard is not
|
||||
adopted; only the dimension definitions and the 0-5 scale.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Canonical dimension order. All reports, parsers, and comparisons derive
|
||||
# from this list — do not hardcode the order elsewhere.
|
||||
DIMENSIONS: List[str] = [
|
||||
"accuracy",
|
||||
"context_awareness",
|
||||
"artifact_trail",
|
||||
"completeness",
|
||||
"continuity",
|
||||
"instruction_following",
|
||||
]
|
||||
|
||||
DIMENSION_DESCRIPTIONS: Dict[str, str] = {
|
||||
"accuracy": (
|
||||
"Are concrete facts correct — file paths, function names, PR/issue "
|
||||
"numbers, error codes, command outputs, line numbers? A single wrong "
|
||||
"path or error code should cost points. Vague but non-contradicting "
|
||||
"answers score mid-range."
|
||||
),
|
||||
"context_awareness": (
|
||||
"Does the answer reflect the CURRENT state of the session, not a "
|
||||
"mid-session snapshot? For example, if a file was modified then "
|
||||
"reverted, does the answer describe the reverted state? If three "
|
||||
"PRs were opened, does the answer know which was merged?"
|
||||
),
|
||||
"artifact_trail": (
|
||||
"Does the answer correctly enumerate the artifacts (files read, "
|
||||
"files modified, commands run, tools called, PRs opened, cron jobs "
|
||||
"created)? Missing artifacts cost more than extra unrelated ones."
|
||||
),
|
||||
"completeness": (
|
||||
"Does the answer address ALL parts of the probe question? If the "
|
||||
"probe asks for three things and only two are answered, that is "
|
||||
"incomplete regardless of accuracy on the two."
|
||||
),
|
||||
"continuity": (
|
||||
"Could the next assistant continue the work using only this answer, "
|
||||
"without having to re-fetch files or re-explore the codebase? An "
|
||||
"answer that lists files by name but doesn't mention the change is "
|
||||
"poor continuity even if accurate."
|
||||
),
|
||||
"instruction_following": (
|
||||
"Is the answer in the format the probe requested (list, number, "
|
||||
"short phrase, yes/no)? Ignore tone and length, only assess "
|
||||
"whether the requested form was honoured."
|
||||
),
|
||||
}
|
||||
|
||||
SCORE_SCALE: Dict[int, str] = {
|
||||
0: "No useful information; wrong or hallucinated.",
|
||||
1: "Major gaps or a key fact is wrong.",
|
||||
2: "Partially correct but significant omissions.",
|
||||
3: "Mostly correct with minor omissions or imprecision.",
|
||||
4: "Correct and complete with only trivial imprecision.",
|
||||
5: "Fully correct, complete, and in the requested format.",
|
||||
}
|
||||
|
||||
|
||||
_RUBRIC_HEADER = """You are an evaluator grading a single answer produced by an AI assistant \
|
||||
that was given a COMPRESSED handoff summary of an earlier conversation and \
|
||||
asked a probe question. You are NOT evaluating the compression summary \
|
||||
directly — you are evaluating whether the answer the assistant produced \
|
||||
from that summary is correct, complete, and useful.
|
||||
|
||||
Grade on six dimensions, each 0-5:
|
||||
|
||||
{dimension_block}
|
||||
|
||||
0-5 scale:
|
||||
{scale_block}
|
||||
|
||||
Grade strictly. Fractional scores are NOT allowed — output integers only. \
|
||||
If the answer is ambiguous, use the lower of the two candidate scores."""
|
||||
|
||||
|
||||
def build_judge_prompt(
    *,
    probe_question: str,
    probe_type: str,
    expected_facts: List[str],
    assistant_answer: str,
) -> str:
    """Build the full judge prompt for one (probe, answer) pair.

    The judge is told the expected_facts up front so grading is anchored to
    concrete signal rather than judge taste. Expected facts are intentionally
    NOT shown to the assistant that produces the answer.

    Args:
        probe_question: The probe question that was put to the assistant.
        probe_type: Probe category label, echoed to the judge verbatim.
        expected_facts: Concrete anchors the answer should contain; an
            empty list renders as ``(none provided)``.
        assistant_answer: The answer text to be graded.

    Returns:
        The prompt: rubric header, probe details, expected facts, the
        answer, and the JSON output schema, in that order.
    """
    dim_block = "\n".join(
        f"- {d}: {DIMENSION_DESCRIPTIONS[d]}" for d in DIMENSIONS
    )
    scale_block = "\n".join(
        f" {score}: {desc}" for score, desc in sorted(SCORE_SCALE.items())
    )
    header = _RUBRIC_HEADER.format(
        dimension_block=dim_block,
        scale_block=scale_block,
    )

    expected_block = (
        "\n".join(f"- {f}" for f in expected_facts) if expected_facts else "(none provided)"
    )

    # Derive the schema fields from DIMENSIONS instead of spelling them out,
    # so the requested JSON keys can never drift from the canonical list the
    # rubric text is built from.
    schema_fields = "".join(f' "{d}": <int 0-5>,\n' for d in DIMENSIONS)
    output_schema = (
        "Respond with ONLY a JSON object, no prose before or after, matching "
        "this schema exactly:\n"
        "{\n"
        + schema_fields
        + ' "notes": "<one short sentence, <=200 chars, identifying the '
        'single biggest issue with the answer if any>"\n'
        "}"
    )

    return (
        f"{header}\n\n"
        f"PROBE TYPE: {probe_type}\n\n"
        f"PROBE QUESTION:\n{probe_question}\n\n"
        "EXPECTED FACTS (the answer should contain these concrete anchors; "
        "missing any is a material defect in accuracy and/or completeness):\n"
        f"{expected_block}\n\n"
        f"ASSISTANT ANSWER TO GRADE:\n{assistant_answer}\n\n"
        f"{output_schema}"
    )
|
||||
|
||||
|
||||
def parse_judge_response(raw: str) -> Dict[str, Any]:
    """Parse the judge model's JSON response into a score dict.

    Tolerates surrounding prose (judges ignore instructions sometimes) by
    decoding the JSON object that starts at the first ``{`` and ignoring
    whatever follows it. Validates that every dimension in ``DIMENSIONS``
    is present as a finite number that rounds to an integer 0-5.

    Args:
        raw: The judge model's raw text response.

    Returns:
        Dict with keys: scores (dim->int), notes (str, at most 200
        characters), overall (float, mean of the dimension scores).

    Raises:
        ValueError: If the response is empty, holds no decodable JSON
            object, or lacks a valid score for any dimension.
    """
    import json
    import math
    import re

    if not raw or not raw.strip():
        raise ValueError("empty judge response")

    # Strip code fences and any ```json prefix judges sometimes emit.
    stripped = raw.strip()
    fence_match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", stripped, re.DOTALL)
    if fence_match:
        stripped = fence_match.group(1).strip()

    # Decode exactly one JSON value starting at the first "{". Unlike a
    # greedy "{...}" regex this stops at the matching brace, so trailing
    # prose that itself contains "}" no longer corrupts the candidate.
    start = stripped.find("{")
    if start == -1:
        raise ValueError(f"no JSON object found in judge response: {raw[:200]!r}")
    candidate = stripped[start:]

    try:
        parsed, _end = json.JSONDecoder().raw_decode(candidate)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"judge response not valid JSON: {exc}; raw={candidate[:200]!r}"
        ) from exc

    scores: Dict[str, int] = {}
    for dim in DIMENSIONS:
        if dim not in parsed:
            raise ValueError(f"judge response missing dimension {dim!r}: {parsed}")
        value = parsed[dim]
        # bool is an int subclass; reject it explicitly.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError(f"dimension {dim} is not numeric: {value!r}")
        # json accepts NaN/Infinity literals; round() would raise
        # OverflowError on infinity, breaking the ValueError contract.
        if isinstance(value, float) and not math.isfinite(value):
            raise ValueError(f"dimension {dim} is not numeric: {value!r}")
        int_val = int(round(value))
        if int_val < 0 or int_val > 5:
            raise ValueError(f"dimension {dim} out of range: {int_val}")
        scores[dim] = int_val

    notes_val = parsed.get("notes", "")
    notes = str(notes_val)[:200] if notes_val else ""

    overall = sum(scores.values()) / len(scores)
    return {
        "scores": scores,
        "notes": notes,
        "overall": overall,
    }
|
||||
@@ -1,383 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compression eval — entry point.
|
||||
|
||||
Runs the full probe-based eval over one or more fixtures, produces a
|
||||
markdown report in ``results/<label>/report.md`` paired with per-run JSON
|
||||
for later diffing.
|
||||
|
||||
Not a pytest. Requires a configured provider + credentials (same path the
|
||||
agent uses). Does not run in CI. See README.md for usage examples.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
_HERE = Path(__file__).resolve().parent
|
||||
_REPO_ROOT = _HERE.parents[1]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
# Make our sibling modules importable whether invoked as a script or as -m.
|
||||
if str(_HERE) not in sys.path:
|
||||
sys.path.insert(0, str(_HERE))
|
||||
|
||||
try:
|
||||
import fire # noqa: F401
|
||||
except ImportError:
|
||||
fire = None # fallback to argparse if fire is unavailable
|
||||
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider # noqa: E402
|
||||
|
||||
from compressor_driver import run_compression # noqa: E402
|
||||
from grader import answer_probe, grade_probe # noqa: E402
|
||||
from report import ( # noqa: E402
|
||||
load_baseline_summaries,
|
||||
render_report,
|
||||
summarize_fixture_runs,
|
||||
write_run_json,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("compression_eval")
|
||||
|
||||
|
||||
FIXTURES_DIR = _HERE / "fixtures"
|
||||
PROBES_DIR = _HERE / "probes"
|
||||
RESULTS_DIR = _HERE / "results"
|
||||
|
||||
|
||||
def _load_fixture(name: str) -> Dict[str, Any]:
    """Load the fixture JSON ``<name>.json`` from ``FIXTURES_DIR``.

    Args:
        name: Fixture name without the ``.json`` suffix.

    Returns:
        The decoded fixture document.

    Raises:
        FileNotFoundError: If no such fixture exists; the message lists
            the fixture names that are available.
    """
    path = FIXTURES_DIR / f"{name}.json"
    if not path.exists():
        # Reuse the shared lister so the message matches the default set.
        available = _available_fixtures()
        raise FileNotFoundError(
            f"Fixture not found: {name}. Available: {available}"
        )
    # Read as UTF-8 explicitly rather than relying on the locale default
    # encoding, which is not UTF-8 on every platform.
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def _load_probes(name: str) -> Dict[str, Any]:
    """Load the probe bank ``<name>.probes.json`` from ``PROBES_DIR``.

    Args:
        name: Fixture name the probe bank belongs to.

    Returns:
        The decoded probe-bank document.

    Raises:
        FileNotFoundError: If the probe bank file does not exist.
    """
    path = PROBES_DIR / f"{name}.probes.json"
    if not path.exists():
        raise FileNotFoundError(f"Probe bank not found for fixture {name}: {path}")
    # Explicit UTF-8: do not depend on the platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def _resolve_runtime(
    *,
    provider_override: Optional[str],
    model_override: Optional[str],
) -> Dict[str, Any]:
    """Resolve provider credentials via the same path the agent uses.

    Delegates to ``resolve_runtime_provider`` and returns its result as
    long as it carries an ``api_key`` or a ``base_url``.

    Raises:
        RuntimeError: If the resolved runtime has neither an API key nor
            a base URL.
    """
    resolved = resolve_runtime_provider(
        requested=provider_override,
        target_model=model_override,
    )
    if resolved.get("api_key") or resolved.get("base_url"):
        return resolved
    raise RuntimeError(
        "No provider configured. Run `hermes setup` or set provider "
        "credentials in the environment before running the eval."
    )
|
||||
|
||||
|
||||
def _available_fixtures() -> List[str]:
    """Return the stems of every ``*.json`` file in ``FIXTURES_DIR``, sorted."""
    stems = [fixture_path.stem for fixture_path in FIXTURES_DIR.glob("*.json")]
    stems.sort()
    return stems
|
||||
|
||||
|
||||
def _run_one_fixture(
    *,
    fixture_name: str,
    run_index: int,
    compressor_runtime: Dict[str, Any],
    compressor_model: str,
    judge_runtime: Dict[str, Any],
    judge_model: str,
    focus_topic: Optional[str],
) -> Dict[str, Any]:
    """Run one compression + probe + grading pass over a single fixture.

    Compresses the fixture's messages once, then for every probe asks the
    compressor model to answer from the compressed messages and has the
    judge model grade that answer. A failed continuation is logged and
    graded as an empty answer; a failed grading is logged and recorded as
    all-zero scores with the error in ``notes`` / ``parse_error``.

    Args:
        fixture_name: Fixture (and probe bank) to load.
        run_index: Run number, used in log lines and the payload.
        compressor_runtime: Resolved runtime for the compressor; reads
            ``provider``, ``base_url``, ``api_key`` and optional ``api_mode``.
        compressor_model: Model used for compression and for answering.
        judge_runtime: Resolved runtime for the judge; reads ``provider``,
            ``base_url`` and ``api_key``.
        judge_model: Model used for grading.
        focus_topic: Optional focus topic forwarded to the compressor.

    Returns:
        Payload dict with ``fixture_name``, ``run_index``, ``compression``
        metadata and the list of per-probe results under ``probes``.
    """
    fx = _load_fixture(fixture_name)
    probes = _load_probes(fixture_name)

    logger.info(
        "[%s run=%d] compressing (%d messages, ctx=%d)",
        fixture_name, run_index, len(fx["messages"]), fx["context_length"],
    )
    compression = run_compression(
        messages=fx["messages"],
        compressor_model=compressor_model,
        compressor_provider=compressor_runtime["provider"],
        compressor_base_url=compressor_runtime["base_url"],
        compressor_api_key=compressor_runtime["api_key"],
        compressor_api_mode=compressor_runtime.get("api_mode", ""),
        context_length=fx["context_length"],
        focus_topic=focus_topic,
        # Force the compressor to use the model we're testing, bypassing
        # any auxiliary.compression.model config override. Without this,
        # ContextCompressor.call_llm(task="compression") routes through
        # the user's config which may pin a different model (e.g.
        # google/gemini-3-flash-preview).
        summary_model_override=compressor_model,
    )
    logger.info(
        "[%s run=%d] compressed %d -> %d tokens (%.1f%%)",
        fixture_name, run_index,
        compression["pre_tokens"], compression["post_tokens"],
        compression["compression_ratio"] * 100,
    )

    probe_results: List[Dict[str, Any]] = []
    for probe in probes["probes"]:
        t0 = time.monotonic()
        try:
            answer = answer_probe(
                compressed_messages=compression["compressed_messages"],
                probe_question=probe["question"],
                provider=compressor_runtime["provider"],
                model=compressor_model,
                base_url=compressor_runtime["base_url"],
                api_key=compressor_runtime["api_key"],
            )
        except Exception as exc:
            # Best-effort: keep the run going and grade an empty answer.
            logger.warning(
                "[%s run=%d probe=%s] continuation failed: %s",
                fixture_name, run_index, probe["id"], exc,
            )
            answer = ""

        try:
            grade = grade_probe(
                probe_question=probe["question"],
                probe_type=probe["type"],
                expected_facts=probe.get("expected_facts", []),
                assistant_answer=answer,
                judge_provider=judge_runtime["provider"],
                judge_model=judge_model,
                judge_base_url=judge_runtime["base_url"],
                judge_api_key=judge_runtime["api_key"],
            )
        except Exception as exc:
            logger.warning(
                "[%s run=%d probe=%s] grading failed: %s",
                fixture_name, run_index, probe["id"], exc,
            )
            from rubric import DIMENSIONS
            grade = {
                "scores": {d: 0 for d in DIMENSIONS},
                "notes": f"grading error: {exc}",
                "overall": 0.0,
                "raw": "",
                "parse_error": str(exc),
            }

        elapsed = time.monotonic() - t0
        logger.info(
            "[%s run=%d probe=%s] overall=%.2f (%.1fs)",
            fixture_name, run_index, probe["id"], grade["overall"], elapsed,
        )

        probe_results.append({
            "id": probe["id"],
            "type": probe["type"],
            "question": probe["question"],
            "expected_facts": probe.get("expected_facts", []),
            "answer": answer,
            "scores": grade["scores"],
            "overall": grade["overall"],
            "notes": grade["notes"],
            # parse_error is diagnostic only; a successful grade that omits
            # the key must not discard a run whose model calls already ran.
            "parse_error": grade.get("parse_error"),
            "elapsed_seconds": elapsed,
        })

    return {
        "fixture_name": fixture_name,
        "run_index": run_index,
        "compression": {
            "pre_tokens": compression["pre_tokens"],
            "post_tokens": compression["post_tokens"],
            "compression_ratio": compression["compression_ratio"],
            "pre_message_count": compression["pre_message_count"],
            "post_message_count": compression["post_message_count"],
            "summary_text": compression["summary_text"],
        },
        "probes": probe_results,
    }
|
||||
|
||||
|
||||
def _coerce_fixtures_arg(arg: Optional[str]) -> List[str]:
|
||||
if not arg:
|
||||
return _available_fixtures()
|
||||
return [s.strip() for s in arg.split(",") if s.strip()]
|
||||
|
||||
|
||||
def main(
    fixtures: Optional[str] = None,
    runs: int = 3,
    judge_model: Optional[str] = None,
    judge_provider: Optional[str] = None,
    compressor_model: Optional[str] = None,
    compressor_provider: Optional[str] = None,
    label: Optional[str] = None,
    focus_topic: Optional[str] = None,
    compare_to: Optional[str] = None,
    verbose: bool = False,
) -> int:
    """Run the compression eval.

    Args:
        fixtures: Comma-separated fixture names; default = all in fixtures/.
        runs: Runs per fixture. Medians reported. Default 3. Must be >= 1.
        judge_model: Override the judge model (default = same as
            compressor model resolved from config).
        judge_provider: Override the judge provider.
        compressor_model: Override the compressor model (default =
            whatever resolve_runtime_provider returns for the active
            configuration).
        compressor_provider: Override the compressor provider.
        label: Output subdirectory under results/. Default = timestamp.
        focus_topic: Optional focus topic passed through to
            ContextCompressor.compress(focus_topic=...).
        compare_to: Path to a previous run directory (e.g.
            results/2026-04-24_baseline) to diff against in the report.
            Relative paths are resolved against this script's directory.
        verbose: Print debug logs.

    Returns:
        0 once the report and summary have been written.

    Raises:
        ValueError: If ``runs`` is less than 1.
        FileNotFoundError: If a requested fixture or its probe bank is
            missing.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    # Zero runs would leave nothing to summarise and fail later with an
    # opaque IndexError; reject it up front.
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")

    fixture_names = _coerce_fixtures_arg(fixtures)
    # Validate every fixture has a probe bank before spending any money.
    for name in fixture_names:
        _load_fixture(name)
        _load_probes(name)

    compressor_runtime = _resolve_runtime(
        provider_override=compressor_provider,
        model_override=compressor_model,
    )
    effective_compressor_model = (
        compressor_model or compressor_runtime.get("resolved_model") or "auto"
    )
    if effective_compressor_model == "auto":
        # resolve_runtime_provider doesn't always fill resolved_model;
        # fall back to reading model.default from config.
        from hermes_cli.config import load_config
        cfg = load_config()
        mc = cfg.get("model", {}) or {}
        if isinstance(mc, dict):
            effective_compressor_model = (
                mc.get("default") or mc.get("model") or "anthropic/claude-sonnet-4.6"
            )
        else:
            effective_compressor_model = str(mc) or "anthropic/claude-sonnet-4.6"

    if judge_provider or judge_model:
        judge_runtime = _resolve_runtime(
            provider_override=judge_provider,
            model_override=judge_model,
        )
        effective_judge_model = judge_model or effective_compressor_model
    else:
        # No judge override: grade with the compressor's runtime and model.
        judge_runtime = compressor_runtime
        effective_judge_model = effective_compressor_model

    effective_label = label or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out_dir = RESULTS_DIR / effective_label
    out_dir.mkdir(parents=True, exist_ok=True)

    logger.info(
        "Compression eval starting: label=%s fixtures=%s runs=%d "
        "compressor=%s judge=%s out=%s",
        effective_label, fixture_names, runs,
        effective_compressor_model, effective_judge_model, out_dir,
    )

    all_summaries: List[Dict[str, Any]] = []
    for fixture_name in fixture_names:
        per_run: List[Dict[str, Any]] = []
        for run_i in range(1, runs + 1):
            payload = _run_one_fixture(
                fixture_name=fixture_name,
                run_index=run_i,
                compressor_runtime=compressor_runtime,
                compressor_model=effective_compressor_model,
                judge_runtime=judge_runtime,
                judge_model=effective_judge_model,
                focus_topic=focus_topic,
            )
            # Persist each run as soon as it finishes so a later failure
            # does not lose completed work.
            write_run_json(
                results_dir=out_dir,
                fixture_name=fixture_name,
                run_index=run_i,
                payload=payload,
            )
            per_run.append(payload)
        summary = summarize_fixture_runs(per_run)
        all_summaries.append(summary)

    baseline_summaries: Optional[List[Dict[str, Any]]] = None
    if compare_to:
        baseline_path = Path(compare_to)
        if not baseline_path.is_absolute():
            baseline_path = _HERE / baseline_path
        baseline_summaries = load_baseline_summaries(baseline_path)

    report_md = render_report(
        label=effective_label,
        compressor_model=effective_compressor_model,
        judge_model=effective_judge_model,
        runs_per_fixture=runs,
        summaries=all_summaries,
        baseline_summaries=baseline_summaries,
    )
    report_path = out_dir / "report.md"
    # The report contains non-ASCII characters (em dashes); write UTF-8
    # explicitly instead of the locale default encoding.
    report_path.write_text(report_md, encoding="utf-8")

    # Also write a machine-readable summary.json alongside the human report.
    summary_path = out_dir / "summary.json"
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding
    # must be pinned to UTF-8 as well.
    with summary_path.open("w", encoding="utf-8") as fh:
        json.dump(
            {
                "label": effective_label,
                "compressor_model": effective_compressor_model,
                "judge_model": effective_judge_model,
                "runs_per_fixture": runs,
                "fixtures": all_summaries,
            },
            fh,
            indent=2,
            ensure_ascii=False,
        )

    print()
    print(report_md)
    print(f"Report written to {report_path}")
    print(f"Per-run JSON in {out_dir}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    if fire is None:
        # fire is unavailable: expose the same options through argparse.
        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument("--fixtures")
        parser.add_argument("--runs", type=int, default=3)
        parser.add_argument("--judge-model")
        parser.add_argument("--judge-provider")
        parser.add_argument("--compressor-model")
        parser.add_argument("--compressor-provider")
        parser.add_argument("--label")
        parser.add_argument("--focus-topic")
        parser.add_argument("--compare-to")
        parser.add_argument("--verbose", action="store_true")
        cli_kwargs = vars(parser.parse_args())
        sys.exit(main(**cli_kwargs))
    # fire preserves docstrings as --help and handles kwarg-style CLI.
    sys.exit(fire.Fire(main))
|
||||
@@ -1,381 +0,0 @@
|
||||
"""One-shot fixture scrubber for scripts/compression_eval/fixtures/.
|
||||
|
||||
Source: ~/.hermes/sessions/<file>.jsonl
|
||||
Output: .worktrees/.../scripts/compression_eval/fixtures/<name>.json
|
||||
|
||||
Scrubbing passes:
|
||||
1. agent.redact.redact_sensitive_text — API keys, tokens, connection strings
|
||||
2. Username paths — /home/teknium/ → /home/user/, ~/.hermes/ preserved as-is
|
||||
(that path is universal)
|
||||
3. Personal handles — "Teknium"/"teknium"/"teknium1" → "user"
|
||||
4. Reasoning scratchpads — strip <REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>
|
||||
blocks and <think>...</think> tags (personality leakage risk)
|
||||
5. session_meta line — drop entirely, we only need the messages
|
||||
6. User message personality — lightly paraphrase the first user message to keep
|
||||
task intent while removing "vibe"; subsequent user turns kept verbatim
|
||||
since they're short instructions
|
||||
|
||||
The fixture format matches DESIGN.md:
|
||||
{
|
||||
"name": "...",
|
||||
"description": "...",
|
||||
"model": "...", # best guess from original session
|
||||
"context_length": 200000,
|
||||
"messages": [...], # OpenAI-format, only role/content/tool_calls/tool_call_id/tool_name
|
||||
"notes": "Scrubbed from ~/.hermes/sessions/... on 2026-04-24"
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Resolve the hermes-agent checkout relative to this script so agent.redact
|
||||
# imports cleanly whether we run from a worktree or a main clone.
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
from agent.redact import redact_sensitive_text # noqa: E402
|
||||
|
||||
|
||||
SESSION_DIR = Path.home() / ".hermes" / "sessions"
|
||||
# Resolve FIXTURES_DIR relative to this script so the scrubber runs the
|
||||
# same way inside a worktree, a main checkout, or from a contributor's
|
||||
# clone at a different path.
|
||||
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
|
||||
|
||||
# (source_file, output_name, description, user_first_paraphrase, model_guess, context_length, truncate_at)
|
||||
# truncate_at: keep messages[:truncate_at] (None = keep all). Applied BEFORE
|
||||
# orphan-empty-assistant cleanup.
|
||||
SPECS = [
|
||||
(
|
||||
"20260321_060441_fef7be92.jsonl",
|
||||
"feature-impl-context-priority",
|
||||
"~75-turn feature-impl: user asks how multiple project-context files "
|
||||
"(.hermes.md / AGENTS.md / CLAUDE.md / .cursorrules) are handled when "
|
||||
"all are present; agent investigates the codebase, designs a priority "
|
||||
"order, patches the loader + tests, live-tests with a scenario "
|
||||
"directory, commits to a feature branch, opens a PR, and merges after "
|
||||
"approval. Exercises investigate → decide → implement → verify → "
|
||||
"ship flow with clear artifact trail (2 files modified, 1 PR).",
|
||||
(
|
||||
"If .hermes.md, AGENTS.md, CLAUDE.md, and .cursorrules all exist in "
|
||||
"the same directory, does the agent load all of them or pick one? "
|
||||
"Use the hermes-agent-dev skill to check."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
74, # cut at "Merged and pulled. Main is current." — drops trailing unrelated cron-delivery messages
|
||||
),
|
||||
(
|
||||
"20260412_233741_3f2119a8.jsonl",
|
||||
"debug-session-feishu-id-model",
|
||||
"~60-turn debug/triage PR-review session: a third-party bug report "
|
||||
"says the gateway's Feishu adapter misuses the open_id / union_id / "
|
||||
"user_id identity model (open_id is app-scoped, not the bot's "
|
||||
"canonical ID). An open community PR (#8388) tries to fix it. Agent "
|
||||
"reviews the PR against current main, fetches upstream Feishu/Lark "
|
||||
"identity docs, and produces a decision. Exercises long tool-heavy "
|
||||
"context with PR diffs, upstream docs, and a clear decision at the "
|
||||
"end — the classic 'can the summary still name the PR number, the "
|
||||
"root cause, and the decision?' scenario.",
|
||||
(
|
||||
"A community user reports the Feishu/Lark adapter gets the identity "
|
||||
"model wrong — open_id is app-scoped, not the bot's canonical ID. "
|
||||
"There's an open PR #8388 trying to fix it. Use the hermes-agent-dev "
|
||||
"skill and the pr-triage-salvage skill to review it."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
58, # end at "Here's my review: ..." — clean decision point before the "close it, implement cleaner" pivot
|
||||
),
|
||||
(
|
||||
"20260328_160817_77bd258b.jsonl",
|
||||
"config-build-competitive-scouts",
|
||||
"~60-turn iterative config/build session: user wants a set of weekly "
|
||||
"cron jobs that scan competing AI coding agents (openclaw, nanoclaw, "
|
||||
"ironclaw, codex, opencode, claude-code, kilo-code, gemini-cli, "
|
||||
"cline, aider, roo) for merged PRs or web updates worth porting to "
|
||||
"hermes-agent. User adds one target per turn; agent creates each cron "
|
||||
"job and re-states the accumulated schedule. Exercises artifact trail "
|
||||
"(which jobs are configured, which days) and iterative state "
|
||||
"accumulation — the canonical case for iterative-merge summarization.",
|
||||
(
|
||||
"Set up a cron job for the agent every Sunday to scan all PRs "
|
||||
"merged into openclaw that week, decide which are worth adding to "
|
||||
"hermes-agent, and open PRs porting those features."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# Tool output truncation is DELIBERATELY DISABLED.
#
# An earlier iteration truncated tool outputs > 2KB to keep fixture JSON
# files small, but that defeats the whole purpose of the eval. Real
# sessions have 30KB skill_view dumps, 10KB read_file outputs, 5KB
# web_extract bodies — compression has to either head-protect them,
# summarize them, or drop them. A fixture without that load doesn't
# exercise the compressor. The size win wasn't worth the signal loss.
#
# The function remains so the scrubbing_passes record in the fixture
# JSON continues to truthfully describe what was applied (no-op in this
# configuration).
_TOOL_OUTPUT_MAX = None  # None disables truncation entirely


def _maybe_truncate_tool_output(text: str, tool_name: str) -> str:
    """Cut ``text`` down to the configured limit, appending a marker.

    Args:
        text: Tool output to cap (expected to be already scrubbed).
        tool_name: Name of the tool that produced it; only used in the
            marker text, and omitted from it when empty.

    Returns:
        ``text`` unchanged when truncation is disabled
        (``_TOOL_OUTPUT_MAX is None``), when ``text`` is empty, or when it
        already fits. Otherwise the leading part of ``text`` followed by a
        marker recording the original length.
    """
    limit = _TOOL_OUTPUT_MAX
    if limit is None or not text or len(text) <= limit:
        return text
    # 200 chars are reserved for the marker. Clamp at zero: with a limit
    # below 200 a negative slice bound would keep almost the whole text
    # (everything except the tail) instead of shortening it.
    keep = max(0, limit - 200)
    head = text[:keep]
    return (
        head
        + f"\n\n[... tool output truncated for fixture — original was {len(text)} chars"
        + (f" from {tool_name}" if tool_name else "")
        + "]"
    )
|
||||
|
||||
|
||||
# Maintainer home-directory prefix; \b stops it matching a longer user name.
_PATH_RE = re.compile(r"/home/teknium\b")
# No \b boundaries — some tool content stores newlines as the literal
# two-char sequence "\\n" (escaped JSON), so a "\\nTeknium..." run has a
# word char ('n') immediately before 'T' and \b fails. Substring match is
# safer here; "Teknium" as a substring of an unrelated word is
# implausible in this corpus.
# IGNORECASE already covers every case variant, so a single pattern with an
# optional trailing "1" is enough; the greedy "1?" consumes the digit so the
# "teknium1" handle is replaced as a whole.
_USER_RE = re.compile(r"teknium1?", re.IGNORECASE)
# Only strip scratchpads in ASSISTANT content, not tool results (might be legit)
_SCRATCH_RE = re.compile(
    r"<REASONING_SCRATCHPAD>.*?</REASONING_SCRATCHPAD>\s*", re.DOTALL
)
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# Discord/Telegram user mention leakage in messaging-platform sessions
_USER_MENTION_RE = re.compile(r"<@\*{3}>|<@\d+>")
# Contributor emails (from git show output etc) — anything@domain.tld.
# The pattern has no carve-out for noreply-style addresses: every address it
# matches is handed to the caller for replacement.
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# "Author: Name <email>" git-show headers — rewrite the whole line
_GIT_AUTHOR_RE = re.compile(r"Author:\s*[^<\n]+<[^>]+>")
|
||||
|
||||
|
||||
def _scrub_text(text: str, *, drop_scratchpads: bool = False) -> str:
    """Run a raw string through the redaction pipeline and return the result.

    Empty or falsy input is returned untouched. When ``drop_scratchpads``
    is true, reasoning-scratchpad and think blocks are removed first;
    callers enable this for assistant content only, so tool outputs that
    happen to contain similar markers are left alone.
    """
    if not text:
        return text

    # Ordered (pattern, replacement) steps. Order matters: the git
    # "Author: Name <email>" rewrite has to run before the generic email
    # replacement.
    steps = []
    if drop_scratchpads:
        steps.append((_SCRATCH_RE, ""))
        steps.append((_THINK_RE, ""))
    steps.extend(
        (
            (_PATH_RE, "/home/user"),
            (_USER_RE, "user"),
            (_USER_MENTION_RE, "<@user>"),
            (_GIT_AUTHOR_RE, "Author: contributor <contributor@example.com>"),
            (_EMAIL_RE, "contributor@example.com"),
        )
    )
    for pattern, replacement in steps:
        text = pattern.sub(replacement, text)

    # Final pass: the shared redaction helper.
    return redact_sensitive_text(text)
|
||||
|
||||
|
||||
def _content_to_str(content: Any) -> str:
    """Flatten a message ``content`` field into one string.

    * ``None`` becomes ``""``.
    * A string is returned as-is.
    * A list is treated as content parts: dict parts contribute their
      ``"text"`` value, string parts contribute themselves, anything else
      is ignored; the pieces are joined with newlines.
    * Any other value is converted with ``str()``.

    A part whose ``"text"`` is ``None`` is skipped and a non-string text
    value is stringified, so the join can never raise ``TypeError``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, dict) and "text" in part:
                piece = part["text"]
            elif isinstance(part, str):
                piece = part
            else:
                continue
            if piece is None:
                continue
            pieces.append(piece if isinstance(piece, str) else str(piece))
        return "\n".join(pieces)
    return str(content)
|
||||
|
||||
|
||||
def _scrub_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return scrubbed copies of an assistant message's tool calls.

    Each output entry keeps only ``id``, ``type`` and ``function`` (with
    ``name`` and ``arguments``). Entries that are not dicts are dropped.

    ``arguments`` is usually a JSON string, which is scrubbed as text. When
    it is already a parsed structure (dict/list), every string value inside
    it is scrubbed recursively, so nested values cannot bypass redaction;
    keys and non-string values are left untouched.
    """

    def _scrub_value(value: Any) -> Any:
        # Scrub string leaves, recurse into containers, pass the rest through.
        if isinstance(value, str):
            return _scrub_text(value)
        if isinstance(value, dict):
            return {key: _scrub_value(item) for key, item in value.items()}
        if isinstance(value, list):
            return [_scrub_value(item) for item in value]
        return value

    out = []
    for tc in tool_calls or []:
        if not isinstance(tc, dict):
            continue
        fn = tc.get("function")
        if not isinstance(fn, dict):
            # Missing/None/malformed "function": fall back to empty fields.
            fn = {}
        out.append(
            {
                "id": tc.get("id", ""),
                "type": tc.get("type", "function"),
                "function": {
                    "name": fn.get("name", ""),
                    "arguments": _scrub_value(fn.get("arguments", "")),
                },
            }
        )
    return out
|
||||
|
||||
|
||||
def _scrub_message(m: Dict[str, Any], *, first_user_paraphrase: str | None, user_turn_idx: List[int]) -> Dict[str, Any] | None:
    """Scrub one raw session message into its fixture form.

    Returns ``None`` for records with no role or with role
    ``"session_meta"``. ``user_turn_idx`` is a one-element list used as a
    mutable counter: it is incremented for every user message, and the
    first one is replaced by ``first_user_paraphrase`` when that is given.
    """
    role = m.get("role")
    if role is None or role == "session_meta":
        return None

    raw = _content_to_str(m.get("content"))
    tool_label = m.get("tool_name") or m.get("name")

    if role == "user":
        # Use paraphrase for the very first user turn only
        user_turn_idx[0] += 1
        is_first_turn = user_turn_idx[0] == 1
        if is_first_turn and first_user_paraphrase is not None:
            content = first_user_paraphrase
        else:
            content = _scrub_text(raw)
    else:
        # Scratchpad/think blocks are stripped from assistant content only.
        content = _scrub_text(raw, drop_scratchpads=(role == "assistant"))
        if role == "tool":
            # Truncate large tool outputs
            content = _maybe_truncate_tool_output(content, tool_label or "")

    scrubbed: Dict[str, Any] = {"role": role, "content": content}

    if role == "assistant":
        calls = m.get("tool_calls") or []
        if calls:
            scrubbed["tool_calls"] = _scrub_tool_calls(calls)
    elif role == "tool":
        if m.get("tool_call_id"):
            scrubbed["tool_call_id"] = m["tool_call_id"]
        if tool_label:
            scrubbed["tool_name"] = tool_label

    return scrubbed
|
||||
|
||||
|
||||
def build_fixture(
    source_file: str,
    output_name: str,
    description: str,
    first_user_paraphrase: str,
    model_guess: str,
    context_length: int,
    truncate_at: int | None = None,
) -> Dict[str, Any]:
    """Build one scrubbed fixture dict from a raw session JSONL file.

    Args:
        source_file: File name inside ``SESSION_DIR`` (one JSON object per line).
        output_name: Stored as the fixture's ``name``.
        description: Stored as the fixture's ``description``.
        first_user_paraphrase: Replacement text for the first user message.
        model_guess: Stored as the fixture's ``model``.
        context_length: Stored as the fixture's ``context_length``.
        truncate_at: When not ``None``, keep only this many messages
            (counted after ``session_meta`` records are removed).

    Returns:
        The fixture dict: metadata, the list of scrubbing passes, and the
        scrubbed ``messages`` headed by a synthetic system message.
    """
    # Imported locally so this function does not depend on the file's
    # import header providing ``timezone``.
    from datetime import timezone

    src = SESSION_DIR / source_file
    raw_msgs: List[Dict[str, Any]] = []
    # Decode as UTF-8 explicitly so the result does not depend on the locale.
    with src.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort: a corrupt line is skipped, not fatal.
                continue
            # Only JSON objects are messages; anything else would break the
            # ``.get("role")`` lookups below.
            if isinstance(record, dict):
                raw_msgs.append(record)

    # Skip session_meta lines up front so truncate_at counts real messages
    raw_msgs = [m for m in raw_msgs if m.get("role") != "session_meta"]
    if truncate_at is not None:
        raw_msgs = raw_msgs[:truncate_at]

    user_turn_counter = [0]
    scrubbed: List[Dict[str, Any]] = []
    for m in raw_msgs:
        new = _scrub_message(
            m,
            first_user_paraphrase=first_user_paraphrase,
            user_turn_idx=user_turn_counter,
        )
        if new is not None:
            scrubbed.append(new)

    # Drop empty-content assistant messages that have no tool_calls
    # (artifact of scratchpad-only turns post-scrub)
    pruned: List[Dict[str, Any]] = [
        m
        for m in scrubbed
        if not (
            m["role"] == "assistant"
            and not (m.get("content") or "").strip()
            and not m.get("tool_calls")
        )
    ]
    # Trim trailing orphan tool messages (no matching assistant)
    while pruned and pruned[-1]["role"] == "tool":
        pruned.pop()
    scrubbed = pruned

    # Inject a synthetic public-safe system message so the compressor has
    # a head to anchor on. The real system prompts embed personality and
    # platform-specific content we don't want checked in.
    system_msg = {
        "role": "system",
        "content": (
            "You are a helpful AI coding assistant with access to tools "
            "(terminal, file editing, search, web, etc.). You operate in a "
            "conversational loop: the user gives you a task, you call tools "
            "to accomplish it, and you report back concisely."
        ),
    }
    if scrubbed and scrubbed[0].get("role") == "system":
        scrubbed[0] = system_msg
    else:
        scrubbed.insert(0, system_msg)

    fixture = {
        "name": output_name,
        "description": description,
        "model": model_guess,
        "context_length": context_length,
        "source": f"~/.hermes/sessions/{source_file}",
        "truncated_to": truncate_at,
        # The trailing "Z" denotes UTC, so take the timestamp in UTC rather
        # than in local time.
        "scrubbed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "scrubbing_passes": [
            "redact_sensitive_text (agent.redact)",
            "username paths replaced with /home/user",
            "personal handles (all case variants of the maintainer name) replaced with 'user'",
            "email addresses replaced with contributor@example.com",
            "git 'Author: Name <addr>' header lines normalised",
            "reasoning scratchpad blocks stripped from assistant content",
            "think tag blocks stripped from assistant content",
            "messaging-platform user mentions replaced with <@user>",
            "first user message paraphrased to remove personal voice",
            "subsequent user messages kept verbatim (after above redactions)",
            "system prompt replaced with generic public-safe placeholder",
            "orphan empty-assistant messages and trailing tool messages dropped",
            "tool outputs preserved verbatim (truncation disabled so the compressor sees real load)",
        ],
        "messages": scrubbed,
    }
    return fixture
|
||||
|
||||
|
||||
def main() -> int:
    """Build every fixture listed in ``SPECS`` and write it as JSON.

    Each spec tuple is unpacked into the arguments of ``build_fixture``;
    the result is written to ``FIXTURES_DIR/<output_name>.json`` and a
    one-line size/message-count summary is printed.

    Returns:
        ``0``, used as the process exit status.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    for spec in SPECS:
        source_file, output_name, description, paraphrase, model, ctx, truncate = spec
        fixture = build_fixture(
            source_file=source_file,
            output_name=output_name,
            description=description,
            first_user_paraphrase=paraphrase,
            model_guess=model,
            context_length=ctx,
            truncate_at=truncate,
        )
        out_path = FIXTURES_DIR / f"{output_name}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # must be opened as UTF-8 explicitly; the locale default encoding
        # may not be able to encode them.
        with out_path.open("w", encoding="utf-8") as fh:
            json.dump(fixture, fh, indent=2, ensure_ascii=False)
        size_kb = out_path.stat().st_size / 1024
        print(f" {output_name}.json {size_kb:.1f} KB {len(fixture['messages'])} msgs")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -44,7 +44,6 @@ AUTHOR_MAP = {
|
||||
"teknium@nousresearch.com": "teknium1",
|
||||
"127238744+teknium1@users.noreply.github.com": "teknium1",
|
||||
"343873859@qq.com": "DrStrangerUJN",
|
||||
"uzmpsk.dilekakbas@gmail.com": "dlkakbs",
|
||||
"jefferson@heimdallstrategy.com": "Mind-Dragon",
|
||||
"130918800+devorun@users.noreply.github.com": "devorun",
|
||||
"maks.mir@yahoo.com": "say8hi",
|
||||
@@ -61,7 +60,6 @@ AUTHOR_MAP = {
|
||||
"abner.the.foreman@agentmail.to": "Abnertheforeman",
|
||||
"harryykyle1@gmail.com": "hharry11",
|
||||
"kshitijk4poor@gmail.com": "kshitijk4poor",
|
||||
"keira.voss94@gmail.com": "keiravoss94",
|
||||
"16443023+stablegenius49@users.noreply.github.com": "stablegenius49",
|
||||
"185121704+stablegenius49@users.noreply.github.com": "stablegenius49",
|
||||
"101283333+batuhankocyigit@users.noreply.github.com": "batuhankocyigit",
|
||||
@@ -79,7 +77,6 @@ AUTHOR_MAP = {
|
||||
"77628552+raulvidis@users.noreply.github.com": "raulvidis",
|
||||
"145567217+Aum08Desai@users.noreply.github.com": "Aum08Desai",
|
||||
"256820943+kshitij-eliza@users.noreply.github.com": "kshitij-eliza",
|
||||
"jiechengwu@pony.ai": "Jason2031",
|
||||
"44278268+shitcoinsherpa@users.noreply.github.com": "shitcoinsherpa",
|
||||
"104278804+Sertug17@users.noreply.github.com": "Sertug17",
|
||||
"112503481+caentzminger@users.noreply.github.com": "caentzminger",
|
||||
@@ -172,34 +169,6 @@ AUTHOR_MAP = {
|
||||
"satelerd@gmail.com": "satelerd",
|
||||
"dan@danlynn.com": "danklynn",
|
||||
"mattmaximo@hotmail.com": "MattMaximo",
|
||||
"149063006+j3ffffff@users.noreply.github.com": "j3ffffff",
|
||||
"A-FdL-Prog@users.noreply.github.com": "A-FdL-Prog",
|
||||
"l0hde@users.noreply.github.com": "l0hde",
|
||||
"difujia@users.noreply.github.com": "difujia",
|
||||
"vominh1919@gmail.com": "vominh1919",
|
||||
"yue.gu2023@gmail.com": "YueLich",
|
||||
"51783311+andyylin@users.noreply.github.com": "andyylin",
|
||||
"me@jakubkrcmar.cz": "jakubkrcmar",
|
||||
"prasadus92@gmail.com": "prasadus92",
|
||||
"michael@make.software": "mssteuer",
|
||||
"der@konsi.org": "konsisumer",
|
||||
"abogale2@gmail.com": "amanuel2",
|
||||
"alexazzjjtt@163.com": "alexzhu0",
|
||||
"pub_forgreatagent@antgroup.com": "AntAISecurityLab",
|
||||
"252620095+briandevans@users.noreply.github.com": "briandevans",
|
||||
"danielrpike9@gmail.com": "Bartok9",
|
||||
"skozyuk@cruxexperts.com": "CruxExperts",
|
||||
"154585401+LeonSGP43@users.noreply.github.com": "LeonSGP43",
|
||||
"mgparkprint@gmail.com": "vlwkaos",
|
||||
"tranquil_flow@protonmail.com": "Tranquil-Flow",
|
||||
"wangshengyang2004@163.com": "Wangshengyang2004",
|
||||
"hasan.ali13381@gmail.com": "H-Ali13381",
|
||||
"xienb@proton.me": "XieNBi",
|
||||
"139681654+maymuneth@users.noreply.github.com": "maymuneth",
|
||||
"zengwei@nightq.cn": "nightq",
|
||||
"1434494126@qq.com": "5park1e",
|
||||
"158153005+5park1e@users.noreply.github.com": "5park1e",
|
||||
"innocarpe@gmail.com": "innocarpe",
|
||||
"numman.ali@gmail.com": "nummanali",
|
||||
"rohithsaimidigudla@gmail.com": "whitehatjr1001",
|
||||
"0xNyk@users.noreply.github.com": "0xNyk",
|
||||
@@ -218,11 +187,6 @@ AUTHOR_MAP = {
|
||||
"bryan@intertwinesys.com": "bryanyoung",
|
||||
"christo.mitov@gmail.com": "christomitov",
|
||||
"hermes@nousresearch.com": "NousResearch",
|
||||
"reginaldasr@gmail.com": "ReginaldasR",
|
||||
"ntconguit@gmail.com": "0xharryriddle",
|
||||
"agent@wildcat.local": "ericnicolaides",
|
||||
"georgex8001@gmail.com": "georgex8001",
|
||||
"stefan@dimagents.ai": "dimitrovi",
|
||||
"hermes@noushq.ai": "benbarclay",
|
||||
"chinmingcock@gmail.com": "ChimingLiu",
|
||||
"openclaw@sparklab.ai": "openclaw",
|
||||
@@ -371,9 +335,6 @@ AUTHOR_MAP = {
|
||||
"brian@bde.io": "briandevans",
|
||||
"hubin_ll@qq.com": "LLQWQ",
|
||||
"memosr_email@gmail.com": "memosr",
|
||||
"jperlow@gmail.com": "perlowja",
|
||||
"tangyuanjc@JCdeAIfenshendeMac-mini.local": "tangyuanjc",
|
||||
"harryplusplus@gmail.com": "harryplusplus",
|
||||
"anthhub@163.com": "anthhub",
|
||||
"shenuu@gmail.com": "shenuu",
|
||||
"xiayh17@gmail.com": "xiayh0107",
|
||||
@@ -477,12 +438,6 @@ AUTHOR_MAP = {
|
||||
"topcheer@me.com": "topcheer",
|
||||
"walli@tencent.com": "walli",
|
||||
"zhuofengwang@tencent.com": "Zhuofeng-Wang",
|
||||
# April 2026 salvage-PR batch (#14920, #14986, #14966)
|
||||
"mrunmayeerane17@gmail.com": "mrunmayee17",
|
||||
"69489633+camaragon@users.noreply.github.com": "camaragon",
|
||||
"shamork@outlook.com": "shamork",
|
||||
# April 2026 Discord Copilot /model salvage (#15030)
|
||||
"cshong2017@outlook.com": "Nicecsh",
|
||||
# no-github-match — keep as display names
|
||||
"clio-agent@sisyphuslabs.ai": "Sisyphus",
|
||||
"marco@rutimka.de": "Marco Rutsch",
|
||||
|
||||
@@ -248,6 +248,7 @@ Type these during an interactive chat session.
|
||||
```
|
||||
/config Show config (CLI)
|
||||
/model [name] Show or change model
|
||||
/provider Show provider info
|
||||
/personality [name] Set personality
|
||||
/reasoning [level] Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
|
||||
/verbose Cycle: off → new → all → verbose
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
---
|
||||
name: spotify
|
||||
description: Control Spotify — play music, search the catalog, manage playlists and library, inspect devices and playback state. Loads when the user asks to play/pause/queue music, search tracks/albums/artists, manage playlists, or check what's playing. Assumes the Hermes Spotify toolset is enabled and `hermes auth spotify` has been run.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
prerequisites:
|
||||
tools: [spotify_playback, spotify_devices, spotify_queue, spotify_search, spotify_playlists, spotify_albums, spotify_library]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [spotify, music, playback, playlists, media]
|
||||
related_skills: [gif-search]
|
||||
---
|
||||
|
||||
# Spotify
|
||||
|
||||
Control the user's Spotify account via the Hermes Spotify toolset (7 tools). Setup guide: https://hermes-agent.nousresearch.com/docs/user-guide/features/spotify
|
||||
|
||||
## When to use this skill
|
||||
|
||||
The user says something like "play X", "pause", "skip", "queue up X", "what's playing", "search for X", "add to my X playlist", "make a playlist", "save this to my library", etc.
|
||||
|
||||
## The 7 tools
|
||||
|
||||
- `spotify_playback` — play, pause, next, previous, seek, set_repeat, set_shuffle, set_volume, get_state, get_currently_playing, recently_played
|
||||
- `spotify_devices` — list, transfer
|
||||
- `spotify_queue` — get, add
|
||||
- `spotify_search` — search the catalog
|
||||
- `spotify_playlists` — list, get, create, add_items, remove_items, update_details
|
||||
- `spotify_albums` — get, tracks
|
||||
- `spotify_library` — list/save/remove with `kind: "tracks"|"albums"`
|
||||
|
||||
Playback-mutating actions require Spotify Premium; search/library/playlist ops work on Free.
|
||||
|
||||
## Canonical patterns (minimize tool calls)
|
||||
|
||||
### "Play <artist/track/album>"
|
||||
One search, then play by URI. Do NOT loop through search results describing them unless the user asked for options.
|
||||
|
||||
```
|
||||
spotify_search({"query": "miles davis kind of blue", "types": ["album"], "limit": 1})
|
||||
→ got album URI spotify:album:1weenld61qoidwYuZ1GESA
|
||||
spotify_playback({"action": "play", "context_uri": "spotify:album:1weenld61qoidwYuZ1GESA"})
|
||||
```
|
||||
|
||||
For "play some <artist>" (no specific song), prefer `types: ["artist"]` and play the artist context URI — Spotify handles smart shuffle. If the user says "the song" or "that track", search `types: ["track"]` and pass `uris: [track_uri]` to play.
|
||||
|
||||
### "What's playing?" / "What am I listening to?"
|
||||
Single call — don't chain get_state after get_currently_playing.
|
||||
|
||||
```
|
||||
spotify_playback({"action": "get_currently_playing"})
|
||||
```
|
||||
|
||||
If it returns 204/empty (`is_playing: false`), tell the user nothing is playing. Don't retry.
|
||||
|
||||
### "Pause" / "Skip" / "Volume 50"
|
||||
Direct action, no preflight inspection needed.
|
||||
|
||||
```
|
||||
spotify_playback({"action": "pause"})
|
||||
spotify_playback({"action": "next"})
|
||||
spotify_playback({"action": "set_volume", "volume_percent": 50})
|
||||
```
|
||||
|
||||
### "Add to my <playlist name> playlist"
|
||||
1. `spotify_playlists list` to find the playlist ID by name
|
||||
2. Get the track URI (from currently playing, or search)
|
||||
3. `spotify_playlists add_items` with the playlist_id and URIs
|
||||
|
||||
```
|
||||
spotify_playlists({"action": "list"})
|
||||
→ found "Late Night Jazz" = 37i9dQZF1DX4wta20PHgwo
|
||||
spotify_playback({"action": "get_currently_playing"})
|
||||
→ current track uri = spotify:track:0DiWol3AO6WpXZgp0goxAV
|
||||
spotify_playlists({"action": "add_items",
|
||||
"playlist_id": "37i9dQZF1DX4wta20PHgwo",
|
||||
"uris": ["spotify:track:0DiWol3AO6WpXZgp0goxAV"]})
|
||||
```
|
||||
|
||||
### "Create a playlist called X and add the last 3 songs I played"
|
||||
```
|
||||
spotify_playback({"action": "recently_played", "limit": 3})
|
||||
spotify_playlists({"action": "create", "name": "Focus 2026"})
|
||||
→ got playlist_id back in response
|
||||
spotify_playlists({"action": "add_items", "playlist_id": <id>, "uris": [<3 uris>]})
|
||||
```
|
||||
|
||||
### "Save / unsave / is this saved?"
|
||||
Use `spotify_library` with the right `kind`.
|
||||
|
||||
```
|
||||
spotify_library({"kind": "tracks", "action": "save", "uris": ["spotify:track:..."]})
|
||||
spotify_library({"kind": "albums", "action": "list", "limit": 50})
|
||||
```
|
||||
|
||||
### "Transfer playback to my <device>"
|
||||
```
|
||||
spotify_devices({"action": "list"})
|
||||
→ pick the device_id by matching name/type
|
||||
spotify_devices({"action": "transfer", "device_id": "<id>", "play": true})
|
||||
```
|
||||
|
||||
## Critical failure modes
|
||||
|
||||
**`403 Forbidden — No active device found`** on any playback action means Spotify isn't running anywhere. Tell the user: "Open Spotify on your phone/desktop/web player first, start any track for a second, then retry." Don't retry the tool call blindly — it will fail the same way. You can call `spotify_devices list` to confirm; an empty list means no active device.
|
||||
|
||||
**`403 Forbidden — Premium required`** means the user is on Free and tried to mutate playback. Don't retry; tell them this action needs Premium. Reads still work (search, playlists, library, get_state).
|
||||
|
||||
**`204 No Content` on `get_currently_playing`** is NOT an error — it means nothing is playing. The tool returns `is_playing: false`. Just report that to the user.
|
||||
|
||||
**`429 Too Many Requests`** = rate limit. Wait and retry once. If it keeps happening, you're looping — stop.
|
||||
|
||||
**`401 Unauthorized` after a retry** — refresh token revoked. Tell the user to run `hermes auth spotify` again.
|
||||
|
||||
## URI and ID formats
|
||||
|
||||
Spotify uses three interchangeable ID formats. The tools accept all three and normalize them:
|
||||
|
||||
- URI: `spotify:track:0DiWol3AO6WpXZgp0goxAV` (preferred)
|
||||
- URL: `https://open.spotify.com/track/0DiWol3AO6WpXZgp0goxAV`
|
||||
- Bare ID: `0DiWol3AO6WpXZgp0goxAV`
|
||||
|
||||
When in doubt, use full URIs. Search results return URIs in the `uri` field — pass those directly.
|
||||
|
||||
Entity types: `track`, `album`, `artist`, `playlist`, `show`, `episode`. Use the right type for the action — `spotify_playback.play` with a `context_uri` expects album/playlist/artist; `uris` expects an array of track URIs.
|
||||
|
||||
## What NOT to do
|
||||
|
||||
- **Don't call `get_state` before every action.** Spotify accepts play/pause/skip without preflight. Only inspect state when the user asked "what's playing" or you need to reason about device/track.
|
||||
- **Don't describe search results unless asked.** If the user said "play X", search, grab the top URI, play it. They'll hear it's wrong if it's wrong.
|
||||
- **Don't retry on `403 Premium required` or `403 No active device`.** Those are permanent until user action.
|
||||
- **Don't use `spotify_search` to find a playlist by name** — that searches the public Spotify catalog. User playlists come from `spotify_playlists list`.
|
||||
- **Don't mix `kind: "tracks"` with album URIs** in `spotify_library` (or vice versa). The tool normalizes IDs but the API endpoint differs.
|
||||
@@ -904,15 +904,9 @@ class TestRegisterSessionMcpServers:
|
||||
]
|
||||
|
||||
with patch("tools.mcp_tool.register_mcp_servers", return_value=["mcp_srv_search"]), \
|
||||
patch("model_tools.get_tool_definitions", return_value=fake_tools) as mock_defs:
|
||||
patch("model_tools.get_tool_definitions", return_value=fake_tools):
|
||||
await agent._register_session_mcp_servers(state, [server])
|
||||
|
||||
mock_defs.assert_called_once_with(
|
||||
enabled_toolsets=["hermes-acp", "mcp-srv"],
|
||||
disabled_toolsets=None,
|
||||
quiet_mode=True,
|
||||
)
|
||||
assert state.agent.enabled_toolsets == ["hermes-acp", "mcp-srv"]
|
||||
assert state.agent.tools == fake_tools
|
||||
assert state.agent.valid_tool_names == {"mcp_srv_search", "terminal"}
|
||||
# _invalidate_system_prompt should have been called
|
||||
|
||||
@@ -138,43 +138,6 @@ class TestListAndCleanup:
|
||||
class TestPersistence:
|
||||
"""Verify that sessions are persisted to SessionDB and can be restored."""
|
||||
|
||||
def test_create_session_includes_registered_mcp_toolsets(self, tmp_path, monkeypatch):
    """create_session hands the agent the ACP toolset plus enabled MCP servers.

    The patched config lists three MCP servers, one of them with
    ``enabled: False``; only the other two are expected to show up, as
    ``mcp-<name>`` entries after ``hermes-acp``.
    """
    agent_kwargs = {}

    def fake_load_config():
        # Fresh dict per call, like the real loader would hand out.
        return {
            "model": {"provider": "openrouter", "default": "test-model"},
            "mcp_servers": {
                "olympus": {"command": "python", "enabled": True},
                "exa": {"url": "https://exa.ai/mcp"},
                "disabled": {"command": "python", "enabled": False},
            },
        }

    def fake_resolve_runtime_provider(requested=None, **kwargs):
        return {
            "provider": "openrouter",
            "api_mode": "chat_completions",
            "base_url": "https://openrouter.example/v1",
            "api_key": "***",
            "command": None,
            "args": [],
        }

    def fake_agent(**kwargs):
        # Record the constructor kwargs so the toolsets can be inspected.
        agent_kwargs.update(kwargs)
        return SimpleNamespace(
            model=kwargs.get("model"),
            enabled_toolsets=kwargs.get("enabled_toolsets"),
        )

    monkeypatch.setattr("hermes_cli.config.load_config", fake_load_config)
    monkeypatch.setattr(
        "hermes_cli.runtime_provider.resolve_runtime_provider",
        fake_resolve_runtime_provider,
    )
    session_db = SessionDB(tmp_path / "state.db")

    with patch("run_agent.AIAgent", side_effect=fake_agent):
        session_manager = SessionManager(db=session_db)
        session_manager.create_session(cwd="/work")

    expected_toolsets = ["hermes-acp", "mcp-olympus", "mcp-exa"]
    assert agent_kwargs["enabled_toolsets"] == expected_toolsets
|
||||
|
||||
def test_create_session_writes_to_db(self, manager):
|
||||
state = manager.create_session(cwd="/project")
|
||||
db = manager._get_db()
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
"""Tests for Bug #12905 fixes in agent/anthropic_adapter.py — macOS Keychain support."""
|
||||
|
||||
import json
|
||||
import platform
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.anthropic_adapter import (
|
||||
_read_claude_code_credentials_from_keychain,
|
||||
read_claude_code_credentials,
|
||||
)
|
||||
|
||||
|
||||
class TestReadClaudeCodeCredentialsFromKeychain:
    """Bug 4: macOS Keychain support for Claude Code >=2.1.114."""

    @staticmethod
    def _read_on(system_name):
        """Call the keychain reader with only ``platform.system`` patched."""
        with patch("agent.anthropic_adapter.platform.system", return_value=system_name):
            return _read_claude_code_credentials_from_keychain()

    @staticmethod
    def _read_on_darwin(**run_patch_kwargs):
        """Call the reader as if on Darwin, with ``subprocess.run`` mocked.

        ``run_patch_kwargs`` configure the ``subprocess.run`` mock
        (``return_value=...`` or ``side_effect=...``).
        """
        with patch("agent.anthropic_adapter.platform.system", return_value="Darwin"), \
             patch("agent.anthropic_adapter.subprocess.run", **run_patch_kwargs):
            return _read_claude_code_credentials_from_keychain()

    @staticmethod
    def _completed(returncode, stdout):
        """Build a stand-in for the object ``subprocess.run`` returns."""
        return MagicMock(returncode=returncode, stdout=stdout, stderr="")

    def test_returns_none_on_linux(self):
        """Keychain reading is Darwin-only; must return None on other platforms."""
        assert self._read_on("Linux") is None

    def test_returns_none_on_windows(self):
        assert self._read_on("Windows") is None

    def test_returns_none_when_security_command_not_found(self):
        """OSError from missing security binary must be handled gracefully."""
        result = self._read_on_darwin(side_effect=OSError("security not found"))
        assert result is None

    def test_returns_none_on_nonzero_exit_code(self):
        """security returns non-zero when the Keychain entry doesn't exist."""
        result = self._read_on_darwin(return_value=self._completed(1, ""))
        assert result is None

    def test_returns_none_for_empty_stdout(self):
        result = self._read_on_darwin(return_value=self._completed(0, ""))
        assert result is None

    def test_returns_none_for_non_json_payload(self):
        result = self._read_on_darwin(return_value=self._completed(0, "not valid json"))
        assert result is None

    def test_returns_none_when_password_field_is_missing_claude_ai_oauth(self):
        payload = json.dumps({"someOtherService": {"accessToken": "tok"}})
        result = self._read_on_darwin(return_value=self._completed(0, payload))
        assert result is None

    def test_returns_none_when_access_token_is_empty(self):
        payload = json.dumps({"claudeAiOauth": {"accessToken": "", "refreshToken": "x"}})
        result = self._read_on_darwin(return_value=self._completed(0, payload))
        assert result is None

    def test_parses_valid_keychain_entry(self):
        payload = json.dumps({
            "claudeAiOauth": {
                "accessToken": "kc-access-token-abc",
                "refreshToken": "kc-refresh-token-xyz",
                "expiresAt": 9999999999999,
            }
        })
        creds = self._read_on_darwin(return_value=self._completed(0, payload))
        assert creds is not None
        assert creds["accessToken"] == "kc-access-token-abc"
        assert creds["refreshToken"] == "kc-refresh-token-xyz"
        assert creds["expiresAt"] == 9999999999999
        assert creds["source"] == "macos_keychain"
|
||||
|
||||
|
||||
class TestReadClaudeCodeCredentialsPriority:
    """Bug 4: Keychain must be checked before the JSON file."""

    @staticmethod
    def _oauth_payload(access_token, refresh_token):
        """Serialize a ``claudeAiOauth`` credential blob with a far-future expiry."""
        return json.dumps({
            "claudeAiOauth": {
                "accessToken": access_token,
                "refreshToken": refresh_token,
                "expiresAt": 9999999999999,
            }
        })

    @classmethod
    def _write_json_credentials(cls, home, access_token):
        """Create ``<home>/.claude/.credentials.json`` holding ``access_token``."""
        cred_path = home / ".claude" / ".credentials.json"
        cred_path.parent.mkdir(parents=True)
        cred_path.write_text(cls._oauth_payload(access_token, "json-refresh"))

    @staticmethod
    def _read_with_keychain(returncode, stdout):
        """Call ``read_claude_code_credentials`` on a faked Darwin host.

        ``subprocess.run`` is mocked to return the given exit code/stdout.
        """
        completed = MagicMock(returncode=returncode, stdout=stdout, stderr="")
        with patch("agent.anthropic_adapter.platform.system", return_value="Darwin"), \
             patch("agent.anthropic_adapter.subprocess.run", return_value=completed):
            return read_claude_code_credentials()

    def test_keychain_takes_priority_over_json_file(self, tmp_path, monkeypatch):
        """When both Keychain and JSON file have credentials, Keychain wins."""
        # JSON file carries the "older" token
        self._write_json_credentials(tmp_path, "json-token")
        monkeypatch.setattr("agent.anthropic_adapter.Path.home", lambda: tmp_path)

        # Keychain answers with the "newer" token
        keychain_stdout = self._oauth_payload("keychain-token", "keychain-refresh")
        creds = self._read_with_keychain(0, keychain_stdout)

        # Keychain token should be returned, not JSON file token
        assert creds is not None
        assert creds["accessToken"] == "keychain-token"
        assert creds["source"] == "macos_keychain"

    def test_falls_back_to_json_when_keychain_returns_none(self, tmp_path, monkeypatch):
        """When Keychain has no entry, JSON file is used as fallback."""
        self._write_json_credentials(tmp_path, "json-fallback-token")
        monkeypatch.setattr("agent.anthropic_adapter.Path.home", lambda: tmp_path)

        # Exit code 1 simulates "Keychain entry not found"
        creds = self._read_with_keychain(1, "")

        assert creds is not None
        assert creds["accessToken"] == "json-fallback-token"
        assert creds["source"] == "claude_code_credentials_file"

    def test_returns_none_when_neither_keychain_nor_json_has_creds(self, tmp_path, monkeypatch):
        """No credentials anywhere — must return None cleanly."""
        monkeypatch.setattr("agent.anthropic_adapter.Path.home", lambda: tmp_path)

        creds = self._read_with_keychain(1, "")

        assert creds is None
|
||||
@@ -19,7 +19,6 @@ from agent.auxiliary_client import (
|
||||
_read_codex_access_token,
|
||||
_get_provider_chain,
|
||||
_is_payment_error,
|
||||
_normalize_aux_provider,
|
||||
_try_payment_fallback,
|
||||
_resolve_auto,
|
||||
)
|
||||
@@ -55,17 +54,6 @@ def codex_auth_dir(tmp_path, monkeypatch):
|
||||
return codex_dir
|
||||
|
||||
|
||||
class TestNormalizeAuxProvider:
    """Alias mapping checks for ``_normalize_aux_provider``."""

    def test_maps_github_copilot_aliases(self):
        """GitHub-flavoured names all normalize to ``copilot``."""
        assert _normalize_aux_provider("github") == "copilot"
        assert _normalize_aux_provider("github-copilot") == "copilot"
        assert _normalize_aux_provider("github-models") == "copilot"

    def test_maps_github_copilot_acp_aliases(self):
        """ACP-flavoured names normalize to ``copilot-acp``."""
        assert _normalize_aux_provider("github-copilot-acp") == "copilot-acp"
        assert _normalize_aux_provider("copilot-acp-agent") == "copilot-acp"
|
||||
|
||||
|
||||
class TestReadCodexAccessToken:
|
||||
def test_valid_auth_store(self, tmp_path, monkeypatch):
|
||||
hermes_home = tmp_path / "hermes"
|
||||
@@ -1215,201 +1203,3 @@ class TestAnthropicCompatImageConversion:
|
||||
}]
|
||||
result = _convert_openai_images_to_anthropic(messages)
|
||||
assert result[0]["content"][0]["source"]["media_type"] == "image/jpeg"
|
||||
|
||||
|
||||
class _AuxAuth401(Exception):
|
||||
status_code = 401
|
||||
|
||||
def __init__(self, message="Provided authentication token is expired"):
|
||||
super().__init__(message)
|
||||
|
||||
|
||||
class _DummyResponse:
|
||||
def __init__(self, text="ok"):
|
||||
self.choices = [MagicMock(message=MagicMock(content=text))]
|
||||
|
||||
|
||||
class _FailingThenSuccessCompletions:
    """Sync completions stub: first ``create`` call raises 401, later calls succeed."""

    def __init__(self):
        # Number of create() invocations seen so far.
        self.calls = 0

    def create(self, **kwargs):
        """Raise ``_AuxAuth401`` on the first call, then return a "sync-ok" response."""
        self.calls += 1
        if self.calls == 1:
            raise _AuxAuth401()
        return _DummyResponse("sync-ok")
|
||||
|
||||
|
||||
class _AsyncFailingThenSuccessCompletions:
    """Async completions stub: first ``create`` call raises 401, later calls succeed."""

    def __init__(self):
        # Number of create() invocations seen so far.
        self.calls = 0

    async def create(self, **kwargs):
        """Raise ``_AuxAuth401`` on the first call, then return an "async-ok" response."""
        self.calls += 1
        if self.calls == 1:
            raise _AuxAuth401()
        return _DummyResponse("async-ok")
|
||||
|
||||
|
||||
class TestAuxiliaryAuthRefreshRetry:
    """401-triggered credential refresh and retry for ``call_llm`` / ``async_call_llm``.

    Each call-path test hands the code under test a client whose first
    request raises ``_AuxAuth401``, patches ``_refresh_provider_credentials``
    to report success, and checks that the answer comes from the second
    ("fresh") client.
    """

    def test_call_llm_refreshes_codex_on_401_for_vision(self):
        """Sync vision call: refresh is invoked once and the fresh client answers."""
        failing_client = MagicMock()
        failing_client.base_url = "https://chatgpt.com/backend-api/codex"
        failing_client.chat.completions = _FailingThenSuccessCompletions()

        fresh_client = MagicMock()
        fresh_client.base_url = "https://chatgpt.com/backend-api/codex"
        fresh_client.chat.completions.create.return_value = _DummyResponse("fresh-sync")

        with (
            patch(
                "agent.auxiliary_client.resolve_vision_provider_client",
                # First resolution yields the failing client, second the fresh one.
                side_effect=[
                    ("openai-codex", failing_client, "gpt-5.2-codex"),
                    ("openai-codex", fresh_client, "gpt-5.2-codex"),
                ],
            ),
            patch("agent.auxiliary_client._refresh_provider_credentials", return_value=True) as mock_refresh,
        ):
            resp = call_llm(
                task="vision",
                provider="openai-codex",
                model="gpt-5.2-codex",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert resp.choices[0].message.content == "fresh-sync"
        mock_refresh.assert_called_once_with("openai-codex")

    def test_call_llm_refreshes_codex_on_401_for_non_vision(self):
        """Sync non-vision Codex call: stale client fails once, fresh client answers."""
        stale_client = MagicMock()
        stale_client.base_url = "https://chatgpt.com/backend-api/codex"
        stale_client.chat.completions.create.side_effect = _AuxAuth401("stale codex token")

        fresh_client = MagicMock()
        fresh_client.base_url = "https://chatgpt.com/backend-api/codex"
        fresh_client.chat.completions.create.return_value = _DummyResponse("fresh-non-vision")

        with (
            patch(
                "agent.auxiliary_client._resolve_task_provider_model",
                return_value=("openai-codex", "gpt-5.2-codex", None, None, None),
            ),
            patch(
                "agent.auxiliary_client._get_cached_client",
                side_effect=[(stale_client, "gpt-5.2-codex"), (fresh_client, "gpt-5.2-codex")],
            ),
            patch("agent.auxiliary_client._refresh_provider_credentials", return_value=True) as mock_refresh,
        ):
            resp = call_llm(
                task="compression",
                provider="openai-codex",
                model="gpt-5.2-codex",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert resp.choices[0].message.content == "fresh-non-vision"
        mock_refresh.assert_called_once_with("openai-codex")
        assert stale_client.chat.completions.create.call_count == 1
        assert fresh_client.chat.completions.create.call_count == 1

    def test_call_llm_refreshes_anthropic_on_401_for_non_vision(self):
        """Sync non-vision Anthropic call: stale client fails once, fresh client answers."""
        stale_client = MagicMock()
        stale_client.base_url = "https://api.anthropic.com"
        stale_client.chat.completions.create.side_effect = _AuxAuth401("anthropic token expired")

        fresh_client = MagicMock()
        fresh_client.base_url = "https://api.anthropic.com"
        fresh_client.chat.completions.create.return_value = _DummyResponse("fresh-anthropic")

        with (
            patch(
                "agent.auxiliary_client._resolve_task_provider_model",
                return_value=("anthropic", "claude-haiku-4-5-20251001", None, None, None),
            ),
            patch(
                "agent.auxiliary_client._get_cached_client",
                side_effect=[
                    (stale_client, "claude-haiku-4-5-20251001"),
                    (fresh_client, "claude-haiku-4-5-20251001"),
                ],
            ),
            patch("agent.auxiliary_client._refresh_provider_credentials", return_value=True) as mock_refresh,
        ):
            resp = call_llm(
                task="compression",
                provider="anthropic",
                model="claude-haiku-4-5-20251001",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert resp.choices[0].message.content == "fresh-anthropic"
        mock_refresh.assert_called_once_with("anthropic")
        assert stale_client.chat.completions.create.call_count == 1
        assert fresh_client.chat.completions.create.call_count == 1

    @pytest.mark.asyncio
    async def test_async_call_llm_refreshes_codex_on_401_for_vision(self):
        """Async vision call: refresh is invoked once and the fresh client answers."""
        failing_client = MagicMock()
        failing_client.base_url = "https://chatgpt.com/backend-api/codex"
        failing_client.chat.completions = _AsyncFailingThenSuccessCompletions()

        fresh_client = MagicMock()
        fresh_client.base_url = "https://chatgpt.com/backend-api/codex"
        fresh_client.chat.completions.create = AsyncMock(return_value=_DummyResponse("fresh-async"))

        with (
            patch(
                "agent.auxiliary_client.resolve_vision_provider_client",
                # First resolution yields the failing client, second the fresh one.
                side_effect=[
                    ("openai-codex", failing_client, "gpt-5.2-codex"),
                    ("openai-codex", fresh_client, "gpt-5.2-codex"),
                ],
            ),
            patch("agent.auxiliary_client._refresh_provider_credentials", return_value=True) as mock_refresh,
        ):
            resp = await async_call_llm(
                task="vision",
                provider="openai-codex",
                model="gpt-5.2-codex",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert resp.choices[0].message.content == "fresh-async"
        mock_refresh.assert_called_once_with("openai-codex")

    def test_refresh_provider_credentials_force_refreshes_anthropic_oauth_and_evicts_cache(self, monkeypatch):
        """Refreshing ``anthropic`` calls the OAuth refresh, writes new creds, closes the cached client."""
        stale_client = MagicMock()
        cache_key = ("anthropic", False, None, None, None)

        # Blank out every env-provided Anthropic credential.
        monkeypatch.setenv("ANTHROPIC_TOKEN", "")
        monkeypatch.setenv("CLAUDE_CODE_OAUTH_TOKEN", "")
        monkeypatch.setenv("ANTHROPIC_API_KEY", "")

        with (
            patch(
                "agent.auxiliary_client._client_cache",
                {cache_key: (stale_client, "claude-haiku-4-5-20251001", None)},
            ),
            patch("agent.anthropic_adapter.read_claude_code_credentials", return_value={
                "accessToken": "expired-token",
                "refreshToken": "refresh-token",
                "expiresAt": 0,
            }),
            patch("agent.anthropic_adapter.refresh_anthropic_oauth_pure", return_value={
                "access_token": "fresh-token",
                "refresh_token": "refresh-token-2",
                "expires_at_ms": 9999999999999,
            }) as mock_refresh_oauth,
            patch("agent.anthropic_adapter._write_claude_code_credentials") as mock_write,
        ):
            from agent.auxiliary_client import _refresh_provider_credentials

            assert _refresh_provider_credentials("anthropic") is True

        mock_refresh_oauth.assert_called_once_with("refresh-token", use_json=False)
        mock_write.assert_called_once_with("fresh-token", "refresh-token-2", 9999999999999)
        stale_client.close.assert_called_once()

    @pytest.mark.asyncio
    async def test_async_call_llm_refreshes_anthropic_on_401_for_non_vision(self):
        """Async non-vision Anthropic call: stale client fails once, fresh client answers."""
        stale_client = MagicMock()
        stale_client.base_url = "https://api.anthropic.com"
        stale_client.chat.completions.create = AsyncMock(side_effect=_AuxAuth401("anthropic token expired"))

        fresh_client = MagicMock()
        fresh_client.base_url = "https://api.anthropic.com"
        fresh_client.chat.completions.create = AsyncMock(return_value=_DummyResponse("fresh-async-anthropic"))

        with (
            patch(
                "agent.auxiliary_client._resolve_task_provider_model",
                return_value=("anthropic", "claude-haiku-4-5-20251001", None, None, None),
            ),
            patch(
                "agent.auxiliary_client._get_cached_client",
                side_effect=[
                    (stale_client, "claude-haiku-4-5-20251001"),
                    (fresh_client, "claude-haiku-4-5-20251001"),
                ],
            ),
            patch("agent.auxiliary_client._refresh_provider_credentials", return_value=True) as mock_refresh,
        ):
            resp = await async_call_llm(
                task="compression",
                provider="anthropic",
                model="claude-haiku-4-5-20251001",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert resp.choices[0].message.content == "fresh-async-anthropic"
        mock_refresh.assert_called_once_with("anthropic")
        assert stale_client.chat.completions.create.await_count == 1
        assert fresh_client.chat.completions.create.await_count == 1
|
||||
|
||||
@@ -100,26 +100,6 @@ class TestResolveProviderClientMainAlias:
|
||||
assert client is not None
|
||||
assert "beans.local" in str(client.base_url)
|
||||
|
||||
def test_main_resolves_github_copilot_alias(self, tmp_path):
    """``main`` resolves to a client when the configured provider is ``github-copilot``."""
    _write_config(tmp_path, {
        "model": {"default": "gpt-5.4", "provider": "github-copilot"},
    })
    with (
        patch("hermes_cli.auth.resolve_api_key_provider_credentials", return_value={
            "api_key": "ghu_test_token",
            "base_url": "https://api.githubcopilot.com",
        }),
        patch("agent.auxiliary_client.OpenAI") as mock_openai,
    ):
        mock_openai.return_value = MagicMock()
        from agent.auxiliary_client import resolve_provider_client

        client, model = resolve_provider_client("main", "gpt-5.4")

    assert client is not None
    assert model == "gpt-5.4"
    assert mock_openai.called
|
||||
|
||||
|
||||
class TestResolveProviderClientNamedCustom:
|
||||
"""resolve_provider_client should resolve named custom providers directly."""
|
||||
@@ -272,158 +252,3 @@ class TestVisionPathApiMode:
|
||||
mock_gcc.assert_called_once()
|
||||
_, kwargs = mock_gcc.call_args
|
||||
assert kwargs.get("api_mode") == "chat_completions"
|
||||
|
||||
|
||||
class TestProvidersDictApiModeAnthropicMessages:
    """Regression guard for #15033.

    Named providers declared under the ``providers:`` dict with
    ``api_mode: anthropic_messages`` must route auxiliary calls through
    the Anthropic Messages API (via AnthropicAuxiliaryClient), not
    through an OpenAI chat-completions client.

    The bug had two halves: the providers-dict branch of
    ``_get_named_custom_provider`` dropped the ``api_mode`` field, and
    ``resolve_provider_client``'s named-custom branch never read it.
    """

    def test_providers_dict_propagates_api_mode(self, tmp_path, monkeypatch):
        """A valid ``api_mode`` survives lookup, alongside base_url and the env-resolved key."""
        monkeypatch.setenv("MYRELAY_API_KEY", "sk-test")
        _write_config(tmp_path, {
            "providers": {
                "myrelay": {
                    "name": "myrelay",
                    "base_url": "https://example-relay.test/anthropic",
                    "key_env": "MYRELAY_API_KEY",
                    "api_mode": "anthropic_messages",
                    "default_model": "claude-opus-4-7",
                },
            },
        })
        from hermes_cli.runtime_provider import _get_named_custom_provider
        entry = _get_named_custom_provider("myrelay")
        assert entry is not None
        assert entry.get("api_mode") == "anthropic_messages"
        assert entry.get("base_url") == "https://example-relay.test/anthropic"
        assert entry.get("api_key") == "sk-test"

    def test_providers_dict_invalid_api_mode_is_dropped(self, tmp_path):
        """An unrecognised ``api_mode`` value is omitted from the resolved entry."""
        _write_config(tmp_path, {
            "providers": {
                "weird": {
                    "name": "weird",
                    "base_url": "https://example.test",
                    "api_mode": "bogus_nonsense",
                    "default_model": "x",
                },
            },
        })
        from hermes_cli.runtime_provider import _get_named_custom_provider
        entry = _get_named_custom_provider("weird")
        assert entry is not None
        assert "api_mode" not in entry

    def test_providers_dict_without_api_mode_is_unchanged(self, tmp_path):
        """A provider that declares no ``api_mode`` resolves without one."""
        _write_config(tmp_path, {
            "providers": {
                "localchat": {
                    "name": "localchat",
                    "base_url": "http://127.0.0.1:1234/v1",
                    "api_key": "local-key",
                    "default_model": "llama-3",
                },
            },
        })
        from hermes_cli.runtime_provider import _get_named_custom_provider
        entry = _get_named_custom_provider("localchat")
        assert entry is not None
        assert "api_mode" not in entry

    def test_resolve_provider_client_returns_anthropic_client(self, tmp_path, monkeypatch):
        """Named custom provider with api_mode=anthropic_messages must
        route through AnthropicAuxiliaryClient."""
        monkeypatch.setenv("MYRELAY_API_KEY", "sk-test")
        _write_config(tmp_path, {
            "providers": {
                "myrelay": {
                    "name": "myrelay",
                    "base_url": "https://example-relay.test/anthropic",
                    "key_env": "MYRELAY_API_KEY",
                    "api_mode": "anthropic_messages",
                    "default_model": "claude-opus-4-7",
                },
            },
        })
        from agent.auxiliary_client import (
            resolve_provider_client,
            AnthropicAuxiliaryClient,
            AsyncAnthropicAuxiliaryClient,
        )
        sync_client, sync_model = resolve_provider_client("myrelay", async_mode=False)
        assert isinstance(sync_client, AnthropicAuxiliaryClient), (
            f"expected AnthropicAuxiliaryClient, got {type(sync_client).__name__}"
        )
        assert sync_model == "claude-opus-4-7"

        async_client, async_model = resolve_provider_client("myrelay", async_mode=True)
        assert isinstance(async_client, AsyncAnthropicAuxiliaryClient), (
            f"expected AsyncAnthropicAuxiliaryClient, got {type(async_client).__name__}"
        )
        assert async_model == "claude-opus-4-7"

    def test_aux_task_override_routes_named_provider_to_anthropic(self, tmp_path, monkeypatch):
        """The full chain: auxiliary.<task>.provider: myrelay with
        api_mode anthropic_messages must produce an Anthropic client."""
        monkeypatch.setenv("MYRELAY_API_KEY", "sk-test")
        _write_config(tmp_path, {
            "providers": {
                "myrelay": {
                    "name": "myrelay",
                    "base_url": "https://example-relay.test/anthropic",
                    "key_env": "MYRELAY_API_KEY",
                    "api_mode": "anthropic_messages",
                    "default_model": "claude-opus-4-7",
                },
            },
            "auxiliary": {
                "flush_memories": {
                    "provider": "myrelay",
                    "model": "claude-sonnet-4.6",
                },
            },
            "model": {"provider": "openrouter", "default": "anthropic/claude-sonnet-4.6"},
        })
        from agent.auxiliary_client import (
            get_async_text_auxiliary_client,
            get_text_auxiliary_client,
            AnthropicAuxiliaryClient,
            AsyncAnthropicAuxiliaryClient,
        )
        async_client, async_model = get_async_text_auxiliary_client("flush_memories")
        assert isinstance(async_client, AsyncAnthropicAuxiliaryClient)
        assert async_model == "claude-sonnet-4.6"

        sync_client, sync_model = get_text_auxiliary_client("flush_memories")
        assert isinstance(sync_client, AnthropicAuxiliaryClient)
        assert sync_model == "claude-sonnet-4.6"

    def test_provider_without_api_mode_still_uses_openai(self, tmp_path):
        """Named providers that don't declare api_mode should still go
        through the plain OpenAI-wire path (no regression)."""
        _write_config(tmp_path, {
            "providers": {
                "localchat": {
                    "name": "localchat",
                    "base_url": "http://127.0.0.1:1234/v1",
                    "api_key": "local-key",
                    "default_model": "llama-3",
                },
            },
        })
        from agent.auxiliary_client import resolve_provider_client
        from openai import OpenAI, AsyncOpenAI
        sync_client, _ = resolve_provider_client("localchat", async_mode=False)
        # sync returns the raw OpenAI client
        assert isinstance(sync_client, OpenAI)
        async_client, _ = resolve_provider_client("localchat", async_mode=True)
        assert isinstance(async_client, AsyncOpenAI)
|
||||
|
||||
@@ -144,60 +144,3 @@ class CopilotACPClientSafetyTests(unittest.TestCase):
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's unittest cases when it is executed as a script.
    unittest.main()
|
||||
|
||||
|
||||
# ── HOME env propagation tests (from PR #11285) ─────────────────────
|
||||
|
||||
from unittest.mock import patch as _patch
|
||||
import pytest
|
||||
|
||||
|
||||
def _make_home_client(tmp_path):
    """Build a ``CopilotACPClient`` whose working directory is *tmp_path*."""
    return CopilotACPClient(
        api_key="copilot-acp",
        base_url="acp://copilot",
        acp_command="copilot",
        acp_args=["--acp", "--stdio"],
        acp_cwd=str(tmp_path),
    )
|
||||
|
||||
|
||||
def _fake_popen_capture(captured):
|
||||
def _fake(cmd, **kwargs):
|
||||
captured["cmd"] = cmd
|
||||
captured["kwargs"] = kwargs
|
||||
raise FileNotFoundError("copilot not found")
|
||||
return _fake
|
||||
|
||||
|
||||
def test_run_prompt_prefers_profile_home_when_available(monkeypatch, tmp_path):
    """With HOME unset and ``$HERMES_HOME/home`` present, the child env HOME is that directory."""
    hermes_home = tmp_path / "hermes"
    profile_home = hermes_home / "home"
    profile_home.mkdir(parents=True)

    monkeypatch.delenv("HOME", raising=False)
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    captured = {}
    client = _make_home_client(tmp_path)

    # The fake Popen records its kwargs and raises FileNotFoundError.
    with _patch("agent.copilot_acp_client.subprocess.Popen", side_effect=_fake_popen_capture(captured)):
        with pytest.raises(RuntimeError, match="Could not start Copilot ACP command"):
            client._run_prompt("hello", timeout_seconds=1)

    assert captured["kwargs"]["env"]["HOME"] == str(profile_home)
|
||||
|
||||
|
||||
def test_run_prompt_passes_home_when_parent_env_is_clean(monkeypatch, tmp_path):
    """With HOME and HERMES_HOME both unset, the child env still gets a non-empty HOME."""
    monkeypatch.delenv("HOME", raising=False)
    monkeypatch.delenv("HERMES_HOME", raising=False)

    captured = {}
    client = _make_home_client(tmp_path)

    # The fake Popen records its kwargs and raises FileNotFoundError.
    with _patch("agent.copilot_acp_client.subprocess.Popen", side_effect=_fake_popen_capture(captured)):
        with pytest.raises(RuntimeError, match="Could not start Copilot ACP command"):
            client._run_prompt("hello", timeout_seconds=1)

    assert "env" in captured["kwargs"]
    assert captured["kwargs"]["env"]["HOME"]
|
||||
|
||||
@@ -1102,271 +1102,3 @@ def test_load_pool_does_not_seed_qwen_oauth_when_no_token(tmp_path, monkeypatch)
|
||||
|
||||
assert not pool.has_credentials()
|
||||
assert pool.entries() == []
|
||||
|
||||
|
||||
def test_nous_seed_from_singletons_preserves_obtained_at_timestamps(tmp_path, monkeypatch):
    """Regression test for #15099 secondary issue.

    When ``_seed_from_singletons`` materialises a device_code pool entry from
    the ``providers.nous`` singleton, it must carry the mint/refresh
    timestamps (``obtained_at``, ``agent_key_obtained_at``, ``expires_in``,
    etc.) into the pool entry. Without them, freshness-sensitive consumers
    (self-heal hooks, pool pruning by age) treat just-minted credentials as
    older than they actually are and evict them.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "providers": {
                "nous": {
                    "access_token": "at_XXXXXXXX",
                    "refresh_token": "rt_YYYYYYYY",
                    "client_id": "hermes-cli",
                    "portal_base_url": "https://portal.nousresearch.com",
                    "inference_base_url": "https://inference.nousresearch.com/v1",
                    "token_type": "Bearer",
                    "scope": "openid profile",
                    "obtained_at": "2026-04-24T10:00:00+00:00",
                    "expires_at": "2026-04-24T11:00:00+00:00",
                    "expires_in": 3600,
                    "agent_key": "sk-nous-AAAA",
                    "agent_key_id": "ak_123",
                    "agent_key_expires_at": "2026-04-25T10:00:00+00:00",
                    "agent_key_expires_in": 86400,
                    "agent_key_reused": False,
                    "agent_key_obtained_at": "2026-04-24T10:00:05+00:00",
                    "tls": {"insecure": False, "ca_bundle": None},
                },
            },
        },
    )

    from agent.credential_pool import load_pool

    pool = load_pool("nous")
    entries = pool.entries()

    device_entries = [e for e in entries if e.source == "device_code"]
    assert len(device_entries) == 1, f"expected single device_code entry; got {len(device_entries)}"
    e = device_entries[0]

    # Direct dataclass fields — must survive the singleton → pool copy.
    assert e.access_token == "at_XXXXXXXX"
    assert e.refresh_token == "rt_YYYYYYYY"
    assert e.expires_at == "2026-04-24T11:00:00+00:00"
    assert e.agent_key == "sk-nous-AAAA"
    assert e.agent_key_expires_at == "2026-04-25T10:00:00+00:00"

    # Extra fields — this is what regressed. These must be carried through
    # via ``extra`` dict or __getattr__, NOT silently dropped.
    assert e.obtained_at == "2026-04-24T10:00:00+00:00", (
        f"obtained_at was dropped during seed; got {e.obtained_at!r}. This breaks "
        f"downstream pool-freshness consumers (#15099)."
    )
    assert e.agent_key_obtained_at == "2026-04-24T10:00:05+00:00"
    assert e.expires_in == 3600
    assert e.agent_key_id == "ak_123"
    assert e.agent_key_expires_in == 86400
    assert e.agent_key_reused is False
|
||||
|
||||
|
||||
class TestLeastUsedStrategy:
    """Regression: least_used strategy must increment request_count on select."""

    def test_request_count_increments(self):
        """Each select() call should increment the chosen entry's request_count."""
        from unittest.mock import patch as _patch
        from agent.credential_pool import CredentialPool, PooledCredential, STRATEGY_LEAST_USED

        entries = [
            PooledCredential(provider="test", id="a", label="a", auth_type="api_key",
                             source="a", access_token="tok-a", priority=0, request_count=0),
            PooledCredential(provider="test", id="b", label="b", auth_type="api_key",
                             source="b", access_token="tok-b", priority=1, request_count=0),
        ]
        # Force the least_used strategy while the pool is constructed.
        with _patch("agent.credential_pool.get_pool_strategy", return_value=STRATEGY_LEAST_USED):
            pool = CredentialPool("test", entries)

        # First select should pick entry with lowest count (both 0 → first)
        e1 = pool.select()
        assert e1 is not None
        count_after_first = e1.request_count
        assert count_after_first == 1, f"Expected 1 after first select, got {count_after_first}"

        # Second select should pick the OTHER entry (now has lower count)
        e2 = pool.select()
        assert e2 is not None
        assert e2.id != e1.id or e2.request_count == 2, (
            "least_used should alternate or increment"
        )
|
||||
|
||||
|
||||
# ── PR #10160 salvage: Nous OAuth cross-process sync tests ─────────────────
|
||||
|
||||
def test_sync_nous_entry_from_auth_store_adopts_newer_tokens(tmp_path, monkeypatch):
    """When auth.json has a newer refresh token, the pool entry should adopt it."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {
                "nous": {
                    "portal_base_url": "https://portal.example.com",
                    "inference_base_url": "https://inference.example.com/v1",
                    "client_id": "hermes-cli",
                    "token_type": "Bearer",
                    "scope": "inference:mint_agent_key",
                    "access_token": "access-OLD",
                    "refresh_token": "refresh-OLD",
                    "expires_at": "2026-03-24T12:00:00+00:00",
                    "agent_key": "agent-key-OLD",
                    "agent_key_expires_at": "2026-03-24T13:30:00+00:00",
                }
            },
        },
    )

    from agent.credential_pool import load_pool

    pool = load_pool("nous")
    entry = pool.select()
    assert entry is not None
    assert entry.refresh_token == "refresh-OLD"

    # Simulate another process refreshing the token in auth.json
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {
                "nous": {
                    "portal_base_url": "https://portal.example.com",
                    "inference_base_url": "https://inference.example.com/v1",
                    "client_id": "hermes-cli",
                    "token_type": "Bearer",
                    "scope": "inference:mint_agent_key",
                    "access_token": "access-NEW",
                    "refresh_token": "refresh-NEW",
                    "expires_at": "2026-03-24T12:30:00+00:00",
                    "agent_key": "agent-key-NEW",
                    "agent_key_expires_at": "2026-03-24T14:00:00+00:00",
                }
            },
        },
    )

    # A changed store must yield a new entry object carrying the new tokens.
    synced = pool._sync_nous_entry_from_auth_store(entry)
    assert synced is not entry
    assert synced.access_token == "access-NEW"
    assert synced.refresh_token == "refresh-NEW"
    assert synced.agent_key == "agent-key-NEW"
    assert synced.agent_key_expires_at == "2026-03-24T14:00:00+00:00"
|
||||
|
||||
def test_sync_nous_entry_noop_when_tokens_match(tmp_path, monkeypatch):
    """When auth.json has the same refresh token, sync should be a no-op."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {
                "nous": {
                    "portal_base_url": "https://portal.example.com",
                    "inference_base_url": "https://inference.example.com/v1",
                    "client_id": "hermes-cli",
                    "token_type": "Bearer",
                    "scope": "inference:mint_agent_key",
                    "access_token": "access-token",
                    "refresh_token": "refresh-token",
                    "expires_at": "2026-03-24T12:00:00+00:00",
                    "agent_key": "agent-key",
                    "agent_key_expires_at": "2026-03-24T13:30:00+00:00",
                }
            },
        },
    )

    from agent.credential_pool import load_pool

    pool = load_pool("nous")
    entry = pool.select()
    assert entry is not None

    # The store is unchanged, so the very same entry object must come back.
    synced = pool._sync_nous_entry_from_auth_store(entry)
    assert synced is entry
|
||||
|
||||
def test_nous_exhausted_entry_recovers_via_auth_store_sync(tmp_path, monkeypatch):
    """An exhausted Nous entry should recover when auth.json has newer tokens."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    from agent.credential_pool import load_pool, STATUS_EXHAUSTED
    from dataclasses import replace as dc_replace

    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {
                "nous": {
                    "portal_base_url": "https://portal.example.com",
                    "inference_base_url": "https://inference.example.com/v1",
                    "client_id": "hermes-cli",
                    "token_type": "Bearer",
                    "scope": "inference:mint_agent_key",
                    "access_token": "access-OLD",
                    "refresh_token": "refresh-OLD",
                    "expires_at": "2026-03-24T12:00:00+00:00",
                    "agent_key": "agent-key",
                    "agent_key_expires_at": "2026-03-24T13:30:00+00:00",
                }
            },
        },
    )

    pool = load_pool("nous")
    entry = pool.select()
    assert entry is not None

    # Mark entry as exhausted (simulating a failed refresh)
    exhausted = dc_replace(
        entry,
        last_status=STATUS_EXHAUSTED,
        last_status_at=time.time(),
        last_error_code=401,
    )
    pool._replace_entry(entry, exhausted)
    pool._persist()

    # Simulate another process having successfully refreshed
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {
                "nous": {
                    "portal_base_url": "https://portal.example.com",
                    "inference_base_url": "https://inference.example.com/v1",
                    "client_id": "hermes-cli",
                    "token_type": "Bearer",
                    "scope": "inference:mint_agent_key",
                    "access_token": "access-FRESH",
                    "refresh_token": "refresh-FRESH",
                    "expires_at": "2026-03-24T12:30:00+00:00",
                    "agent_key": "agent-key-FRESH",
                    "agent_key_expires_at": "2026-03-24T14:00:00+00:00",
                }
            },
        },
    )

    # The entry must be available again, with fresh tokens and a cleared status.
    available = pool._available_entries(clear_expired=True)
    assert len(available) == 1
    assert available[0].refresh_token == "refresh-FRESH"
    assert available[0].last_status is None
|
||||
|
||||
@@ -1094,37 +1094,3 @@ class TestSSLTransientPatterns:
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.timeout
|
||||
assert result.retryable is True
|
||||
|
||||
# ── Test: RateLimitError without status_code (Copilot/GitHub Models) ──────────
|
||||
|
||||
class TestRateLimitErrorWithoutStatusCode:
    """Regression tests for the Copilot/GitHub Models edge case where the
    OpenAI SDK raises RateLimitError but does not populate .status_code."""

    def _make_rate_limit_error(self, status_code=None):
        """Create an exception whose class name is 'RateLimitError' with
        an optionally missing status_code, mirroring the OpenAI SDK shape."""
        # Build the class dynamically so only its *name* matches the SDK's.
        cls = type("RateLimitError", (Exception,), {})
        e = cls("You have exceeded your rate limit.")
        e.status_code = status_code  # None simulates the Copilot case
        return e

    def test_rate_limit_error_without_status_code_classified_as_rate_limit(self):
        """RateLimitError with status_code=None must classify as rate_limit."""
        e = self._make_rate_limit_error(status_code=None)
        result = classify_api_error(e, provider="copilot", model="gpt-4o")
        assert result.reason == FailoverReason.rate_limit

    def test_rate_limit_error_with_status_code_429_classified_as_rate_limit(self):
        """RateLimitError that does set status_code=429 still classifies correctly."""
        e = self._make_rate_limit_error(status_code=429)
        result = classify_api_error(e, provider="copilot", model="gpt-4o")
        assert result.reason == FailoverReason.rate_limit

    def test_other_error_without_status_code_not_forced_to_rate_limit(self):
        """A non-RateLimitError with missing status_code must NOT be forced to 429."""
        cls = type("APIError", (Exception,), {})
        e = cls("something went wrong")
        e.status_code = None
        result = classify_api_error(e, provider="copilot", model="gpt-4o")
        assert result.reason != FailoverReason.rate_limit
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
"""Tests for Gemini free-tier detection and blocking."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.gemini_native_adapter import (
|
||||
gemini_http_error,
|
||||
is_free_tier_quota_error,
|
||||
probe_gemini_tier,
|
||||
)
|
||||
|
||||
|
||||
def _mock_response(status: int, headers: dict | None = None, text: str = "") -> MagicMock:
|
||||
resp = MagicMock()
|
||||
resp.status_code = status
|
||||
resp.headers = headers or {}
|
||||
resp.text = text
|
||||
return resp
|
||||
|
||||
|
||||
def _run_probe(resp: MagicMock) -> str:
|
||||
with patch("agent.gemini_native_adapter.httpx.Client") as MC:
|
||||
inst = MagicMock()
|
||||
inst.post.return_value = resp
|
||||
MC.return_value.__enter__.return_value = inst
|
||||
return probe_gemini_tier("fake-key")
|
||||
|
||||
|
||||
class TestProbeGeminiTier:
|
||||
"""Verify the tier probe classifies keys correctly."""
|
||||
|
||||
def test_free_tier_via_rpd_header_flash(self):
|
||||
# gemini-2.5-flash free tier: 250 RPD
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "250"}, "{}")
|
||||
assert _run_probe(resp) == "free"
|
||||
|
||||
def test_free_tier_via_rpd_header_pro(self):
|
||||
# gemini-2.5-pro free tier: 100 RPD
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "100"}, "{}")
|
||||
assert _run_probe(resp) == "free"
|
||||
|
||||
def test_free_tier_via_rpd_header_flash_lite(self):
|
||||
# flash-lite free tier: 1000 RPD (our upper bound)
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "1000"}, "{}")
|
||||
assert _run_probe(resp) == "free"
|
||||
|
||||
def test_paid_tier_via_rpd_header(self):
|
||||
# Tier 1 starts at 1500+ RPD
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "1500"}, "{}")
|
||||
assert _run_probe(resp) == "paid"
|
||||
|
||||
def test_free_tier_via_429_body(self):
|
||||
body = (
|
||||
'{"error":{"code":429,"message":"Quota exceeded for metric: '
|
||||
'generativelanguage.googleapis.com/generate_content_free_tier_requests, '
|
||||
'limit: 20"}}'
|
||||
)
|
||||
resp = _mock_response(429, {}, body)
|
||||
assert _run_probe(resp) == "free"
|
||||
|
||||
def test_paid_429_has_no_free_tier_marker(self):
|
||||
body = '{"error":{"code":429,"message":"rate limited"}}'
|
||||
resp = _mock_response(429, {}, body)
|
||||
assert _run_probe(resp) == "paid"
|
||||
|
||||
def test_successful_200_without_rpd_header_is_paid(self):
|
||||
resp = _mock_response(200, {}, '{"candidates":[]}')
|
||||
assert _run_probe(resp) == "paid"
|
||||
|
||||
def test_401_returns_unknown(self):
|
||||
resp = _mock_response(401, {}, '{"error":{"code":401}}')
|
||||
assert _run_probe(resp) == "unknown"
|
||||
|
||||
def test_404_returns_unknown(self):
|
||||
resp = _mock_response(404, {}, '{"error":{"code":404}}')
|
||||
assert _run_probe(resp) == "unknown"
|
||||
|
||||
def test_network_error_returns_unknown(self):
|
||||
with patch(
|
||||
"agent.gemini_native_adapter.httpx.Client",
|
||||
side_effect=Exception("dns failure"),
|
||||
):
|
||||
assert probe_gemini_tier("fake-key") == "unknown"
|
||||
|
||||
def test_empty_key_returns_unknown(self):
|
||||
assert probe_gemini_tier("") == "unknown"
|
||||
assert probe_gemini_tier(" ") == "unknown"
|
||||
assert probe_gemini_tier(None) == "unknown" # type: ignore[arg-type]
|
||||
|
||||
def test_malformed_rpd_header_falls_through(self):
|
||||
# Non-integer header value shouldn't crash; 200 with no usable header -> paid.
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "abc"}, "{}")
|
||||
assert _run_probe(resp) == "paid"
|
||||
|
||||
def test_openai_compat_suffix_stripped(self):
|
||||
"""Base URLs ending in /openai get normalized to the native endpoint."""
|
||||
resp = _mock_response(200, {"x-ratelimit-limit-requests-per-day": "1500"}, "{}")
|
||||
with patch("agent.gemini_native_adapter.httpx.Client") as MC:
|
||||
inst = MagicMock()
|
||||
inst.post.return_value = resp
|
||||
MC.return_value.__enter__.return_value = inst
|
||||
probe_gemini_tier(
|
||||
"fake",
|
||||
"https://generativelanguage.googleapis.com/v1beta/openai",
|
||||
)
|
||||
# Verify the post URL does NOT contain /openai
|
||||
called_url = inst.post.call_args[0][0]
|
||||
assert "/openai/" not in called_url
|
||||
assert called_url.endswith(":generateContent")
|
||||
|
||||
|
||||
class TestIsFreeTierQuotaError:
|
||||
def test_detects_free_tier_marker(self):
|
||||
assert is_free_tier_quota_error(
|
||||
"Quota exceeded for metric: generate_content_free_tier_requests"
|
||||
)
|
||||
|
||||
def test_case_insensitive(self):
|
||||
assert is_free_tier_quota_error("QUOTA: FREE_TIER_REQUESTS")
|
||||
|
||||
def test_no_free_tier_marker(self):
|
||||
assert not is_free_tier_quota_error("rate limited")
|
||||
|
||||
def test_empty_string(self):
|
||||
assert not is_free_tier_quota_error("")
|
||||
|
||||
def test_none(self):
|
||||
assert not is_free_tier_quota_error(None) # type: ignore[arg-type]
|
||||
|
||||
|
||||
class TestGeminiHttpErrorFreeTierGuidance:
|
||||
"""gemini_http_error should append free-tier guidance for free-tier 429s."""
|
||||
|
||||
class _FakeResp:
|
||||
def __init__(self, status: int, text: str):
|
||||
self.status_code = status
|
||||
self.headers: dict = {}
|
||||
self.text = text
|
||||
|
||||
def test_free_tier_429_appends_guidance(self):
|
||||
body = (
|
||||
'{"error":{"code":429,"message":"Quota exceeded for metric: '
|
||||
"generativelanguage.googleapis.com/generate_content_free_tier_requests, "
|
||||
'limit: 20","status":"RESOURCE_EXHAUSTED"}}'
|
||||
)
|
||||
err = gemini_http_error(self._FakeResp(429, body))
|
||||
msg = str(err)
|
||||
assert "free tier" in msg.lower()
|
||||
assert "aistudio.google.com/apikey" in msg
|
||||
|
||||
def test_paid_429_has_no_billing_url(self):
|
||||
body = '{"error":{"code":429,"message":"Rate limited","status":"RESOURCE_EXHAUSTED"}}'
|
||||
err = gemini_http_error(self._FakeResp(429, body))
|
||||
assert "aistudio.google.com/apikey" not in str(err)
|
||||
|
||||
def test_non_429_has_no_billing_url(self):
|
||||
body = '{"error":{"code":400,"message":"bad request","status":"INVALID_ARGUMENT"}}'
|
||||
err = gemini_http_error(self._FakeResp(400, body))
|
||||
assert "aistudio.google.com/apikey" not in str(err)
|
||||
|
||||
def test_401_has_no_billing_url(self):
|
||||
body = '{"error":{"code":401,"message":"API key invalid","status":"UNAUTHENTICATED"}}'
|
||||
err = gemini_http_error(self._FakeResp(401, body))
|
||||
assert "aistudio.google.com/apikey" not in str(err)
|
||||
@@ -234,19 +234,6 @@ def test_native_client_accepts_injected_http_client():
|
||||
assert client._http is injected
|
||||
|
||||
|
||||
def test_native_client_rejects_empty_api_key_with_actionable_message():
|
||||
"""Empty/whitespace api_key must raise at construction, not produce a cryptic
|
||||
Google GFE 'Error 400 (Bad Request)!!1' HTML page on the first request."""
|
||||
from agent.gemini_native_adapter import GeminiNativeClient
|
||||
|
||||
for bad in ("", " ", None):
|
||||
with pytest.raises(RuntimeError) as excinfo:
|
||||
GeminiNativeClient(api_key=bad) # type: ignore[arg-type]
|
||||
msg = str(excinfo.value)
|
||||
assert "GOOGLE_API_KEY" in msg and "GEMINI_API_KEY" in msg
|
||||
assert "aistudio.google.com" in msg
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_native_client_streams_without_requiring_async_iterator_from_sync_client():
|
||||
from agent.gemini_native_adapter import AsyncGeminiNativeClient
|
||||
|
||||
@@ -1,140 +0,0 @@
|
||||
"""Tests for agent.gemini_schema — OpenAI→Gemini tool parameter translation."""
|
||||
|
||||
from agent.gemini_schema import (
|
||||
sanitize_gemini_schema,
|
||||
sanitize_gemini_tool_parameters,
|
||||
)
|
||||
|
||||
|
||||
class TestSanitizeGeminiSchema:
|
||||
def test_strips_unknown_top_level_keys(self):
|
||||
"""$schema / additionalProperties etc. must not reach Gemini."""
|
||||
schema = {
|
||||
"type": "object",
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"additionalProperties": False,
|
||||
"properties": {"foo": {"type": "string"}},
|
||||
}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert "$schema" not in cleaned
|
||||
assert "additionalProperties" not in cleaned
|
||||
assert cleaned["type"] == "object"
|
||||
assert cleaned["properties"] == {"foo": {"type": "string"}}
|
||||
|
||||
def test_preserves_string_enums(self):
|
||||
"""String-valued enums are valid for Gemini and must pass through."""
|
||||
schema = {"type": "string", "enum": ["pending", "done", "cancelled"]}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["type"] == "string"
|
||||
assert cleaned["enum"] == ["pending", "done", "cancelled"]
|
||||
|
||||
def test_drops_integer_enum_to_satisfy_gemini(self):
|
||||
"""Gemini rejects int-typed enums; the sanitizer must drop the enum.
|
||||
|
||||
Regression for the Discord tool's ``auto_archive_duration``:
|
||||
``{type: integer, enum: [60, 1440, 4320, 10080]}`` caused
|
||||
Gemini HTTP 400 INVALID_ARGUMENT
|
||||
"Invalid value ... (TYPE_STRING), 60" on every request that
|
||||
shipped the full tool catalog to generativelanguage.googleapis.com.
|
||||
"""
|
||||
schema = {
|
||||
"type": "integer",
|
||||
"enum": [60, 1440, 4320, 10080],
|
||||
"description": "Minutes (60, 1440, 4320, 10080).",
|
||||
}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["type"] == "integer"
|
||||
assert "enum" not in cleaned
|
||||
# description must survive so the model still sees the allowed values
|
||||
assert cleaned["description"].startswith("Minutes")
|
||||
|
||||
def test_drops_number_enum(self):
|
||||
"""Same rule applies to ``type: number``."""
|
||||
schema = {"type": "number", "enum": [0.5, 1.0, 2.0]}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["type"] == "number"
|
||||
assert "enum" not in cleaned
|
||||
|
||||
def test_drops_boolean_enum(self):
|
||||
"""And to ``type: boolean`` (Gemini rejects non-string entries)."""
|
||||
schema = {"type": "boolean", "enum": [True, False]}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["type"] == "boolean"
|
||||
assert "enum" not in cleaned
|
||||
|
||||
def test_keeps_string_enum_even_when_numeric_values_coexist_as_strings(self):
|
||||
"""Stringified-numeric enums ARE valid for Gemini; don't drop them."""
|
||||
schema = {"type": "string", "enum": ["60", "1440", "4320", "10080"]}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["enum"] == ["60", "1440", "4320", "10080"]
|
||||
|
||||
def test_drops_nested_integer_enum_inside_properties(self):
|
||||
"""The fix must apply recursively — the Discord case is nested."""
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"auto_archive_duration": {
|
||||
"type": "integer",
|
||||
"enum": [60, 1440, 4320, 10080],
|
||||
"description": "Thread archive duration in minutes.",
|
||||
},
|
||||
"status": {
|
||||
"type": "string",
|
||||
"enum": ["active", "archived"],
|
||||
},
|
||||
},
|
||||
}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
props = cleaned["properties"]
|
||||
# Integer enum is dropped...
|
||||
assert props["auto_archive_duration"]["type"] == "integer"
|
||||
assert "enum" not in props["auto_archive_duration"]
|
||||
# ...but the sibling string enum is preserved.
|
||||
assert props["status"]["enum"] == ["active", "archived"]
|
||||
|
||||
def test_drops_integer_enum_inside_array_items(self):
|
||||
"""Array item schemas recurse through ``items``."""
|
||||
schema = {
|
||||
"type": "array",
|
||||
"items": {"type": "integer", "enum": [1, 2, 3]},
|
||||
}
|
||||
cleaned = sanitize_gemini_schema(schema)
|
||||
assert cleaned["items"]["type"] == "integer"
|
||||
assert "enum" not in cleaned["items"]
|
||||
|
||||
def test_non_dict_input_returns_empty(self):
|
||||
assert sanitize_gemini_schema(None) == {}
|
||||
assert sanitize_gemini_schema("not a schema") == {}
|
||||
assert sanitize_gemini_schema([1, 2, 3]) == {}
|
||||
|
||||
|
||||
class TestSanitizeGeminiToolParameters:
|
||||
def test_empty_parameters_return_valid_object_schema(self):
|
||||
"""Gemini requires ``parameters`` to be a valid object schema."""
|
||||
cleaned = sanitize_gemini_tool_parameters({})
|
||||
assert cleaned == {"type": "object", "properties": {}}
|
||||
|
||||
def test_discord_create_thread_parameters_no_longer_trip_gemini(self):
|
||||
"""End-to-end regression: the exact shape that was rejected in prod."""
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"action": {"type": "string", "enum": ["create_thread"]},
|
||||
"auto_archive_duration": {
|
||||
"type": "integer",
|
||||
"enum": [60, 1440, 4320, 10080],
|
||||
"description": "Thread archive duration in minutes "
|
||||
"(create_thread, default 1440).",
|
||||
},
|
||||
},
|
||||
"required": ["action"],
|
||||
}
|
||||
cleaned = sanitize_gemini_tool_parameters(params)
|
||||
aad = cleaned["properties"]["auto_archive_duration"]
|
||||
# The field that triggered the Gemini 400 is gone.
|
||||
assert "enum" not in aad
|
||||
# Type + description survive so the model still knows what to send.
|
||||
assert aad["type"] == "integer"
|
||||
assert "1440" in aad["description"]
|
||||
# And the string-enum sibling is untouched.
|
||||
assert cleaned["properties"]["action"]["enum"] == ["create_thread"]
|
||||
@@ -341,7 +341,6 @@ class TestMinimaxSwitchModelCredentialGuard:
|
||||
agent._client_kwargs = {}
|
||||
agent.client = None
|
||||
agent._anthropic_client = MagicMock()
|
||||
agent._fallback_chain = []
|
||||
|
||||
with patch("agent.anthropic_adapter.build_anthropic_client") as mock_build, \
|
||||
patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-leaked") as mock_resolve, \
|
||||
|
||||
@@ -319,98 +319,6 @@ class TestCodexOAuthContextLength:
|
||||
"leaked outside openai-codex provider"
|
||||
)
|
||||
|
||||
def test_stale_codex_cache_over_400k_is_invalidated(self, tmp_path, monkeypatch):
|
||||
"""Pre-PR #14935 builds cached gpt-5.5 at 1.05M (from models.dev)
|
||||
before the Codex-aware branch existed. Upgrading users keep that
|
||||
stale entry on disk and the cache-first lookup returns it forever.
|
||||
Codex OAuth caps at 272k for every slug, so any cached Codex
|
||||
entry >= 400k must be dropped and re-resolved via the live probe.
|
||||
"""
|
||||
from agent import model_metadata as mm
|
||||
|
||||
# Isolate the cache file to tmp_path
|
||||
cache_file = tmp_path / "context_length_cache.yaml"
|
||||
monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
|
||||
|
||||
base_url = "https://chatgpt.com/backend-api/codex/"
|
||||
stale_key = f"gpt-5.5@{base_url}"
|
||||
other_key = "other-model@https://api.openai.com/v1/"
|
||||
import yaml as _yaml
|
||||
cache_file.write_text(_yaml.dump({"context_lengths": {
|
||||
stale_key: 1_050_000, # stale pre-fix value
|
||||
other_key: 128_000, # unrelated, must survive
|
||||
}}))
|
||||
|
||||
fake_response = MagicMock()
|
||||
fake_response.status_code = 200
|
||||
fake_response.json.return_value = {
|
||||
"models": [{"slug": "gpt-5.5", "context_window": 272_000}]
|
||||
}
|
||||
|
||||
with patch("agent.model_metadata.requests.get", return_value=fake_response), \
|
||||
patch("agent.model_metadata.save_context_length") as mock_save:
|
||||
ctx = mm.get_model_context_length(
|
||||
model="gpt-5.5",
|
||||
base_url=base_url,
|
||||
api_key="fake-token",
|
||||
provider="openai-codex",
|
||||
)
|
||||
|
||||
assert ctx == 272_000, f"Stale entry should have been re-resolved to 272k, got {ctx}"
|
||||
# Live save was called with the fresh value
|
||||
mock_save.assert_called_with("gpt-5.5", base_url, 272_000)
|
||||
# The stale entry was removed from disk; unrelated entries survived
|
||||
remaining = _yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
|
||||
assert stale_key not in remaining, "Stale entry was not invalidated from the cache file"
|
||||
assert remaining.get(other_key) == 128_000, "Unrelated cache entries must not be touched"
|
||||
|
||||
def test_fresh_codex_cache_under_400k_is_respected(self, tmp_path, monkeypatch):
|
||||
"""Codex entries at the correct 272k must NOT be invalidated —
|
||||
only stale pre-fix values (>= 400k) get dropped."""
|
||||
from agent import model_metadata as mm
|
||||
|
||||
cache_file = tmp_path / "context_length_cache.yaml"
|
||||
monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
|
||||
|
||||
base_url = "https://chatgpt.com/backend-api/codex/"
|
||||
import yaml as _yaml
|
||||
cache_file.write_text(_yaml.dump({"context_lengths": {
|
||||
f"gpt-5.5@{base_url}": 272_000,
|
||||
}}))
|
||||
|
||||
# If the invalidation incorrectly fired, this would be called; assert it isn't.
|
||||
with patch("agent.model_metadata.requests.get") as mock_get:
|
||||
ctx = mm.get_model_context_length(
|
||||
model="gpt-5.5",
|
||||
base_url=base_url,
|
||||
api_key="fake-token",
|
||||
provider="openai-codex",
|
||||
)
|
||||
assert ctx == 272_000
|
||||
mock_get.assert_not_called()
|
||||
|
||||
def test_stale_invalidation_scoped_to_codex_provider(self, tmp_path, monkeypatch):
|
||||
"""A cached 1M entry for a non-Codex provider (e.g. Anthropic opus on
|
||||
OpenRouter, legitimately 1M) must NOT be invalidated by this guard."""
|
||||
from agent import model_metadata as mm
|
||||
|
||||
cache_file = tmp_path / "context_length_cache.yaml"
|
||||
monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
|
||||
|
||||
base_url = "https://openrouter.ai/api/v1"
|
||||
import yaml as _yaml
|
||||
cache_file.write_text(_yaml.dump({"context_lengths": {
|
||||
f"anthropic/claude-opus-4.6@{base_url}": 1_000_000,
|
||||
}}))
|
||||
|
||||
ctx = mm.get_model_context_length(
|
||||
model="anthropic/claude-opus-4.6",
|
||||
base_url=base_url,
|
||||
api_key="fake",
|
||||
provider="openrouter",
|
||||
)
|
||||
assert ctx == 1_000_000, "Non-codex 1M cache entries must be respected"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# get_model_context_length — resolution order
|
||||
|
||||
@@ -1,90 +0,0 @@
|
||||
"""Tests for _resolve_requests_verify() env var precedence.
|
||||
|
||||
Verifies that custom provider `/models` fetches honour the three supported
|
||||
CA bundle env vars (HERMES_CA_BUNDLE, REQUESTS_CA_BUNDLE, SSL_CERT_FILE)
|
||||
in the documented priority order, and that non-existent paths are
|
||||
skipped gracefully rather than breaking the request.
|
||||
|
||||
No filesystem or network I/O required — we use tmp_path to create real
|
||||
CA bundle stand-in files and monkeypatch env vars.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.model_metadata import _resolve_requests_verify
|
||||
|
||||
|
||||
_CA_ENV_VARS = ("HERMES_CA_BUNDLE", "REQUESTS_CA_BUNDLE", "SSL_CERT_FILE")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def clean_env(monkeypatch):
|
||||
"""Clear all three SSL env vars so each test starts from a known state."""
|
||||
for var in _CA_ENV_VARS:
|
||||
monkeypatch.delenv(var, raising=False)
|
||||
return monkeypatch
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bundle_file(tmp_path: Path) -> str:
|
||||
"""Create a placeholder CA bundle file and return its absolute path."""
|
||||
path = tmp_path / "ca.pem"
|
||||
path.write_text("-----BEGIN CERTIFICATE-----\nstub\n-----END CERTIFICATE-----\n")
|
||||
return str(path)
|
||||
|
||||
|
||||
class TestResolveRequestsVerify:
|
||||
def test_no_env_returns_true(self, clean_env):
|
||||
assert _resolve_requests_verify() is True
|
||||
|
||||
def test_hermes_ca_bundle_returns_path(self, clean_env, bundle_file):
|
||||
clean_env.setenv("HERMES_CA_BUNDLE", bundle_file)
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_requests_ca_bundle_returns_path(self, clean_env, bundle_file):
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", bundle_file)
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_ssl_cert_file_returns_path(self, clean_env, bundle_file):
|
||||
clean_env.setenv("SSL_CERT_FILE", bundle_file)
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_priority_hermes_over_requests(self, clean_env, tmp_path, bundle_file):
|
||||
other = tmp_path / "other.pem"
|
||||
other.write_text("stub")
|
||||
clean_env.setenv("HERMES_CA_BUNDLE", bundle_file)
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", str(other))
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_priority_requests_over_ssl_cert_file(self, clean_env, tmp_path, bundle_file):
|
||||
other = tmp_path / "other.pem"
|
||||
other.write_text("stub")
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", bundle_file)
|
||||
clean_env.setenv("SSL_CERT_FILE", str(other))
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_nonexistent_path_falls_through(self, clean_env, tmp_path, bundle_file):
|
||||
missing = tmp_path / "does_not_exist.pem"
|
||||
clean_env.setenv("HERMES_CA_BUNDLE", str(missing))
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", bundle_file)
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
|
||||
def test_all_nonexistent_returns_true(self, clean_env, tmp_path):
|
||||
missing1 = tmp_path / "a.pem"
|
||||
missing2 = tmp_path / "b.pem"
|
||||
missing3 = tmp_path / "c.pem"
|
||||
clean_env.setenv("HERMES_CA_BUNDLE", str(missing1))
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", str(missing2))
|
||||
clean_env.setenv("SSL_CERT_FILE", str(missing3))
|
||||
assert _resolve_requests_verify() is True
|
||||
|
||||
def test_empty_string_env_var_ignored(self, clean_env, bundle_file):
|
||||
clean_env.setenv("HERMES_CA_BUNDLE", "")
|
||||
clean_env.setenv("REQUESTS_CA_BUNDLE", bundle_file)
|
||||
assert _resolve_requests_verify() == bundle_file
|
||||
@@ -1,11 +1,13 @@
|
||||
"""Tests for agent/skill_commands.py — skill slash command scanning and platform filtering."""
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import tools.skills_tool as skills_tool_module
|
||||
from agent.skill_commands import (
|
||||
build_plan_path,
|
||||
build_preloaded_skills_prompt,
|
||||
build_skill_invocation_message,
|
||||
resolve_skill_command_key,
|
||||
@@ -397,6 +399,40 @@ Generate some audio.
|
||||
assert 'file_path="<path>"' in msg
|
||||
|
||||
|
||||
class TestPlanSkillHelpers:
|
||||
def test_build_plan_path_uses_workspace_relative_dir_and_slugifies_request(self):
|
||||
path = build_plan_path(
|
||||
"Implement OAuth login + refresh tokens!",
|
||||
now=datetime(2026, 3, 15, 9, 30, 45),
|
||||
)
|
||||
|
||||
assert path == Path(".hermes") / "plans" / "2026-03-15_093045-implement-oauth-login-refresh-tokens.md"
|
||||
|
||||
def test_plan_skill_message_can_include_runtime_save_path_note(self, tmp_path):
|
||||
with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
|
||||
_make_skill(
|
||||
tmp_path,
|
||||
"plan",
|
||||
body="Save plans under .hermes/plans in the active workspace and do not execute the work.",
|
||||
)
|
||||
scan_skill_commands()
|
||||
msg = build_skill_invocation_message(
|
||||
"/plan",
|
||||
"Add a /plan command",
|
||||
runtime_note=(
|
||||
"Save the markdown plan with write_file to this exact relative path inside "
|
||||
"the active workspace/backend cwd: .hermes/plans/plan.md"
|
||||
),
|
||||
)
|
||||
|
||||
assert msg is not None
|
||||
assert "Save plans under $HERMES_HOME/plans" not in msg
|
||||
assert ".hermes/plans" in msg
|
||||
assert "Add a /plan command" in msg
|
||||
assert ".hermes/plans/plan.md" in msg
|
||||
assert "Runtime note:" in msg
|
||||
|
||||
|
||||
class TestSkillDirectoryHeader:
|
||||
"""The activation message must expose the absolute skill directory and
|
||||
explain how to resolve relative paths, so skills with bundled scripts
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
"""Tests for the /plan CLI slash command."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from agent.skill_commands import scan_skill_commands
|
||||
from cli import HermesCLI
|
||||
|
||||
|
||||
def _make_cli():
|
||||
cli_obj = HermesCLI.__new__(HermesCLI)
|
||||
cli_obj.config = {}
|
||||
cli_obj.console = MagicMock()
|
||||
cli_obj.agent = None
|
||||
cli_obj.conversation_history = []
|
||||
cli_obj.session_id = "sess-123"
|
||||
cli_obj._pending_input = MagicMock()
|
||||
return cli_obj
|
||||
|
||||
|
||||
def _make_plan_skill(skills_dir):
|
||||
skill_dir = skills_dir / "plan"
|
||||
skill_dir.mkdir(parents=True, exist_ok=True)
|
||||
(skill_dir / "SKILL.md").write_text(
|
||||
"""---
|
||||
name: plan
|
||||
description: Plan mode skill.
|
||||
---
|
||||
|
||||
# Plan
|
||||
|
||||
Use the current conversation context when no explicit instruction is provided.
|
||||
Save plans under the active workspace's .hermes/plans directory.
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
class TestCLIPlanCommand:
|
||||
def test_plan_command_queues_plan_skill_message(self, tmp_path, monkeypatch):
|
||||
cli_obj = _make_cli()
|
||||
|
||||
with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
|
||||
_make_plan_skill(tmp_path)
|
||||
scan_skill_commands()
|
||||
result = cli_obj.process_command("/plan Add OAuth login")
|
||||
|
||||
assert result is True
|
||||
cli_obj._pending_input.put.assert_called_once()
|
||||
queued = cli_obj._pending_input.put.call_args[0][0]
|
||||
assert "Plan mode skill" in queued
|
||||
assert "Add OAuth login" in queued
|
||||
assert ".hermes/plans" in queued
|
||||
assert str(tmp_path / "plans") not in queued
|
||||
assert "active workspace/backend cwd" in queued
|
||||
assert "Runtime note:" in queued
|
||||
|
||||
def test_plan_without_args_uses_skill_context_guidance(self, tmp_path, monkeypatch):
|
||||
cli_obj = _make_cli()
|
||||
|
||||
with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
|
||||
_make_plan_skill(tmp_path)
|
||||
scan_skill_commands()
|
||||
cli_obj.process_command("/plan")
|
||||
|
||||
queued = cli_obj._pending_input.put.call_args[0][0]
|
||||
assert "current conversation context" in queued
|
||||
assert ".hermes/plans/" in queued
|
||||
assert "conversation-plan.md" in queued
|
||||
@@ -23,11 +23,6 @@ class TestCLIQuickCommands:
|
||||
cli.console = MagicMock()
|
||||
cli.agent = None
|
||||
cli.conversation_history = []
|
||||
# session_id is accessed by the fallback skill/fuzzy-match path in
|
||||
# process_command; without it, tests that exercise `/alias args`
|
||||
# can trip an AttributeError when cross-test state leaks a skill
|
||||
# command matching the alias target.
|
||||
cli.session_id = "test-session"
|
||||
return cli
|
||||
|
||||
def test_exec_command_runs_and_prints_output(self):
|
||||
|
||||
@@ -1,380 +0,0 @@
|
||||
"""Tests for per-job workdir support in cron jobs.
|
||||
|
||||
Covers:
|
||||
- jobs.create_job: param plumbing, validation, default-None preserved
|
||||
- jobs._normalize_workdir: absolute / relative / missing / file-not-dir
|
||||
- jobs.update_job: set, clear, re-validate
|
||||
- tools.cronjob_tools.cronjob: create + update JSON round-trip, schema
|
||||
includes workdir, _format_job exposes it when set
|
||||
- scheduler.tick(): partitions workdir jobs off the thread pool, restores
|
||||
TERMINAL_CWD in finally, honours the env override during run_job
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tmp_cron_dir(tmp_path, monkeypatch):
|
||||
"""Isolate cron job storage into a temp dir so tests don't stomp on real jobs."""
|
||||
monkeypatch.setattr("cron.jobs.CRON_DIR", tmp_path / "cron")
|
||||
monkeypatch.setattr("cron.jobs.JOBS_FILE", tmp_path / "cron" / "jobs.json")
|
||||
monkeypatch.setattr("cron.jobs.OUTPUT_DIR", tmp_path / "cron" / "output")
|
||||
return tmp_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# jobs._normalize_workdir
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNormalizeWorkdir:
|
||||
def test_none_returns_none(self):
|
||||
from cron.jobs import _normalize_workdir
|
||||
assert _normalize_workdir(None) is None
|
||||
|
||||
def test_empty_string_returns_none(self):
|
||||
from cron.jobs import _normalize_workdir
|
||||
assert _normalize_workdir("") is None
|
||||
assert _normalize_workdir(" ") is None
|
||||
|
||||
def test_absolute_existing_dir_returns_resolved_str(self, tmp_path):
|
||||
from cron.jobs import _normalize_workdir
|
||||
result = _normalize_workdir(str(tmp_path))
|
||||
assert result == str(tmp_path.resolve())
|
||||
|
||||
def test_tilde_expands(self, tmp_path, monkeypatch):
|
||||
from cron.jobs import _normalize_workdir
|
||||
monkeypatch.setenv("HOME", str(tmp_path))
|
||||
result = _normalize_workdir("~")
|
||||
assert result == str(tmp_path.resolve())
|
||||
|
||||
def test_relative_path_rejected(self):
|
||||
from cron.jobs import _normalize_workdir
|
||||
with pytest.raises(ValueError, match="absolute path"):
|
||||
_normalize_workdir("some/relative/path")
|
||||
|
||||
def test_missing_dir_rejected(self, tmp_path):
|
||||
from cron.jobs import _normalize_workdir
|
||||
missing = tmp_path / "does-not-exist"
|
||||
with pytest.raises(ValueError, match="does not exist"):
|
||||
_normalize_workdir(str(missing))
|
||||
|
||||
def test_file_not_dir_rejected(self, tmp_path):
|
||||
from cron.jobs import _normalize_workdir
|
||||
f = tmp_path / "file.txt"
|
||||
f.write_text("hi")
|
||||
with pytest.raises(ValueError, match="not a directory"):
|
||||
_normalize_workdir(str(f))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# jobs.create_job and update_job
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCreateJobWorkdir:
|
||||
def test_workdir_stored_when_set(self, tmp_cron_dir):
|
||||
from cron.jobs import create_job, get_job
|
||||
job = create_job(
|
||||
prompt="hello",
|
||||
schedule="every 1h",
|
||||
workdir=str(tmp_cron_dir),
|
||||
)
|
||||
stored = get_job(job["id"])
|
||||
assert stored["workdir"] == str(tmp_cron_dir.resolve())
|
||||
|
||||
def test_workdir_none_preserves_old_behaviour(self, tmp_cron_dir):
|
||||
from cron.jobs import create_job, get_job
|
||||
job = create_job(prompt="hello", schedule="every 1h")
|
||||
stored = get_job(job["id"])
|
||||
# Field is present on the dict but None — downstream code checks
|
||||
# truthiness to decide whether the feature is active.
|
||||
assert stored.get("workdir") is None
|
||||
|
||||
def test_create_rejects_invalid_workdir(self, tmp_cron_dir):
|
||||
from cron.jobs import create_job
|
||||
with pytest.raises(ValueError):
|
||||
create_job(
|
||||
prompt="hello",
|
||||
schedule="every 1h",
|
||||
workdir="not/absolute",
|
||||
)
|
||||
|
||||
|
||||
class TestUpdateJobWorkdir:
    """Checks setting, clearing and validating ``workdir`` via ``update_job``."""

    def test_set_workdir_via_update(self, tmp_cron_dir):
        """Updating a job with a workdir stores the resolved path."""
        from cron.jobs import create_job, get_job, update_job

        job_id = create_job(prompt="x", schedule="every 1h")["id"]
        update_job(job_id, {"workdir": str(tmp_cron_dir)})

        stored_workdir = get_job(job_id)["workdir"]
        assert stored_workdir == str(tmp_cron_dir.resolve())

    def test_clear_workdir_with_none(self, tmp_cron_dir):
        """Updating workdir to None clears a previously stored value."""
        from cron.jobs import create_job, get_job, update_job

        created = create_job(
            prompt="x",
            schedule="every 1h",
            workdir=str(tmp_cron_dir),
        )
        job_id = created["id"]
        update_job(job_id, {"workdir": None})
        assert get_job(job_id)["workdir"] is None

    def test_clear_workdir_with_empty_string(self, tmp_cron_dir):
        """Updating workdir to an empty string also clears it to None."""
        from cron.jobs import create_job, get_job, update_job

        created = create_job(
            prompt="x",
            schedule="every 1h",
            workdir=str(tmp_cron_dir),
        )
        job_id = created["id"]
        update_job(job_id, {"workdir": ""})
        assert get_job(job_id)["workdir"] is None

    def test_update_rejects_invalid_workdir(self, tmp_cron_dir):
        """A relative workdir makes ``update_job`` raise ValueError."""
        from cron.jobs import create_job, update_job

        job_id = create_job(prompt="x", schedule="every 1h")["id"]
        bad_update = {"workdir": "nope/relative"}
        with pytest.raises(ValueError):
            update_job(job_id, bad_update)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# tools.cronjob_tools: end-to-end JSON round-trip
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCronjobToolWorkdir:
    """Drives the ``cronjob`` tool entry point and inspects its JSON replies."""

    def test_create_with_workdir_json_roundtrip(self, tmp_cron_dir):
        """Creating with a workdir reports success and echoes the resolved path."""
        from tools.cronjob_tools import cronjob

        raw_reply = cronjob(
            action="create",
            prompt="hi",
            schedule="every 1h",
            workdir=str(tmp_cron_dir),
        )
        reply = json.loads(raw_reply)

        assert reply["success"] is True
        assert reply["job"]["workdir"] == str(tmp_cron_dir.resolve())

    def test_create_without_workdir_hides_field_in_format(self, tmp_cron_dir):
        """Creating without a workdir yields a job payload with no such key."""
        from tools.cronjob_tools import cronjob

        raw_reply = cronjob(action="create", prompt="hi", schedule="every 1h")
        reply = json.loads(raw_reply)

        assert reply["success"] is True
        # _format_job omits the field when unset — reduces noise in agent output.
        assert "workdir" not in reply["job"]

    def test_update_clears_workdir_with_empty_string(self, tmp_cron_dir):
        """An update with ``workdir=""`` removes the field from the job payload."""
        from tools.cronjob_tools import cronjob

        create_reply = json.loads(
            cronjob(
                action="create",
                prompt="hi",
                schedule="every 1h",
                workdir=str(tmp_cron_dir),
            )
        )
        job_id = create_reply["job_id"]

        raw_update = cronjob(action="update", job_id=job_id, workdir="")
        update_reply = json.loads(raw_update)

        assert update_reply["success"] is True
        assert "workdir" not in update_reply["job"]

    def test_schema_advertises_workdir(self):
        """The tool schema lists ``workdir`` and its description says 'absolute'."""
        from tools.cronjob_tools import CRONJOB_SCHEMA

        properties = CRONJOB_SCHEMA["parameters"]["properties"]
        assert "workdir" in properties

        description = properties["workdir"]["description"]
        assert "absolute" in description.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# scheduler.tick(): workdir partition
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTickWorkdirPartition:
    """
    tick() must run workdir jobs sequentially (outside the ThreadPoolExecutor)
    because run_job mutates os.environ["TERMINAL_CWD"], which is process-global.
    We verify the partition without booting the real scheduler by patching the
    pieces tick() calls.
    """

    def test_workdir_jobs_run_sequentially(self, tmp_path, monkeypatch):
        """A workdir job runs first and on the thread that called tick().

        ``get_due_jobs``, ``advance_next_run``, ``run_job``,
        ``save_job_output``, ``mark_job_run`` and ``_deliver_result`` are all
        replaced on the scheduler module, so only tick()'s own ordering and
        threading decisions are exercised.
        """
        import threading

        import cron.scheduler as sched

        # Two "jobs" — one with workdir, one without. get_due_jobs returns both.
        workdir_job = {"id": "a", "name": "A", "workdir": str(tmp_path)}
        parallel_job = {"id": "b", "name": "B", "workdir": None}

        monkeypatch.setattr(sched, "get_due_jobs", lambda: [workdir_job, parallel_job])
        monkeypatch.setattr(sched, "advance_next_run", lambda *_a, **_kw: None)

        # Record call order / thread context: each entry is
        # (job id, name of the thread that executed the job).
        calls: list[tuple[str, str]] = []

        def fake_run_job(job):
            calls.append((job["id"], threading.current_thread().name))
            # Minimal four-element result standing in for run_job's return value.
            return True, "output", "response", None

        monkeypatch.setattr(sched, "run_job", fake_run_job)
        monkeypatch.setattr(sched, "save_job_output", lambda _jid, _o: None)
        monkeypatch.setattr(sched, "mark_job_run", lambda *_a, **_kw: None)
        monkeypatch.setattr(
            sched, "_deliver_result", lambda *_a, **_kw: None
        )

        n = sched.tick(verbose=False)
        assert n == 2

        ids = [c[0] for c in calls]
        # Workdir jobs always come before parallel jobs.
        assert ids.index("a") < ids.index("b")

        # The workdir job must run on the main thread (sequential pass), i.e.
        # the same thread this test used to call tick().
        main_thread_name = threading.current_thread().name
        workdir_thread_name = next(t for jid, t in calls if jid == "a")
        assert workdir_thread_name == main_thread_name
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# scheduler.run_job: TERMINAL_CWD + skip_context_files wiring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRunJobTerminalCwd:
    """
    Exercises run_job's workdir handling with AIAgent replaced by a stub, so
    no real API call happens. With a workdir: TERMINAL_CWD points at it while
    the agent is built and run, skip_context_files is False, and the previous
    TERMINAL_CWD is back in place afterwards. Without one: TERMINAL_CWD is
    left alone and skip_context_files stays True.
    """

    @staticmethod
    def _install_stubs(monkeypatch, observed: dict):
        """Replace run_job's collaborators so it runs without real credentials.

        The stub agent writes into ``observed`` the ``skip_context_files``
        kwarg it was constructed with, plus the TERMINAL_CWD value seen at
        construction time and inside ``run_conversation`` (``"_UNSET_"`` when
        the variable is absent).
        """
        import os
        import sys
        import cron.scheduler as sched

        def _terminal_cwd_now():
            return os.environ.get("TERMINAL_CWD", "_UNSET_")

        class FakeAgent:
            def __init__(self, **kwargs):
                observed["skip_context_files"] = kwargs.get("skip_context_files")
                observed["terminal_cwd_during_init"] = _terminal_cwd_now()

            def run_conversation(self, *_a, **_kw):
                observed["terminal_cwd_during_run"] = _terminal_cwd_now()
                return {"final_response": "done", "messages": []}

            def get_activity_summary(self):
                return {"seconds_since_activity": 0.0}

        # type(sys) is the module class; build an empty stand-in "run_agent".
        stub_module = type(sys)("run_agent")
        stub_module.AIAgent = FakeAgent
        monkeypatch.setitem(sys.modules, "run_agent", stub_module)

        # Bypass the real provider resolver — it reads ~/.hermes and credentials.
        from hermes_cli import runtime_provider as _rtp

        def _fake_resolve(**_kw):
            return {
                "provider": "test",
                "api_key": "k",
                "base_url": "http://test.local",
                "api_mode": "chat_completions",
            }

        monkeypatch.setattr(_rtp, "resolve_runtime_provider", _fake_resolve)

        # Stub scheduler helpers that would otherwise hit the filesystem / config.
        scheduler_stubs = (
            ("_build_job_prompt", lambda job, prerun_script=None: "hi"),
            ("_resolve_origin", lambda job: None),
            ("_resolve_delivery_target", lambda job: None),
            ("_resolve_cron_enabled_toolsets", lambda job, cfg: None),
        )
        for attr_name, replacement in scheduler_stubs:
            monkeypatch.setattr(sched, attr_name, replacement)

        # Unlimited inactivity so the poll loop returns immediately.
        monkeypatch.setenv("HERMES_CRON_TIMEOUT", "0")

        # run_job calls load_dotenv(~/.hermes/.env, override=True), which will
        # happily clobber TERMINAL_CWD out from under us if the real user .env
        # has TERMINAL_CWD set (common on dev boxes). Stub it out.
        import dotenv

        monkeypatch.setattr(dotenv, "load_dotenv", lambda *_a, **_kw: True)

    def test_workdir_sets_and_restores_terminal_cwd(
        self, tmp_path, monkeypatch
    ):
        """With a workdir, TERMINAL_CWD is switched for the run and then restored."""
        import os
        import cron.scheduler as sched

        # Start from a known non-workdir value. monkeypatch.setenv restores it
        # on teardown regardless of what other tests in this xdist worker have
        # left behind.
        monkeypatch.setenv("TERMINAL_CWD", "/original/cwd")

        observed: dict = {}
        self._install_stubs(monkeypatch, observed)

        job = dict(
            id="abc",
            name="wd-job",
            workdir=str(tmp_path),
            schedule_display="manual",
        )

        success, _output, response, error = sched.run_job(job)
        assert success is True, f"run_job failed: error={error!r} response={response!r}"

        # AIAgent was built with skip_context_files=False (feature ON).
        assert observed["skip_context_files"] is False

        # TERMINAL_CWD was pointing at the job workdir while the agent ran.
        for stage in ("terminal_cwd_during_init", "terminal_cwd_during_run"):
            assert observed[stage] == str(tmp_path.resolve())

        # And it was restored to the original value afterwards.
        assert os.environ["TERMINAL_CWD"] == "/original/cwd"

    def test_no_workdir_leaves_terminal_cwd_untouched(self, monkeypatch):
        """Without a workdir, run_job must leave TERMINAL_CWD exactly as it was.

        The variable is pinned to a sentinel so the check does not depend on
        whatever other tests in the same xdist worker left behind (for
        example '.'); only "unchanged by run_job" is asserted.
        """
        import os
        import cron.scheduler as sched

        # Pin TERMINAL_CWD via monkeypatch so both the before-value and the
        # after-value are under our control regardless of cross-test state.
        monkeypatch.setenv("TERMINAL_CWD", "/cron-test-sentinel")
        before = os.environ["TERMINAL_CWD"]

        observed: dict = {}
        self._install_stubs(monkeypatch, observed)

        job = dict(
            id="xyz",
            name="no-wd-job",
            workdir=None,
            schedule_display="manual",
        )

        outcome = sched.run_job(job)
        success = outcome[0]
        assert success is True

        # Feature is OFF — skip_context_files stays True.
        assert observed["skip_context_files"] is True
        # The agent saw the same TERMINAL_CWD at init as was set beforehand.
        assert observed["terminal_cwd_during_init"] == before
        # And after run_job completes it is still the sentinel (nothing
        # overwrote or cleared it).
        assert os.environ["TERMINAL_CWD"] == before
|
||||
@@ -73,6 +73,14 @@ class TestSlashCommands:
|
||||
send_status = await send_and_capture(adapter, "/status", platform)
|
||||
send_status.assert_called_once()
|
||||
|
||||
@pytest.mark.asyncio
async def test_provider_shows_current_provider(self, adapter, platform):
    """Sending /provider produces exactly one reply that mentions "provider"."""
    send = await send_and_capture(adapter, "/provider", platform)
    send.assert_called_once()

    # call_args unpacks as (positional args, keyword args); the reply text is
    # taken from the ``content`` kwarg, else from the second positional arg.
    positional, keyword = send.call_args
    response_text = keyword.get("content")
    if not response_text:
        response_text = positional[1]

    assert "provider" in response_text.lower()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verbose_responds(self, adapter, platform):
|
||||
send = await send_and_capture(adapter, "/verbose", platform)
|
||||
|
||||
@@ -88,63 +88,11 @@ def _ensure_discord_mock() -> None:
|
||||
discord_mod.Thread = type("Thread", (), {})
|
||||
discord_mod.ForumChannel = type("ForumChannel", (), {})
|
||||
discord_mod.Interaction = object
|
||||
discord_mod.Message = type("Message", (), {})
|
||||
|
||||
# Embed: accept the kwargs production code / tests use
|
||||
# (title, description, color). MagicMock auto-attributes work too,
|
||||
# but some tests construct and inspect .title/.description directly.
|
||||
class _FakeEmbed:
|
||||
def __init__(self, *, title=None, description=None, color=None, **_):
|
||||
self.title = title
|
||||
self.description = description
|
||||
self.color = color
|
||||
discord_mod.Embed = _FakeEmbed
|
||||
|
||||
# ui.View / ui.Select / ui.Button: real classes (not MagicMock) so
|
||||
# tests that subclass ModelPickerView / iterate .children / clear
|
||||
# items work.
|
||||
class _FakeView:
|
||||
def __init__(self, timeout=None):
|
||||
self.timeout = timeout
|
||||
self.children = []
|
||||
def add_item(self, item):
|
||||
self.children.append(item)
|
||||
def clear_items(self):
|
||||
self.children.clear()
|
||||
|
||||
class _FakeSelect:
|
||||
def __init__(self, *, placeholder=None, options=None, custom_id=None, **_):
|
||||
self.placeholder = placeholder
|
||||
self.options = options or []
|
||||
self.custom_id = custom_id
|
||||
self.callback = None
|
||||
self.disabled = False
|
||||
|
||||
class _FakeButton:
|
||||
def __init__(self, *, label=None, style=None, custom_id=None, emoji=None,
|
||||
url=None, disabled=False, row=None, sku_id=None, **_):
|
||||
self.label = label
|
||||
self.style = style
|
||||
self.custom_id = custom_id
|
||||
self.emoji = emoji
|
||||
self.url = url
|
||||
self.disabled = disabled
|
||||
self.row = row
|
||||
self.sku_id = sku_id
|
||||
self.callback = None
|
||||
|
||||
class _FakeSelectOption:
|
||||
def __init__(self, *, label=None, value=None, description=None, **_):
|
||||
self.label = label
|
||||
self.value = value
|
||||
self.description = description
|
||||
discord_mod.SelectOption = _FakeSelectOption
|
||||
|
||||
discord_mod.Embed = MagicMock
|
||||
discord_mod.ui = SimpleNamespace(
|
||||
View=_FakeView,
|
||||
Select=_FakeSelect,
|
||||
Button=_FakeButton,
|
||||
View=object,
|
||||
button=lambda *a, **k: (lambda fn: fn),
|
||||
Button=object,
|
||||
)
|
||||
discord_mod.ButtonStyle = SimpleNamespace(
|
||||
success=1, primary=2, secondary=2, danger=3,
|
||||
@@ -152,7 +100,7 @@ def _ensure_discord_mock() -> None:
|
||||
)
|
||||
discord_mod.Color = SimpleNamespace(
|
||||
orange=lambda: 1, green=lambda: 2, blue=lambda: 3,
|
||||
red=lambda: 4, purple=lambda: 5, greyple=lambda: 6,
|
||||
red=lambda: 4, purple=lambda: 5,
|
||||
)
|
||||
|
||||
# app_commands — needed by _register_slash_commands auto-registration
|
||||
|
||||
@@ -950,7 +950,7 @@ class TestAgentCacheIdleResume:
|
||||
release_clients() (soft — session may resume).
|
||||
"""
|
||||
from run_agent import AIAgent
|
||||
import run_agent as _ra
|
||||
from tools import terminal_tool as _tt
|
||||
|
||||
# Agent A: evicted from cache (soft) — terminal survives.
|
||||
# Agent B: session expired (hard) — terminal torn down.
|
||||
@@ -970,16 +970,13 @@ class TestAgentCacheIdleResume:
|
||||
)
|
||||
|
||||
vm_calls: list = []
|
||||
# AIAgent.close() calls the ``cleanup_vm`` name bound into
|
||||
# ``run_agent`` at import time, not ``tools.terminal_tool.cleanup_vm``
|
||||
# directly — so patch the ``run_agent`` reference.
|
||||
original_vm = _ra.cleanup_vm
|
||||
_ra.cleanup_vm = lambda tid: vm_calls.append(tid)
|
||||
original_vm = _tt.cleanup_vm
|
||||
_tt.cleanup_vm = lambda tid: vm_calls.append(tid)
|
||||
try:
|
||||
agent_a.release_clients() # cache eviction
|
||||
agent_b.close() # session expiry
|
||||
finally:
|
||||
_ra.cleanup_vm = original_vm
|
||||
_tt.cleanup_vm = original_vm
|
||||
try:
|
||||
agent_a.close()
|
||||
except Exception:
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
"""Test that AuthError triggers fallback provider resolution (#7230)."""
|
||||
|
||||
import os
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class TestResolveRuntimeAgentKwargsAuthFallback:
    """Covers ``_resolve_runtime_agent_kwargs`` when the primary provider raises AuthError."""

    def test_auth_error_tries_fallback(self, tmp_path, monkeypatch):
        """With a fallback configured, the fallback provider's settings are returned."""
        from hermes_cli.auth import AuthError

        # Config naming both a primary provider and a fallback model.
        config_text = (
            "model:\n  provider: openai-codex\n"
            "fallback_model:\n  provider: openrouter\n"
            "  model: meta-llama/llama-4-maverick\n"
        )
        (tmp_path / "config.yaml").write_text(config_text)

        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)

        resolver_calls = []

        def _mock_resolve(**kwargs):
            resolver_calls.append(kwargs)
            requested = kwargs.get("requested", "")
            wants_codex = bool(requested) and "codex" in str(requested).lower()
            if wants_codex:
                raise AuthError("Codex token refresh failed with status 401")
            return {
                "api_key": "fallback-key",
                "base_url": "https://openrouter.ai/api/v1",
                "provider": "openrouter",
                "api_mode": "openai_chat",
                "command": None,
                "args": None,
                "credential_pool": None,
            }

        monkeypatch.setenv("HERMES_INFERENCE_PROVIDER", "openai-codex")

        resolver_patch = patch(
            "hermes_cli.runtime_provider.resolve_runtime_provider",
            side_effect=_mock_resolve,
        )
        with resolver_patch:
            from gateway.run import _resolve_runtime_agent_kwargs

            result = _resolve_runtime_agent_kwargs()

        assert result["provider"] == "openrouter"
        assert result["api_key"] == "fallback-key"
        # Should have been called at least twice (primary + fallback)
        assert len(resolver_calls) >= 2

    def test_auth_error_no_fallback_raises(self, tmp_path, monkeypatch):
        """Without a fallback in config, the AuthError surfaces as RuntimeError."""
        from hermes_cli.auth import AuthError

        (tmp_path / "config.yaml").write_text("model:\n  provider: openai-codex\n")

        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
        monkeypatch.setenv("HERMES_INFERENCE_PROVIDER", "openai-codex")

        always_fails = patch(
            "hermes_cli.runtime_provider.resolve_runtime_provider",
            side_effect=AuthError("token expired"),
        )
        with always_fails:
            from gateway.run import _resolve_runtime_agent_kwargs

            with pytest.raises(RuntimeError):
                _resolve_runtime_agent_kwargs()
|
||||
@@ -272,7 +272,7 @@ class TestCommandBypassActiveSession:
|
||||
# Tests: non-bypass-set commands (no dedicated Level-2 handler) also bypass
|
||||
# instead of interrupting + being discarded. Regression for the Discord
|
||||
# ghost-slash-command bug where /model, /reasoning, /voice, /insights, /title,
|
||||
# /resume, /retry, /undo, /compress, /usage, /reload-mcp,
|
||||
# /resume, /retry, /undo, /compress, /usage, /provider, /reload-mcp,
|
||||
# /sethome, /reset silently interrupted the running agent.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -298,6 +298,7 @@ class TestAllResolvableCommandsBypassGuard:
|
||||
("/undo", "undo"),
|
||||
("/compress", "compress"),
|
||||
("/usage", "usage"),
|
||||
("/provider", "provider"),
|
||||
("/reload-mcp", "reload-mcp"),
|
||||
("/sethome", "sethome"),
|
||||
],
|
||||
@@ -325,7 +326,7 @@ class TestAllResolvableCommandsBypassGuard:
|
||||
|
||||
for cmd in (
|
||||
"model", "reasoning", "personality", "voice", "insights", "title",
|
||||
"resume", "retry", "undo", "compress", "usage",
|
||||
"resume", "retry", "undo", "compress", "usage", "provider",
|
||||
"reload-mcp", "sethome", "reset",
|
||||
):
|
||||
assert should_bypass_active_session(cmd) is True, (
|
||||
|
||||
@@ -64,7 +64,9 @@ async def test_compress_command_reports_noop_without_success_banner():
|
||||
agent_instance = MagicMock()
|
||||
agent_instance.shutdown_memory_provider = MagicMock()
|
||||
agent_instance.close = MagicMock()
|
||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||
agent_instance.context_compressor.protect_first_n = 0
|
||||
agent_instance.context_compressor._align_boundary_forward.return_value = 0
|
||||
agent_instance.context_compressor._find_tail_cut_by_tokens.return_value = 2
|
||||
agent_instance.session_id = "sess-1"
|
||||
agent_instance._compress_context.return_value = (list(history), "")
|
||||
|
||||
@@ -99,7 +101,9 @@ async def test_compress_command_explains_when_token_estimate_rises():
|
||||
agent_instance = MagicMock()
|
||||
agent_instance.shutdown_memory_provider = MagicMock()
|
||||
agent_instance.close = MagicMock()
|
||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||
agent_instance.context_compressor.protect_first_n = 0
|
||||
agent_instance.context_compressor._align_boundary_forward.return_value = 0
|
||||
agent_instance.context_compressor._find_tail_cut_by_tokens.return_value = 2
|
||||
agent_instance.session_id = "sess-1"
|
||||
agent_instance._compress_context.return_value = (compressed, "")
|
||||
|
||||
|
||||
@@ -64,7 +64,9 @@ async def test_compress_focus_topic_passed_to_agent():
|
||||
compressed = [history[0], history[-1]]
|
||||
runner = _make_runner(history)
|
||||
agent_instance = MagicMock()
|
||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||
agent_instance.context_compressor.protect_first_n = 0
|
||||
agent_instance.context_compressor._align_boundary_forward.return_value = 0
|
||||
agent_instance.context_compressor._find_tail_cut_by_tokens.return_value = 2
|
||||
agent_instance.session_id = "sess-1"
|
||||
agent_instance._compress_context.return_value = (compressed, "")
|
||||
|
||||
@@ -94,7 +96,9 @@ async def test_compress_no_focus_passes_none():
|
||||
history = _make_history()
|
||||
runner = _make_runner(history)
|
||||
agent_instance = MagicMock()
|
||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||
agent_instance.context_compressor.protect_first_n = 0
|
||||
agent_instance.context_compressor._align_boundary_forward.return_value = 0
|
||||
agent_instance.context_compressor._find_tail_cut_by_tokens.return_value = 2
|
||||
agent_instance.session_id = "sess-1"
|
||||
agent_instance._compress_context.return_value = (list(history), "")
|
||||
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
"""Regression test: /compress works with context engine plugins.
|
||||
|
||||
Reported by @selfhostedsoul (Discord, Apr 2026) with the LCM plugin installed:
|
||||
|
||||
Compression failed: 'LCMEngine' object has no attribute '_align_boundary_forward'
|
||||
|
||||
Root cause: the gateway /compress handler used to reach into
|
||||
ContextCompressor-specific private helpers (_align_boundary_forward,
|
||||
_find_tail_cut_by_tokens) for its preflight check. Those helpers are not
|
||||
part of the generic ContextEngine ABC, so any plugin engine (LCM, etc.)
|
||||
raised AttributeError.
|
||||
|
||||
The fix promotes the preflight into an optional ABC method
|
||||
(has_content_to_compress) with a safe default of True.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.context_engine import ContextEngine
|
||||
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
||||
from gateway.platforms.base import MessageEvent
|
||||
from gateway.session import SessionEntry, SessionSource, build_session_key
|
||||
|
||||
|
||||
class _FakePluginEngine(ContextEngine):
    """Bare-bones ContextEngine subclass with no ContextCompressor-style helpers.

    It defines only ``name``, ``update_from_response``, ``should_compress``
    and ``compress``, imitating a third-party context engine plugin such as
    LCM. Any caller reaching for ContextCompressor-specific private helpers on
    this object gets AttributeError, which is the failure being guarded.
    """

    @property
    def name(self) -> str:
        """Identifier reported by this fake engine."""
        return "fake-plugin"

    def update_from_response(self, usage: Dict[str, Any]) -> None:
        """Accept usage data and do nothing with it."""
        return None

    def should_compress(self, prompt_tokens: int = None) -> bool:
        """Never request compression."""
        return False

    def compress(
        self,
        messages: List[Dict[str, Any]],
        current_tokens: int = None,
        focus_topic: str = None,
    ) -> List[Dict[str, Any]]:
        """Bump ``compression_count`` and keep only the first and last message.

        Inputs shorter than three messages are returned as a shallow copy.
        """
        self.compression_count += 1
        if len(messages) < 3:
            return list(messages)
        # Pretend the middle turns were dropped.
        first, last = messages[0], messages[-1]
        return [first, last]
|
||||
|
||||
|
||||
def _make_source() -> SessionSource:
    """Build the fixed Telegram DM ``SessionSource`` used by these tests."""
    fields = {
        "platform": Platform.TELEGRAM,
        "user_id": "u1",
        "chat_id": "c1",
        "user_name": "tester",
        "chat_type": "dm",
    }
    return SessionSource(**fields)
|
||||
|
||||
|
||||
def _make_event(text: str = "/compress") -> MessageEvent:
    """Wrap ``text`` in a ``MessageEvent`` from the standard test source, id ``"m1"``."""
    source = _make_source()
    event = MessageEvent(text=text, source=source, message_id="m1")
    return event
|
||||
|
||||
|
||||
def _make_history() -> list[dict[str, str]]:
    """Return a fresh four-message transcript alternating user and assistant."""
    turns = (
        ("user", "one"),
        ("assistant", "two"),
        ("user", "three"),
        ("assistant", "four"),
    )
    return [{"role": speaker, "content": text} for speaker, text in turns]
|
||||
|
||||
|
||||
def _make_runner(history: list[dict[str, str]]):
    """Assemble a ``GatewayRunner`` without calling its ``__init__``.

    The runner receives a Telegram-only config and a mocked session store
    whose ``get_or_create_session`` returns a fixed ``sess-1`` entry and whose
    ``load_transcript`` returns ``history``.
    """
    from gateway.run import GatewayRunner

    # Allocate without __init__; only the attributes set below exist.
    runner = object.__new__(GatewayRunner)

    telegram_cfg = PlatformConfig(enabled=True, token="***")
    runner.config = GatewayConfig(platforms={Platform.TELEGRAM: telegram_cfg})

    entry = SessionEntry(
        session_key=build_session_key(_make_source()),
        session_id="sess-1",
        created_at=datetime.now(),
        updated_at=datetime.now(),
        platform=Platform.TELEGRAM,
        chat_type="dm",
    )

    store = MagicMock()
    store.get_or_create_session.return_value = entry
    store.load_transcript.return_value = history
    for method_name in ("rewrite_transcript", "update_session", "_save"):
        setattr(store, method_name, MagicMock())
    runner.session_store = store

    return runner
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_compress_works_with_plugin_context_engine():
    """/compress must succeed against an engine lacking ContextCompressor internals.

    The agent's ``context_compressor`` is a real ``_FakePluginEngine`` rather
    than a MagicMock, so it exposes only what that class defines, as a plugin
    such as LCM would. If the gateway reaches for ``_align_boundary_forward``
    or ``_find_tail_cut_by_tokens`` on it, the resulting AttributeError would
    appear in the reply as the user-visible "Compression failed" message that
    selfhostedsoul reported.
    """
    history = _make_history()
    runner = _make_runner(history)
    compressed = [history[0], history[-1]]

    agent = MagicMock()
    agent.shutdown_memory_provider = MagicMock()
    agent.close = MagicMock()
    # Real plugin engine — no MagicMock auto-attributes masking missing helpers.
    agent.context_compressor = _FakePluginEngine()
    agent.session_id = "sess-1"
    agent._compress_context.return_value = (compressed, "")

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent),
        patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100),
    ):
        event = _make_event("/compress")
        result = await runner._handle_compress_command(event)

    # No AttributeError surfaced as "Compression failed: ..."
    unwanted_fragments = (
        "Compression failed",
        "_align_boundary_forward",
        "_find_tail_cut_by_tokens",
    )
    for fragment in unwanted_fragments:
        assert fragment not in result

    # Happy path fired
    agent._compress_context.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_compress_respects_plugin_has_content_to_compress_false():
    """An engine reporting nothing to compress short-circuits /compress.

    The reply must contain "Nothing to compress" and the agent's
    ``_compress_context`` must never be invoked.
    """

    class _EmptyEngine(_FakePluginEngine):
        def has_content_to_compress(self, messages):
            return False

    history = _make_history()
    runner = _make_runner(history)

    agent = MagicMock()
    agent.shutdown_memory_provider = MagicMock()
    agent.close = MagicMock()
    agent.context_compressor = _EmptyEngine()
    agent.session_id = "sess-1"

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent),
        patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100),
    ):
        event = _make_event("/compress")
        result = await runner._handle_compress_command(event)

    assert "Nothing to compress" in result
    agent._compress_context.assert_not_called()
|
||||
@@ -1,104 +0,0 @@
|
||||
"""Regression guard for #14920: wildcard "*" in Discord channel config lists.
|
||||
|
||||
Setting ``allowed_channels: "*"``, ``free_response_channels: "*"``, or
|
||||
``ignored_channels: "*"`` in config (or their ``DISCORD_*_CHANNELS`` env var
|
||||
equivalents) must behave as a wildcard — i.e. the bot responds in every
|
||||
channel (or is silenced in every channel, for the ignored list). Previously
|
||||
the literal string "*" was placed into a set and compared against numeric
|
||||
channel IDs via set-intersection, which always produced an empty set and
|
||||
caused every message to be silently dropped (for ``allowed_channels``) or
|
||||
every ``free_response`` / ``ignored`` check to fail open.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
def _channel_is_allowed(channel_id: str, allowed_channels_raw: str) -> bool:
    """Stand-in for the channel allow-list check in the Discord on_message handler.

    An empty raw value means no restriction. Otherwise the comma-separated
    entries are stripped, blanks are dropped, and the channel passes when the
    list holds ``"*"`` or the channel ID itself.
    """
    if not allowed_channels_raw:
        return True

    entries = set()
    for piece in allowed_channels_raw.split(","):
        token = piece.strip()
        if token:
            entries.add(token)

    # A non-empty raw string made only of blanks yields an empty set, which
    # allows nothing.
    return "*" in entries or channel_id in entries
|
||||
|
||||
|
||||
def _channel_is_ignored(channel_id: str, ignored_channels_raw: str) -> bool:
    """Stand-in for the ignored-channel check in the Discord on_message handler.

    The raw value is split on commas, entries are stripped and blanks dropped;
    the channel counts as ignored when the list holds ``"*"`` or its own ID.
    """
    stripped = (piece.strip() for piece in ignored_channels_raw.split(","))
    ignored = {token for token in stripped if token}
    if "*" in ignored:
        return True
    return channel_id in ignored
|
||||
|
||||
|
||||
def _channel_is_free_response(channel_id: str, free_channels_raw: str) -> bool:
    """Stand-in for the free-response-channel check in the Discord on_message handler.

    The raw value is split on commas, entries are stripped and blanks dropped;
    the channel is free-response when the list holds ``"*"`` or its own ID.
    """
    free_channels = set()
    for piece in free_channels_raw.split(","):
        token = piece.strip()
        if token:
            free_channels.add(token)

    if "*" in free_channels:
        return True
    return channel_id in free_channels
|
||||
|
||||
|
||||
class TestDiscordAllowedChannelsWildcard(unittest.TestCase):
    """Allow-list cases for DISCORD_ALLOWED_CHANNELS: wildcard, explicit IDs, empty."""

    def test_wildcard_allows_any_channel(self):
        """A lone '*' permits an arbitrary channel ID."""
        verdict = _channel_is_allowed("1234567890", "*")
        self.assertTrue(verdict)

    def test_wildcard_in_list_allows_any_channel(self):
        """A '*' among explicit IDs still permits an unlisted channel."""
        verdict = _channel_is_allowed("9999999999", "111,*,222")
        self.assertTrue(verdict)

    def test_exact_match_allowed(self):
        """A channel ID that appears in the list is permitted."""
        listed = "1234567890,9876543210"
        self.assertTrue(_channel_is_allowed("1234567890", listed))

    def test_non_matching_channel_blocked(self):
        """A channel ID missing from the list is rejected."""
        listed = "1234567890,9876543210"
        self.assertFalse(_channel_is_allowed("5555555555", listed))

    def test_empty_allowlist_allows_all(self):
        """An empty DISCORD_ALLOWED_CHANNELS value imposes no restriction."""
        verdict = _channel_is_allowed("1234567890", "")
        self.assertTrue(verdict)

    def test_whitespace_only_entry_ignored(self):
        """Whitespace-only entries are discarded, leaving a list that matches nothing."""
        verdict = _channel_is_allowed("1234567890", "  ,  ")
        self.assertFalse(verdict)
|
||||
|
||||
|
||||
class TestDiscordIgnoredChannelsWildcard(unittest.TestCase):
    """Ignore-list cases for DISCORD_IGNORED_CHANNELS: wildcard, explicit IDs, empty."""

    def test_wildcard_silences_every_channel(self):
        """A '*' in the ignore list marks an arbitrary channel as ignored."""
        verdict = _channel_is_ignored("1234567890", "*")
        self.assertTrue(verdict)

    def test_empty_ignored_list_silences_nothing(self):
        """An empty ignore list ignores no channel."""
        verdict = _channel_is_ignored("1234567890", "")
        self.assertFalse(verdict)

    def test_exact_match_is_ignored(self):
        """A listed channel ID is ignored."""
        verdict = _channel_is_ignored("111", "111,222")
        self.assertTrue(verdict)

    def test_non_match_not_ignored(self):
        """An unlisted channel ID is not ignored."""
        verdict = _channel_is_ignored("333", "111,222")
        self.assertFalse(verdict)
|
||||
|
||||
|
||||
class TestDiscordFreeResponseChannelsWildcard(unittest.TestCase):
    """Wildcard and channel-list behaviour for DISCORD_FREE_RESPONSE_CHANNELS."""

    def test_wildcard_makes_every_channel_free_response(self):
        """'*' in free_response_channels exempts every channel from mention-required."""
        self.assertTrue(_channel_is_free_response("1234567890", "*"))

    def test_wildcard_in_list_applies_everywhere(self):
        """'*' mixed with explicit IDs still applies to every channel."""
        self.assertTrue(_channel_is_free_response("9999999999", "111,*,222"))

    def test_exact_match_is_free_response(self):
        """A channel ID present in the list is a free-response channel."""
        self.assertTrue(_channel_is_free_response("111", "111,222"))

    def test_non_match_not_free_response(self):
        """A channel ID absent from the list is not a free-response channel."""
        self.assertFalse(_channel_is_free_response("333", "111,222"))

    def test_empty_list_no_free_response(self):
        """An empty list makes no channel free-response."""
        self.assertFalse(_channel_is_free_response("111", ""))

    def test_padded_entry_matches_after_stripping(self):
        """Whitespace around an entry is stripped before matching."""
        self.assertTrue(_channel_is_free_response("111", " 111 , 222 "))

    def test_whitespace_only_entries_match_nothing(self):
        """Entries that are only whitespace are dropped, leaving nothing to match."""
        self.assertFalse(_channel_is_free_response("111", "  ,  "))
|
||||
@@ -1,82 +0,0 @@
|
||||
"""Regression tests for the Discord /model picker.
|
||||
|
||||
Uses the shared discord mock from tests/gateway/conftest.py (installed
|
||||
at collection time via _ensure_discord_mock()). Previously this file
|
||||
installed its own mock at module-import time and clobbered sys.modules,
|
||||
breaking other gateway tests under pytest-xdist.
|
||||
"""
|
||||
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway.platforms.discord import ModelPickerView
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_model_picker_clears_controls_before_running_switch_callback():
    """Selecting a model edits the message first, then switches, then edits again.

    Every step appends to a shared ``events`` list so the final assertion can
    pin both the order of the calls and the ``view=None`` passed to each edit.
    """
    events: list[object] = []

    def record_edit(label: str):
        """Build an async side effect that logs an edit call under ``label``."""

        async def _record(**kwargs):
            embed = kwargs["embed"]
            events.append((label, embed.title, embed.description, kwargs["view"]))

        return _record

    async def on_model_selected(chat_id: str, model_id: str, provider_slug: str) -> str:
        # Log the switch so its position relative to the two edits is visible.
        events.append(("switch", chat_id, model_id, provider_slug))
        return "Model switched"

    copilot_provider = {
        "slug": "copilot",
        "name": "GitHub Copilot",
        "models": ["gpt-5.4"],
        "total_models": 1,
        "is_current": True,
    }
    view = ModelPickerView(
        providers=[copilot_provider],
        current_model="gpt-5-mini",
        current_provider="copilot",
        session_key="session-1",
        on_model_selected=on_model_selected,
        allowed_user_ids=set(),
    )
    view._selected_provider = "copilot"

    response = SimpleNamespace(
        defer=AsyncMock(),
        send_message=AsyncMock(),
        edit_message=AsyncMock(side_effect=record_edit("initial-edit")),
    )
    interaction = SimpleNamespace(
        user=SimpleNamespace(id=123),
        channel_id=456,
        data={"values": ["gpt-5.4"]},
        response=response,
        edit_original_response=AsyncMock(side_effect=record_edit("final-edit")),
    )

    await view._on_model_selected(interaction)

    expected_events = [
        ("initial-edit", "⚙ Switching Model", "Switching to `gpt-5.4`...", None),
        ("switch", "456", "gpt-5.4", "copilot"),
        ("final-edit", "⚙ Model Switched", "Model switched", None),
    ]
    assert events == expected_events
    interaction.response.edit_message.assert_awaited_once()
    interaction.response.defer.assert_not_called()
    interaction.edit_original_response.assert_awaited_once()
|
||||
@@ -164,7 +164,7 @@ async def test_auto_registers_missing_gateway_commands(adapter):
|
||||
|
||||
# These commands are gateway-available but were not in the original
|
||||
# hardcoded registration list — they should be auto-registered.
|
||||
expected_auto = {"debug", "yolo", "profile"}
|
||||
expected_auto = {"debug", "yolo", "reload", "profile"}
|
||||
for name in expected_auto:
|
||||
assert name in tree_names, f"/{name} should be auto-registered on Discord"
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user