Compare commits
47 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7efd91d4b4 | |||
| 0aa1269e56 | |||
| 3c29834354 | |||
| 0eb85906b0 | |||
| ff9b0528a2 | |||
| 8feaa7cd1b | |||
| 57a2b97ae8 | |||
| bd9afb027a | |||
| 6051fba9dc | |||
| 2acc8783d1 | |||
| acdcb167fb | |||
| 51f4c9827f | |||
| 2e78a2b6b2 | |||
| 5a1c599412 | |||
| 0f6eabb890 | |||
| eb93f88e1d | |||
| 3ccda2aa05 | |||
| 983bbe2d40 | |||
| 379b2273d9 | |||
| 7db2703b33 | |||
| 7c59e1a871 | |||
| 6fdbf2f2d7 | |||
| 0a679cb7ad | |||
| 41b4d69167 | |||
| 3f343cf7cf | |||
| 4ae5b58cb1 | |||
| 2258a181f0 | |||
| 11b2942f16 | |||
| b08cbc7a79 | |||
| c95c6bdb7c | |||
| bd929ea514 | |||
| 6a20e187dd | |||
| 9ff21437a0 | |||
| 44a0cbe525 | |||
| 2af0848f3c | |||
| 7baf370d3d | |||
| eeda18a9b7 | |||
| 3a9598337f | |||
| 98418afd5d | |||
| 42ff785771 | |||
| 04c489b587 | |||
| 0bb460b070 | |||
| 3504bd401b | |||
| 50d97edbe1 | |||
| e26c4f0e34 | |||
| 24f139e16a | |||
| ef5eaf8d87 |
@@ -53,6 +53,9 @@ jobs:
|
||||
- name: Extract skill metadata for dashboard
|
||||
run: python3 website/scripts/extract-skills.py
|
||||
|
||||
- name: Regenerate per-skill docs pages + catalogs
|
||||
run: python3 website/scripts/generate-skill-docs.py
|
||||
|
||||
- name: Build skills index (if not already present)
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
@@ -36,6 +36,9 @@ jobs:
|
||||
- name: Extract skill metadata for dashboard
|
||||
run: python3 website/scripts/extract-skills.py
|
||||
|
||||
- name: Regenerate per-skill docs pages + catalogs
|
||||
run: python3 website/scripts/generate-skill-docs.py
|
||||
|
||||
- name: Lint docs diagrams
|
||||
run: npm run lint:diagrams
|
||||
working-directory: website
|
||||
|
||||
@@ -45,6 +45,7 @@ class FailoverReason(enum.Enum):
|
||||
|
||||
# Model
|
||||
model_not_found = "model_not_found" # 404 or invalid model — fallback to different model
|
||||
provider_policy_blocked = "provider_policy_blocked" # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy
|
||||
|
||||
# Request format
|
||||
format_error = "format_error" # 400 bad request — abort or strip + retry
|
||||
@@ -194,6 +195,29 @@ _MODEL_NOT_FOUND_PATTERNS = [
|
||||
"unsupported model",
|
||||
]
|
||||
|
||||
# OpenRouter aggregator policy-block patterns.
|
||||
#
|
||||
# When a user's OpenRouter account privacy setting (or a per-request
|
||||
# `provider.data_collection: deny` preference) excludes the only endpoint
|
||||
# serving a model, OpenRouter returns 404 with a *specific* message that is
|
||||
# distinct from "model not found":
|
||||
#
|
||||
# "No endpoints available matching your guardrail restrictions and
|
||||
# data policy. Configure: https://openrouter.ai/settings/privacy"
|
||||
#
|
||||
# We classify this as `provider_policy_blocked` rather than
|
||||
# `model_not_found` because:
|
||||
# - The model *exists* — model_not_found is misleading in logs
|
||||
# - Provider fallback won't help: the account-level setting applies to
|
||||
# every call on the same OpenRouter account
|
||||
# - The error body already contains the fix URL, so the user gets
|
||||
# actionable guidance without us rewriting the message
|
||||
_PROVIDER_POLICY_BLOCKED_PATTERNS = [
|
||||
"no endpoints available matching your guardrail",
|
||||
"no endpoints available matching your data policy",
|
||||
"no endpoints found matching your data policy",
|
||||
]
|
||||
|
||||
# Auth patterns (non-status-code signals)
|
||||
_AUTH_PATTERNS = [
|
||||
"invalid api key",
|
||||
@@ -523,6 +547,17 @@ def _classify_by_status(
|
||||
return _classify_402(error_msg, result_fn)
|
||||
|
||||
if status_code == 404:
|
||||
# OpenRouter policy-block 404 — distinct from "model not found".
|
||||
# The model exists; the user's account privacy setting excludes the
|
||||
# only endpoint serving it. Falling back to another provider won't
|
||||
# help (same account setting applies). The error body already
|
||||
# contains the fix URL, so just surface it.
|
||||
if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.provider_policy_blocked,
|
||||
retryable=False,
|
||||
should_fallback=False,
|
||||
)
|
||||
if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.model_not_found,
|
||||
@@ -640,6 +675,12 @@ def _classify_400(
|
||||
)
|
||||
|
||||
# Some providers return model-not-found as 400 instead of 404 (e.g. OpenRouter).
|
||||
if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.provider_policy_blocked,
|
||||
retryable=False,
|
||||
should_fallback=False,
|
||||
)
|
||||
if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.model_not_found,
|
||||
@@ -812,6 +853,15 @@ def _classify_by_message(
|
||||
should_fallback=True,
|
||||
)
|
||||
|
||||
# Provider policy-block (aggregator-side guardrail) — check before
|
||||
# model_not_found so we don't mis-label as a missing model.
|
||||
if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.provider_policy_blocked,
|
||||
retryable=False,
|
||||
should_fallback=False,
|
||||
)
|
||||
|
||||
# Model not found patterns
|
||||
if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS):
|
||||
return result_fn(
|
||||
|
||||
+121
-2
@@ -123,8 +123,9 @@ DEFAULT_CONTEXT_LENGTHS = {
|
||||
"claude": 200000,
|
||||
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
|
||||
# Source: https://developers.openai.com/api/docs/models
|
||||
# GPT-5.5 (launched Apr 23 2026). Verified via live ChatGPT codex/models
|
||||
# endpoint: bare slug `gpt-5.5`, no -pro/-mini variants. 400k context on Codex.
|
||||
# GPT-5.5 (launched Apr 23 2026). 400k is the fallback for providers we
|
||||
# can't probe live. ChatGPT Codex OAuth actually caps lower (272k as of
|
||||
# Apr 2026) and is resolved via _resolve_codex_oauth_context_length().
|
||||
"gpt-5.5": 400000,
|
||||
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
|
||||
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
|
||||
@@ -1005,6 +1006,115 @@ def _query_anthropic_context_length(model: str, base_url: str, api_key: str) ->
|
||||
return None
|
||||
|
||||
|
||||
# Known ChatGPT Codex OAuth context windows (observed via live
|
||||
# chatgpt.com/backend-api/codex/models probe, Apr 2026). These are the
|
||||
# `context_window` values, which are what Codex actually enforces — the
|
||||
# direct OpenAI API has larger limits for the same slugs, but Codex OAuth
|
||||
# caps lower (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex).
|
||||
#
|
||||
# Used as a fallback when the live probe fails (no token, network error).
|
||||
# Longest keys first so substring match picks the most specific entry.
|
||||
_CODEX_OAUTH_CONTEXT_FALLBACK: Dict[str, int] = {
|
||||
"gpt-5.1-codex-max": 272_000,
|
||||
"gpt-5.1-codex-mini": 272_000,
|
||||
"gpt-5.3-codex": 272_000,
|
||||
"gpt-5.2-codex": 272_000,
|
||||
"gpt-5.4-mini": 272_000,
|
||||
"gpt-5.5": 272_000,
|
||||
"gpt-5.4": 272_000,
|
||||
"gpt-5.2": 272_000,
|
||||
"gpt-5": 272_000,
|
||||
}
|
||||
|
||||
|
||||
_codex_oauth_context_cache: Dict[str, int] = {}
|
||||
_codex_oauth_context_cache_time: float = 0.0
|
||||
_CODEX_OAUTH_CONTEXT_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
|
||||
def _fetch_codex_oauth_context_lengths(access_token: str) -> Dict[str, int]:
|
||||
"""Probe the ChatGPT Codex /models endpoint for per-slug context windows.
|
||||
|
||||
Codex OAuth imposes its own context limits that differ from the direct
|
||||
OpenAI API (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex). The
|
||||
`context_window` field in each model entry is the authoritative source.
|
||||
|
||||
Returns a ``{slug: context_window}`` dict. Empty on failure.
|
||||
"""
|
||||
global _codex_oauth_context_cache, _codex_oauth_context_cache_time
|
||||
now = time.time()
|
||||
if (
|
||||
_codex_oauth_context_cache
|
||||
and now - _codex_oauth_context_cache_time < _CODEX_OAUTH_CONTEXT_CACHE_TTL
|
||||
):
|
||||
return _codex_oauth_context_cache
|
||||
|
||||
try:
|
||||
resp = requests.get(
|
||||
"https://chatgpt.com/backend-api/codex/models?client_version=1.0.0",
|
||||
headers={"Authorization": f"Bearer {access_token}"},
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.debug(
|
||||
"Codex /models probe returned HTTP %s; falling back to hardcoded defaults",
|
||||
resp.status_code,
|
||||
)
|
||||
return {}
|
||||
data = resp.json()
|
||||
except Exception as exc:
|
||||
logger.debug("Codex /models probe failed: %s", exc)
|
||||
return {}
|
||||
|
||||
entries = data.get("models", []) if isinstance(data, dict) else []
|
||||
result: Dict[str, int] = {}
|
||||
for item in entries:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
slug = item.get("slug")
|
||||
ctx = item.get("context_window")
|
||||
if isinstance(slug, str) and isinstance(ctx, int) and ctx > 0:
|
||||
result[slug.strip()] = ctx
|
||||
|
||||
if result:
|
||||
_codex_oauth_context_cache = result
|
||||
_codex_oauth_context_cache_time = now
|
||||
return result
|
||||
|
||||
|
||||
def _resolve_codex_oauth_context_length(
|
||||
model: str, access_token: str = ""
|
||||
) -> Optional[int]:
|
||||
"""Resolve a Codex OAuth model's real context window.
|
||||
|
||||
Prefers a live probe of chatgpt.com/backend-api/codex/models (when we
|
||||
have a bearer token), then falls back to ``_CODEX_OAUTH_CONTEXT_FALLBACK``.
|
||||
"""
|
||||
model_bare = _strip_provider_prefix(model).strip()
|
||||
if not model_bare:
|
||||
return None
|
||||
|
||||
if access_token:
|
||||
live = _fetch_codex_oauth_context_lengths(access_token)
|
||||
if model_bare in live:
|
||||
return live[model_bare]
|
||||
# Case-insensitive match in case casing drifts
|
||||
model_lower = model_bare.lower()
|
||||
for slug, ctx in live.items():
|
||||
if slug.lower() == model_lower:
|
||||
return ctx
|
||||
|
||||
# Fallback: longest-key-first substring match over hardcoded defaults.
|
||||
model_lower = model_bare.lower()
|
||||
for slug, ctx in sorted(
|
||||
_CODEX_OAUTH_CONTEXT_FALLBACK.items(), key=lambda x: len(x[0]), reverse=True
|
||||
):
|
||||
if slug in model_lower:
|
||||
return ctx
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_nous_context_length(model: str) -> Optional[int]:
|
||||
"""Resolve Nous Portal model context length via OpenRouter metadata.
|
||||
|
||||
@@ -1149,6 +1259,15 @@ def get_model_context_length(
|
||||
ctx = _resolve_nous_context_length(model)
|
||||
if ctx:
|
||||
return ctx
|
||||
if effective_provider == "openai-codex":
|
||||
# Codex OAuth enforces lower context limits than the direct OpenAI
|
||||
# API for the same slug (e.g. gpt-5.5 is 1.05M on the API but 272K
|
||||
# on Codex). Authoritative source is Codex's own /models endpoint.
|
||||
codex_ctx = _resolve_codex_oauth_context_length(model, access_token=api_key or "")
|
||||
if codex_ctx:
|
||||
if base_url:
|
||||
save_context_length(model, base_url, codex_ctx)
|
||||
return codex_ctx
|
||||
if effective_provider:
|
||||
from agent.models_dev import lookup_models_dev_context
|
||||
ctx = lookup_models_dev_context(effective_provider, model)
|
||||
|
||||
@@ -0,0 +1,190 @@
|
||||
"""Helpers for translating OpenAI-style tool schemas to Moonshot's schema subset.
|
||||
|
||||
Moonshot (Kimi) accepts a stricter subset of JSON Schema than standard OpenAI
|
||||
tool calling. Requests that violate it fail with HTTP 400:
|
||||
|
||||
tools.function.parameters is not a valid moonshot flavored json schema,
|
||||
details: <...>
|
||||
|
||||
Known rejection modes documented at
|
||||
https://forum.moonshot.ai/t/tool-calling-specification-violation-on-moonshot-api/102
|
||||
and MoonshotAI/kimi-cli#1595:
|
||||
|
||||
1. Every property schema must carry a ``type``. Standard JSON Schema allows
|
||||
type to be omitted (the value is then unconstrained); Moonshot refuses.
|
||||
2. When ``anyOf`` is used, ``type`` must be on the ``anyOf`` children, not
|
||||
the parent. Presence of both causes "type should be defined in anyOf
|
||||
items instead of the parent schema".
|
||||
|
||||
The ``#/definitions/...`` → ``#/$defs/...`` rewrite for draft-07 refs is
|
||||
handled separately in ``tools/mcp_tool._normalize_mcp_input_schema`` so it
|
||||
applies at MCP registration time for all providers.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Keys whose values are maps of name → schema (not schemas themselves).
|
||||
# When we recurse, we walk the values of these maps as schemas, but we do
|
||||
# NOT apply the missing-type repair to the map itself.
|
||||
_SCHEMA_MAP_KEYS = frozenset({"properties", "patternProperties", "$defs", "definitions"})
|
||||
|
||||
# Keys whose values are lists of schemas.
|
||||
_SCHEMA_LIST_KEYS = frozenset({"anyOf", "oneOf", "allOf", "prefixItems"})
|
||||
|
||||
# Keys whose values are a single nested schema.
|
||||
_SCHEMA_NODE_KEYS = frozenset({"items", "contains", "not", "additionalProperties", "propertyNames"})
|
||||
|
||||
|
||||
def _repair_schema(node: Any, is_schema: bool = True) -> Any:
|
||||
"""Recursively apply Moonshot repairs to a schema node.
|
||||
|
||||
``is_schema=True`` means this dict is a JSON Schema node and gets the
|
||||
missing-type + anyOf-parent repairs applied. ``is_schema=False`` means
|
||||
it's a container map (e.g. the value of ``properties``) and we only
|
||||
recurse into its values.
|
||||
"""
|
||||
if isinstance(node, list):
|
||||
# Lists only show up under schema-list keys (anyOf/oneOf/allOf), so
|
||||
# every element is itself a schema.
|
||||
return [_repair_schema(item, is_schema=True) for item in node]
|
||||
if not isinstance(node, dict):
|
||||
return node
|
||||
|
||||
# Walk the dict, deciding per-key whether recursion is into a schema
|
||||
# node, a container map, or a scalar.
|
||||
repaired: Dict[str, Any] = {}
|
||||
for key, value in node.items():
|
||||
if key in _SCHEMA_MAP_KEYS and isinstance(value, dict):
|
||||
# Map of name → schema. Don't treat the map itself as a schema
|
||||
# (it has no type / properties of its own), but each value is.
|
||||
repaired[key] = {
|
||||
sub_key: _repair_schema(sub_val, is_schema=True)
|
||||
for sub_key, sub_val in value.items()
|
||||
}
|
||||
elif key in _SCHEMA_LIST_KEYS and isinstance(value, list):
|
||||
repaired[key] = [_repair_schema(v, is_schema=True) for v in value]
|
||||
elif key in _SCHEMA_NODE_KEYS:
|
||||
# items / not / additionalProperties: single nested schema.
|
||||
# additionalProperties can also be a bool — leave those alone.
|
||||
if isinstance(value, dict):
|
||||
repaired[key] = _repair_schema(value, is_schema=True)
|
||||
else:
|
||||
repaired[key] = value
|
||||
else:
|
||||
# Scalars (description, title, format, enum values, etc.) pass through.
|
||||
repaired[key] = value
|
||||
|
||||
if not is_schema:
|
||||
return repaired
|
||||
|
||||
# Rule 2: when anyOf is present, type belongs only on the children.
|
||||
if "anyOf" in repaired and isinstance(repaired["anyOf"], list):
|
||||
repaired.pop("type", None)
|
||||
return repaired
|
||||
|
||||
# Rule 1: property schemas without type need one. $ref nodes are exempt
|
||||
# — their type comes from the referenced definition.
|
||||
if "$ref" in repaired:
|
||||
return repaired
|
||||
return _fill_missing_type(repaired)
|
||||
|
||||
|
||||
def _fill_missing_type(node: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Infer a reasonable ``type`` if this schema node has none."""
|
||||
if "type" in node and node["type"] not in (None, ""):
|
||||
return node
|
||||
|
||||
# Heuristic: presence of ``properties`` → object, ``items`` → array, ``enum``
|
||||
# → type of first enum value, else fall back to ``string`` (safest scalar).
|
||||
if "properties" in node or "required" in node or "additionalProperties" in node:
|
||||
inferred = "object"
|
||||
elif "items" in node or "prefixItems" in node:
|
||||
inferred = "array"
|
||||
elif "enum" in node and isinstance(node["enum"], list) and node["enum"]:
|
||||
sample = node["enum"][0]
|
||||
if isinstance(sample, bool):
|
||||
inferred = "boolean"
|
||||
elif isinstance(sample, int):
|
||||
inferred = "integer"
|
||||
elif isinstance(sample, float):
|
||||
inferred = "number"
|
||||
else:
|
||||
inferred = "string"
|
||||
else:
|
||||
inferred = "string"
|
||||
|
||||
return {**node, "type": inferred}
|
||||
|
||||
|
||||
def sanitize_moonshot_tool_parameters(parameters: Any) -> Dict[str, Any]:
|
||||
"""Normalize tool parameters to a Moonshot-compatible object schema.
|
||||
|
||||
Returns a deep-copied schema with the two flavored-JSON-Schema repairs
|
||||
applied. Input is not mutated.
|
||||
"""
|
||||
if not isinstance(parameters, dict):
|
||||
return {"type": "object", "properties": {}}
|
||||
|
||||
repaired = _repair_schema(copy.deepcopy(parameters), is_schema=True)
|
||||
if not isinstance(repaired, dict):
|
||||
return {"type": "object", "properties": {}}
|
||||
|
||||
# Top-level must be an object schema
|
||||
if repaired.get("type") != "object":
|
||||
repaired["type"] = "object"
|
||||
if "properties" not in repaired:
|
||||
repaired["properties"] = {}
|
||||
|
||||
return repaired
|
||||
|
||||
|
||||
def sanitize_moonshot_tools(tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Apply ``sanitize_moonshot_tool_parameters`` to every tool's parameters."""
|
||||
if not tools:
|
||||
return tools
|
||||
|
||||
sanitized: List[Dict[str, Any]] = []
|
||||
any_change = False
|
||||
for tool in tools:
|
||||
if not isinstance(tool, dict):
|
||||
sanitized.append(tool)
|
||||
continue
|
||||
fn = tool.get("function")
|
||||
if not isinstance(fn, dict):
|
||||
sanitized.append(tool)
|
||||
continue
|
||||
params = fn.get("parameters")
|
||||
repaired = sanitize_moonshot_tool_parameters(params)
|
||||
if repaired is not params:
|
||||
any_change = True
|
||||
new_fn = {**fn, "parameters": repaired}
|
||||
sanitized.append({**tool, "function": new_fn})
|
||||
else:
|
||||
sanitized.append(tool)
|
||||
|
||||
return sanitized if any_change else tools
|
||||
|
||||
|
||||
def is_moonshot_model(model: str | None) -> bool:
|
||||
"""True for any Kimi / Moonshot model slug, regardless of aggregator prefix.
|
||||
|
||||
Matches bare names (``kimi-k2.6``, ``moonshotai/Kimi-K2.6``) and aggregator-
|
||||
prefixed slugs (``nous/moonshotai/kimi-k2.6``, ``openrouter/moonshotai/...``).
|
||||
Detection by model name covers Nous / OpenRouter / other aggregators that
|
||||
route to Moonshot's inference, where the base URL is the aggregator's, not
|
||||
``api.moonshot.ai``.
|
||||
"""
|
||||
if not model:
|
||||
return False
|
||||
bare = model.strip().lower()
|
||||
# Last path segment (covers aggregator-prefixed slugs)
|
||||
tail = bare.rsplit("/", 1)[-1]
|
||||
if tail.startswith("kimi-") or tail == "kimi":
|
||||
return True
|
||||
# Vendor-prefixed forms commonly used on aggregators
|
||||
if "moonshot" in bare or "/kimi" in bare or bare.startswith("kimi"):
|
||||
return True
|
||||
return False
|
||||
@@ -12,6 +12,7 @@ reasoning configuration, temperature handling, and extra_body assembly.
|
||||
import copy
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
|
||||
from agent.prompt_builder import DEVELOPER_ROLE_MODELS
|
||||
from agent.transports.base import ProviderTransport
|
||||
from agent.transports.types import NormalizedResponse, ToolCall, Usage
|
||||
@@ -172,6 +173,11 @@ class ChatCompletionsTransport(ProviderTransport):
|
||||
|
||||
# Tools
|
||||
if tools:
|
||||
# Moonshot/Kimi uses a stricter flavored JSON Schema. Rewriting
|
||||
# tool parameters here keeps aggregator routes (Nous, OpenRouter,
|
||||
# etc.) compatible, in addition to direct moonshot.ai endpoints.
|
||||
if is_moonshot_model(model):
|
||||
tools = sanitize_moonshot_tools(tools)
|
||||
api_kwargs["tools"] = tools
|
||||
|
||||
# max_tokens resolution — priority: ephemeral > user > provider default
|
||||
|
||||
@@ -6685,6 +6685,13 @@ class HermesCLI:
|
||||
print(f" ⚠ Port {_port} is not reachable at {cdp_url}")
|
||||
|
||||
os.environ["BROWSER_CDP_URL"] = cdp_url
|
||||
# Eagerly start the CDP supervisor so pending_dialogs + frame_tree
|
||||
# show up in the next browser_snapshot. No-op if already started.
|
||||
try:
|
||||
from tools.browser_tool import _ensure_cdp_supervisor # type: ignore[import-not-found]
|
||||
_ensure_cdp_supervisor("default")
|
||||
except Exception:
|
||||
pass
|
||||
print()
|
||||
print("🌐 Browser connected to live Chrome via CDP")
|
||||
print(f" Endpoint: {cdp_url}")
|
||||
@@ -6706,7 +6713,8 @@ class HermesCLI:
|
||||
if current:
|
||||
os.environ.pop("BROWSER_CDP_URL", None)
|
||||
try:
|
||||
from tools.browser_tool import cleanup_all_browsers
|
||||
from tools.browser_tool import cleanup_all_browsers, _stop_cdp_supervisor
|
||||
_stop_cdp_supervisor("default")
|
||||
cleanup_all_browsers()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
+32
-1
@@ -40,6 +40,37 @@ from hermes_time import now as _hermes_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _resolve_cron_enabled_toolsets(job: dict, cfg: dict) -> list[str] | None:
|
||||
"""Resolve the toolset list for a cron job.
|
||||
|
||||
Precedence:
|
||||
1. Per-job ``enabled_toolsets`` (set via ``cronjob`` tool on create/update).
|
||||
Keeps the agent's job-scoped toolset override intact — #6130.
|
||||
2. Per-platform ``hermes tools`` config for the ``cron`` platform.
|
||||
Mirrors gateway behavior (``_get_platform_tools(cfg, platform_key)``)
|
||||
so users can gate cron toolsets globally without recreating every job.
|
||||
3. ``None`` on any lookup failure — AIAgent loads the full default set
|
||||
(legacy behavior before this change, preserved as the safety net).
|
||||
|
||||
_DEFAULT_OFF_TOOLSETS ({moa, homeassistant, rl}) are removed by
|
||||
``_get_platform_tools`` for unconfigured platforms, so fresh installs
|
||||
get cron WITHOUT ``moa`` by default (issue reported by Norbert —
|
||||
surprise $4.63 run).
|
||||
"""
|
||||
per_job = job.get("enabled_toolsets")
|
||||
if per_job:
|
||||
return per_job
|
||||
try:
|
||||
from hermes_cli.tools_config import _get_platform_tools # lazy: avoid heavy import at cron module load
|
||||
return sorted(_get_platform_tools(cfg or {}, "cron"))
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Cron toolset resolution failed, falling back to full default toolset: %s",
|
||||
exc,
|
||||
)
|
||||
return None
|
||||
|
||||
# Valid delivery platforms — used to validate user-supplied platform names
|
||||
# in cron delivery targets, preventing env var enumeration via crafted names.
|
||||
_KNOWN_DELIVERY_PLATFORMS = frozenset({
|
||||
@@ -886,7 +917,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
providers_ignored=pr.get("ignore"),
|
||||
providers_order=pr.get("order"),
|
||||
provider_sort=pr.get("sort"),
|
||||
enabled_toolsets=job.get("enabled_toolsets") or None,
|
||||
enabled_toolsets=_resolve_cron_enabled_toolsets(job, _cfg),
|
||||
disabled_toolsets=["cronjob", "messaging", "clarify"],
|
||||
quiet_mode=True,
|
||||
skip_context_files=True, # Don't inject SOUL.md/AGENTS.md from scheduler cwd
|
||||
|
||||
@@ -2440,6 +2440,9 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt: Optional[str] = None,
|
||||
chat_id_alt: Optional[str] = None,
|
||||
is_bot: bool = False,
|
||||
guild_id: Optional[str] = None,
|
||||
parent_chat_id: Optional[str] = None,
|
||||
message_id: Optional[str] = None,
|
||||
) -> SessionSource:
|
||||
"""Helper to build a SessionSource for this platform."""
|
||||
# Normalize empty topic to None
|
||||
@@ -2457,6 +2460,9 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt=user_id_alt,
|
||||
chat_id_alt=chat_id_alt,
|
||||
is_bot=is_bot,
|
||||
guild_id=str(guild_id) if guild_id else None,
|
||||
parent_chat_id=str(parent_chat_id) if parent_chat_id else None,
|
||||
message_id=str(message_id) if message_id else None,
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -3256,6 +3256,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
if auto_thread and not skip_thread and not is_voice_linked_channel and not is_reply_message:
|
||||
thread = await self._auto_create_thread(message)
|
||||
if thread:
|
||||
parent_channel_id = str(message.channel.id)
|
||||
is_thread = True
|
||||
thread_id = str(thread.id)
|
||||
auto_threaded_channel = thread
|
||||
@@ -3315,6 +3316,9 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
thread_id=thread_id,
|
||||
chat_topic=chat_topic,
|
||||
is_bot=getattr(message.author, "bot", False),
|
||||
guild_id=str(message.guild.id) if message.guild else None,
|
||||
parent_chat_id=parent_channel_id,
|
||||
message_id=str(message.id),
|
||||
)
|
||||
|
||||
# Build media URLs -- download image attachments to local cache so the
|
||||
|
||||
+41
-9
@@ -83,6 +83,9 @@ class SessionSource:
|
||||
user_id_alt: Optional[str] = None # Platform-specific stable alt ID (Signal UUID, Feishu union_id)
|
||||
chat_id_alt: Optional[str] = None # Signal group internal ID
|
||||
is_bot: bool = False # True when the message author is a bot/webhook (Discord)
|
||||
guild_id: Optional[str] = None # Discord guild / Slack workspace / Matrix server scope
|
||||
parent_chat_id: Optional[str] = None # Parent channel when chat_id refers to a thread
|
||||
message_id: Optional[str] = None # ID of the triggering message (for pin/reply/react)
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
@@ -120,8 +123,14 @@ class SessionSource:
|
||||
d["user_id_alt"] = self.user_id_alt
|
||||
if self.chat_id_alt:
|
||||
d["chat_id_alt"] = self.chat_id_alt
|
||||
if self.guild_id:
|
||||
d["guild_id"] = self.guild_id
|
||||
if self.parent_chat_id:
|
||||
d["parent_chat_id"] = self.parent_chat_id
|
||||
if self.message_id:
|
||||
d["message_id"] = self.message_id
|
||||
return d
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "SessionSource":
|
||||
return cls(
|
||||
@@ -135,6 +144,9 @@ class SessionSource:
|
||||
chat_topic=data.get("chat_topic"),
|
||||
user_id_alt=data.get("user_id_alt"),
|
||||
chat_id_alt=data.get("chat_id_alt"),
|
||||
guild_id=data.get("guild_id"),
|
||||
parent_chat_id=data.get("parent_chat_id"),
|
||||
message_id=data.get("message_id"),
|
||||
)
|
||||
|
||||
|
||||
@@ -273,14 +285,34 @@ def build_session_context_prompt(
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
elif context.source.platform == Platform.DISCORD:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
# The discord tool self-gates on DISCORD_BOT_TOKEN at registry
|
||||
# check time. Match that condition so the prompt stays honest:
|
||||
# with a token the agent has fetch_messages/search_members/
|
||||
# create_thread (and optionally discord_admin) and should know
|
||||
# the IDs it can call them with; without one it really is
|
||||
# limited to reading/replying via the gateway.
|
||||
if (os.environ.get("DISCORD_BOT_TOKEN") or "").strip():
|
||||
src = context.source
|
||||
id_lines = ["", "**Discord IDs (for the `discord` / `discord_admin` tools):**"]
|
||||
if src.guild_id:
|
||||
id_lines.append(f" - Guild: `{src.guild_id}`")
|
||||
if src.thread_id and src.parent_chat_id:
|
||||
id_lines.append(f" - Parent channel: `{src.parent_chat_id}`")
|
||||
id_lines.append(f" - Thread: `{src.thread_id}` (use as `channel_id` for fetch_messages etc.)")
|
||||
else:
|
||||
id_lines.append(f" - Channel: `{src.chat_id}`")
|
||||
if src.message_id:
|
||||
id_lines.append(f" - Triggering message: `{src.message_id}`")
|
||||
lines.extend(id_lines)
|
||||
else:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
|
||||
# Connected platforms
|
||||
platforms_list = ["local (files on this machine)"]
|
||||
|
||||
+54
-1
@@ -238,6 +238,52 @@ def get_git_banner_state(repo_dir: Optional[Path] = None) -> Optional[dict]:
|
||||
return {"upstream": upstream, "local": local, "ahead": max(ahead, 0)}
|
||||
|
||||
|
||||
_RELEASE_URL_BASE = "https://github.com/NousResearch/hermes-agent/releases/tag"
|
||||
_latest_release_cache: Optional[tuple] = None # (tag, url) once resolved
|
||||
|
||||
|
||||
def get_latest_release_tag(repo_dir: Optional[Path] = None) -> Optional[tuple]:
|
||||
"""Return ``(tag, release_url)`` for the latest git tag, or None.
|
||||
|
||||
Local-only — runs ``git describe --tags --abbrev=0`` against the
|
||||
Hermes checkout. Cached per-process. Release URL always points at the
|
||||
canonical NousResearch/hermes-agent repo (forks don't get a link).
|
||||
"""
|
||||
global _latest_release_cache
|
||||
if _latest_release_cache is not None:
|
||||
return _latest_release_cache or None
|
||||
|
||||
repo_dir = repo_dir or _resolve_repo_dir()
|
||||
if repo_dir is None:
|
||||
_latest_release_cache = () # falsy sentinel — skip future lookups
|
||||
return None
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "describe", "--tags", "--abbrev=0"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=3,
|
||||
cwd=str(repo_dir),
|
||||
)
|
||||
except Exception:
|
||||
_latest_release_cache = ()
|
||||
return None
|
||||
|
||||
if result.returncode != 0:
|
||||
_latest_release_cache = ()
|
||||
return None
|
||||
|
||||
tag = (result.stdout or "").strip()
|
||||
if not tag:
|
||||
_latest_release_cache = ()
|
||||
return None
|
||||
|
||||
url = f"{_RELEASE_URL_BASE}/{tag}"
|
||||
_latest_release_cache = (tag, url)
|
||||
return _latest_release_cache
|
||||
|
||||
|
||||
def format_banner_version_label() -> str:
|
||||
"""Return the version label shown in the startup banner title."""
|
||||
base = f"Hermes Agent v{VERSION} ({RELEASE_DATE})"
|
||||
@@ -519,9 +565,16 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
|
||||
agent_name = _skin_branding("agent_name", "Hermes Agent")
|
||||
title_color = _skin_color("banner_title", "#FFD700")
|
||||
border_color = _skin_color("banner_border", "#CD7F32")
|
||||
version_label = format_banner_version_label()
|
||||
release_info = get_latest_release_tag()
|
||||
if release_info:
|
||||
_tag, _url = release_info
|
||||
title_markup = f"[bold {title_color}][link={_url}]{version_label}[/link][/]"
|
||||
else:
|
||||
title_markup = f"[bold {title_color}]{version_label}[/]"
|
||||
outer_panel = Panel(
|
||||
layout_table,
|
||||
title=f"[bold {title_color}]{format_banner_version_label()}[/]",
|
||||
title=title_markup,
|
||||
border_style=border_color,
|
||||
padding=(0, 2),
|
||||
)
|
||||
|
||||
+32
-2
@@ -466,6 +466,12 @@ DEFAULT_CONFIG = {
|
||||
"record_sessions": False, # Auto-record browser sessions as WebM videos
|
||||
"allow_private_urls": False, # Allow navigating to private/internal IPs (localhost, 192.168.x.x, etc.)
|
||||
"cdp_url": "", # Optional persistent CDP endpoint for attaching to an existing Chromium/Chrome
|
||||
# CDP supervisor — dialog + frame detection via a persistent WebSocket.
|
||||
# Active only when a CDP-capable backend is attached (Browserbase or
|
||||
# local Chrome via /browser connect). See
|
||||
# website/docs/developer-guide/browser-supervisor.md.
|
||||
"dialog_policy": "must_respond", # must_respond | auto_dismiss | auto_accept
|
||||
"dialog_timeout_s": 300, # Safety auto-dismiss after N seconds under must_respond
|
||||
"camofox": {
|
||||
# When true, Hermes sends a stable profile-scoped userId to Camofox
|
||||
# so the server maps it to a persistent Firefox profile automatically.
|
||||
@@ -486,7 +492,27 @@ DEFAULT_CONFIG = {
|
||||
# exceed this are rejected with guidance to use offset+limit.
|
||||
# 100K chars ≈ 25–35K tokens across typical tokenisers.
|
||||
"file_read_max_chars": 100_000,
|
||||
|
||||
|
||||
# Tool-output truncation thresholds. When terminal output or a
|
||||
# single read_file page exceeds these limits, Hermes truncates the
|
||||
# payload sent to the model (keeping head + tail for terminal,
|
||||
# enforcing pagination for read_file). Tuning these trades context
|
||||
# footprint against how much raw output the model can see in one
|
||||
# shot. Ported from anomalyco/opencode PR #23770.
|
||||
#
|
||||
# - max_bytes: terminal_tool output cap, in chars
|
||||
# (default 50_000 ≈ 12-15K tokens).
|
||||
# - max_lines: read_file pagination cap — the maximum `limit`
|
||||
# a single read_file call can request before
|
||||
# being clamped (default 2000).
|
||||
# - max_line_length: per-line cap applied when read_file emits a
|
||||
# line-numbered view (default 2000 chars).
|
||||
"tool_output": {
|
||||
"max_bytes": 50_000,
|
||||
"max_lines": 2000,
|
||||
"max_line_length": 2000,
|
||||
},
|
||||
|
||||
"compression": {
|
||||
"enabled": True,
|
||||
"threshold": 0.50, # compress when context usage exceeds this ratio
|
||||
@@ -739,6 +765,10 @@ DEFAULT_CONFIG = {
|
||||
"inherit_mcp_toolsets": True,
|
||||
"max_iterations": 50, # per-subagent iteration cap (each subagent gets its own budget,
|
||||
# independent of the parent's max_iterations)
|
||||
"child_timeout_seconds": 600, # wall-clock timeout for each child agent (floor 30s,
|
||||
# no ceiling). High-reasoning models on large tasks
|
||||
# (e.g. gpt-5.5 xhigh, opus-4.6) need generous budgets;
|
||||
# raise if children time out before producing output.
|
||||
"reasoning_effort": "", # reasoning effort for subagents: "xhigh", "high", "medium",
|
||||
# "low", "minimal", "none" (empty = inherit parent's level)
|
||||
"max_concurrent_children": 3, # max parallel children per batch; floor of 1 enforced, no ceiling
|
||||
@@ -803,7 +833,7 @@ DEFAULT_CONFIG = {
|
||||
"auto_thread": True, # Auto-create threads on @mention in channels (like Slack)
|
||||
"reactions": True, # Add 👀/✅/❌ reactions to messages during processing
|
||||
"channel_prompts": {}, # Per-channel ephemeral system prompts (forum parents apply to child threads)
|
||||
# discord_server tool: restrict which actions the agent may call.
|
||||
# discord / discord_admin tools: restrict which actions the agent may call.
|
||||
# Default (empty) = all actions allowed (subject to bot privileged intents).
|
||||
# Accepts comma-separated string ("list_guilds,list_channels,fetch_messages")
|
||||
# or YAML list. Unknown names are dropped with a warning at load time.
|
||||
|
||||
@@ -33,6 +33,8 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
|
||||
# (model_id, display description shown in menus)
|
||||
OPENROUTER_MODELS: list[tuple[str, str]] = [
|
||||
("moonshotai/kimi-k2.6", "recommended"),
|
||||
("deepseek/deepseek-v4-pro", ""),
|
||||
("deepseek/deepseek-v4-flash", ""),
|
||||
("anthropic/claude-opus-4.7", ""),
|
||||
("anthropic/claude-opus-4.6", ""),
|
||||
("anthropic/claude-sonnet-4.6", ""),
|
||||
@@ -109,6 +111,8 @@ def _codex_curated_models() -> list[str]:
|
||||
_PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"nous": [
|
||||
"moonshotai/kimi-k2.6",
|
||||
"deepseek/deepseek-v4-pro",
|
||||
"deepseek/deepseek-v4-flash",
|
||||
"xiaomi/mimo-v2.5-pro",
|
||||
"xiaomi/mimo-v2.5",
|
||||
"anthropic/claude-opus-4.7",
|
||||
@@ -246,6 +250,8 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"claude-haiku-4-5-20251001",
|
||||
],
|
||||
"deepseek": [
|
||||
"deepseek-v4-pro",
|
||||
"deepseek-v4-flash",
|
||||
"deepseek-chat",
|
||||
"deepseek-reasoner",
|
||||
],
|
||||
|
||||
@@ -38,6 +38,7 @@ PLATFORMS: OrderedDict[str, PlatformInfo] = OrderedDict([
|
||||
("qqbot", PlatformInfo(label="💬 QQBot", default_toolset="hermes-qqbot")),
|
||||
("webhook", PlatformInfo(label="🔗 Webhook", default_toolset="hermes-webhook")),
|
||||
("api_server", PlatformInfo(label="🌐 API Server", default_toolset="hermes-api-server")),
|
||||
("cron", PlatformInfo(label="⏰ Cron", default_toolset="hermes-cron")),
|
||||
])
|
||||
|
||||
|
||||
|
||||
@@ -67,12 +67,13 @@ CONFIGURABLE_TOOLSETS = [
|
||||
("messaging", "📨 Cross-Platform Messaging", "send_message"),
|
||||
("rl", "🧪 RL Training", "Tinker-Atropos training tools"),
|
||||
("homeassistant", "🏠 Home Assistant", "smart home device control"),
|
||||
("discord_admin", "🛡️ Discord Server Admin", "list channels/roles, pin, assign roles"),
|
||||
]
|
||||
|
||||
# Toolsets that are OFF by default for new installs.
|
||||
# They're still in _HERMES_CORE_TOOLS (available at runtime if enabled),
|
||||
# but the setup checklist won't pre-select them for first-time users.
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl"}
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl", "discord_admin"}
|
||||
|
||||
|
||||
def _get_effective_configurable_toolsets():
|
||||
@@ -549,7 +550,7 @@ def _get_platform_tools(
|
||||
include_default_mcp_servers: bool = True,
|
||||
) -> Set[str]:
|
||||
"""Resolve which individual toolset names are enabled for a platform."""
|
||||
from toolsets import resolve_toolset
|
||||
from toolsets import resolve_toolset, TOOLSETS
|
||||
|
||||
platform_toolsets = config.get("platform_toolsets") or {}
|
||||
toolset_names = platform_toolsets.get(platform)
|
||||
@@ -563,6 +564,8 @@ def _get_platform_tools(
|
||||
toolset_names = [str(ts) for ts in toolset_names]
|
||||
|
||||
configurable_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
|
||||
# If the saved list contains any configurable keys directly, the user
|
||||
# has explicitly configured this platform — use direct membership.
|
||||
@@ -585,16 +588,46 @@ def _get_platform_tools(
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if ts_tools and ts_tools.issubset(all_tool_names):
|
||||
enabled_toolsets.add(ts_key)
|
||||
|
||||
default_off = set(_DEFAULT_OFF_TOOLSETS)
|
||||
if platform in default_off:
|
||||
default_off.remove(platform)
|
||||
enabled_toolsets -= default_off
|
||||
|
||||
# Recover non-configurable platform toolsets (e.g. discord, feishu_doc,
|
||||
# feishu_drive). These are part of the platform's default composite but
|
||||
# absent from CONFIGURABLE_TOOLSETS, so they can't appear in the TUI
|
||||
# checklist or in a user-saved config. Must run in BOTH branches —
|
||||
# otherwise saving via `hermes tools` (which flips has_explicit_config
|
||||
# to True) silently drops them.
|
||||
platform_tool_universe = set(resolve_toolset(PLATFORMS[platform]["default_toolset"]))
|
||||
configurable_tool_universe = set()
|
||||
for ck in configurable_keys:
|
||||
configurable_tool_universe.update(resolve_toolset(ck))
|
||||
claimed = set()
|
||||
for ts_key in enabled_toolsets:
|
||||
claimed.update(resolve_toolset(ts_key))
|
||||
skip = configurable_keys | plugin_ts_keys | platform_default_keys
|
||||
skip |= {k for k in TOOLSETS if k.startswith("hermes-")}
|
||||
skip |= set(_DEFAULT_OFF_TOOLSETS) - {platform}
|
||||
for ts_key, ts_def in TOOLSETS.items():
|
||||
if ts_key in skip:
|
||||
continue
|
||||
if ts_def.get("includes"):
|
||||
continue
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if not ts_tools or not ts_tools.issubset(platform_tool_universe):
|
||||
continue
|
||||
if ts_tools.issubset(configurable_tool_universe):
|
||||
continue
|
||||
if not ts_tools.issubset(claimed):
|
||||
enabled_toolsets.add(ts_key)
|
||||
claimed.update(ts_tools)
|
||||
|
||||
# Plugin toolsets: enabled by default unless explicitly disabled.
|
||||
# A plugin toolset is "known" for a platform once `hermes tools`
|
||||
# has been saved for that platform (tracked via known_plugin_toolsets).
|
||||
# Unknown plugins default to enabled; known-but-absent = disabled.
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
if plugin_ts_keys:
|
||||
known_map = config.get("known_plugin_toolsets", {})
|
||||
known_for_platform = set(known_map.get(platform, []))
|
||||
@@ -609,7 +642,6 @@ def _get_platform_tools(
|
||||
|
||||
# Preserve any explicit non-configurable toolset entries (for example,
|
||||
# custom toolsets or MCP server names saved in platform_toolsets).
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
explicit_passthrough = {
|
||||
ts
|
||||
for ts in toolset_names
|
||||
@@ -669,6 +701,7 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
existing_toolsets = config.get("platform_toolsets", {}).get(platform, [])
|
||||
if not isinstance(existing_toolsets, list):
|
||||
existing_toolsets = []
|
||||
existing_toolsets = [str(ts) for ts in existing_toolsets]
|
||||
|
||||
# Preserve any entries that are NOT configurable toolsets and NOT platform
|
||||
# defaults (i.e. only MCP server names should be preserved)
|
||||
@@ -676,6 +709,8 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
entry for entry in existing_toolsets
|
||||
if entry not in configurable_keys and entry not in platform_default_keys
|
||||
}
|
||||
if "no_mcp" not in enabled_toolset_keys:
|
||||
preserved_entries.discard("no_mcp")
|
||||
|
||||
# Merge preserved entries with new enabled toolsets
|
||||
config["platform_toolsets"][platform] = sorted(enabled_toolset_keys | preserved_entries)
|
||||
|
||||
@@ -0,0 +1,548 @@
|
||||
"""Process-wide voice recording + TTS API for the TUI gateway.
|
||||
|
||||
Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
|
||||
(text-to-speech) behind idempotent, stateful entry points that the gateway's
|
||||
``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
|
||||
call from a dedicated thread. The gateway imports this module lazily so that
|
||||
missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
|
||||
an ``ImportError`` at call time, not at startup.
|
||||
|
||||
Two usage modes are exposed:
|
||||
|
||||
* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
|
||||
manually-bounded capture used when the caller drives the start/stop pair
|
||||
explicitly.
|
||||
* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
|
||||
the classic CLI voice mode: recording auto-stops on silence, transcribes,
|
||||
hands the result to a callback, and then auto-restarts for the next turn.
|
||||
Three consecutive no-speech cycles stop the loop and fire
|
||||
``on_silent_limit`` so the UI can turn the mode off.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from tools.voice_mode import (
|
||||
create_audio_recorder,
|
||||
is_whisper_hallucination,
|
||||
play_audio_file,
|
||||
transcribe_recording,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _debug(msg: str) -> None:
|
||||
"""Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1.
|
||||
|
||||
Goes to stderr so the TUI gateway wraps it as a gateway.stderr event,
|
||||
which createGatewayEventHandler shows as an Activity line — exactly
|
||||
what we need to diagnose "why didn't the loop auto-restart?" in the
|
||||
user's real terminal without shipping a separate debug RPC.
|
||||
|
||||
Any OSError / BrokenPipeError is swallowed because this fires from
|
||||
background threads (silence callback, TTS daemon, beep) where a
|
||||
broken stderr pipe must not kill the whole gateway — the main
|
||||
command pipe (stdin+stdout) is what actually matters.
|
||||
"""
|
||||
if os.environ.get("HERMES_VOICE_DEBUG", "").strip() != "1":
|
||||
return
|
||||
try:
|
||||
print(f"[voice] {msg}", file=sys.stderr, flush=True)
|
||||
except (BrokenPipeError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
def _beeps_enabled() -> bool:
|
||||
"""CLI parity: voice.beep_enabled in config.yaml (default True)."""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
voice_cfg = load_config().get("voice", {})
|
||||
if isinstance(voice_cfg, dict):
|
||||
return bool(voice_cfg.get("beep_enabled", True))
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
def _play_beep(frequency: int, count: int = 1) -> None:
|
||||
"""Audible cue matching cli.py's record/stop beeps.
|
||||
|
||||
880 Hz single-beep on start (cli.py:_voice_start_recording line 7532),
|
||||
660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585).
|
||||
Best-effort — sounddevice failures are silently swallowed so the
|
||||
voice loop never breaks because a speaker was unavailable.
|
||||
"""
|
||||
if not _beeps_enabled():
|
||||
return
|
||||
try:
|
||||
from tools.voice_mode import play_beep
|
||||
|
||||
play_beep(frequency=frequency, count=count)
|
||||
except Exception as e:
|
||||
_debug(f"beep {frequency}Hz failed: {e}")
|
||||
|
||||
# ── Push-to-talk state ───────────────────────────────────────────────
|
||||
_recorder = None
|
||||
_recorder_lock = threading.Lock()
|
||||
|
||||
# ── Continuous (VAD) state ───────────────────────────────────────────
|
||||
_continuous_lock = threading.Lock()
|
||||
_continuous_active = False
|
||||
_continuous_recorder: Any = None
|
||||
|
||||
# ── TTS-vs-STT feedback guard ────────────────────────────────────────
|
||||
# When TTS plays the agent reply over the speakers, the live microphone
|
||||
# picks it up and transcribes the agent's own voice as user input — an
|
||||
# infinite loop the agent happily joins ("Ha, looks like we're in a loop").
|
||||
# This Event mirrors cli.py:_voice_tts_done: cleared while speak_text is
|
||||
# playing, set while silent. _continuous_on_silence waits on it before
|
||||
# re-arming the recorder, and speak_text itself cancels any live capture
|
||||
# before starting playback so the tail of the previous utterance doesn't
|
||||
# leak into the mic.
|
||||
_tts_playing = threading.Event()
|
||||
_tts_playing.set() # initially "not playing"
|
||||
_continuous_on_transcript: Optional[Callable[[str], None]] = None
|
||||
_continuous_on_status: Optional[Callable[[str], None]] = None
|
||||
_continuous_on_silent_limit: Optional[Callable[[], None]] = None
|
||||
_continuous_no_speech_count = 0
|
||||
_CONTINUOUS_NO_SPEECH_LIMIT = 3
|
||||
|
||||
|
||||
# ── Push-to-talk API ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def start_recording() -> None:
|
||||
"""Begin capturing from the default input device (push-to-talk).
|
||||
|
||||
Idempotent — calling again while a recording is in progress is a no-op.
|
||||
"""
|
||||
global _recorder
|
||||
|
||||
with _recorder_lock:
|
||||
if _recorder is not None and getattr(_recorder, "is_recording", False):
|
||||
return
|
||||
rec = create_audio_recorder()
|
||||
rec.start()
|
||||
_recorder = rec
|
||||
|
||||
|
||||
def stop_and_transcribe() -> Optional[str]:
|
||||
"""Stop the active push-to-talk recording, transcribe, return text.
|
||||
|
||||
Returns ``None`` when no recording is active, when the microphone
|
||||
captured no speech, or when Whisper returned a known hallucination.
|
||||
"""
|
||||
global _recorder
|
||||
|
||||
with _recorder_lock:
|
||||
rec = _recorder
|
||||
_recorder = None
|
||||
|
||||
if rec is None:
|
||||
return None
|
||||
|
||||
wav_path = rec.stop()
|
||||
if not wav_path:
|
||||
return None
|
||||
|
||||
try:
|
||||
result = transcribe_recording(wav_path)
|
||||
except Exception as e:
|
||||
logger.warning("voice transcription failed: %s", e)
|
||||
return None
|
||||
finally:
|
||||
try:
|
||||
if os.path.isfile(wav_path):
|
||||
os.unlink(wav_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# transcribe_recording returns {"success": bool, "transcript": str, ...}
|
||||
# — matches cli.py:_voice_stop_and_transcribe's result.get("transcript").
|
||||
if not result.get("success"):
|
||||
return None
|
||||
text = (result.get("transcript") or "").strip()
|
||||
if not text or is_whisper_hallucination(text):
|
||||
return None
|
||||
|
||||
return text
|
||||
|
||||
|
||||
# ── Continuous (VAD) API ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def start_continuous(
|
||||
on_transcript: Callable[[str], None],
|
||||
on_status: Optional[Callable[[str], None]] = None,
|
||||
on_silent_limit: Optional[Callable[[], None]] = None,
|
||||
silence_threshold: int = 200,
|
||||
silence_duration: float = 3.0,
|
||||
) -> None:
|
||||
"""Start a VAD-driven continuous recording loop.
|
||||
|
||||
The loop calls ``on_transcript(text)`` each time speech is detected and
|
||||
transcribed successfully, then auto-restarts. After
|
||||
``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
|
||||
picked up at all) the loop stops itself and calls ``on_silent_limit``
|
||||
so the UI can reflect "voice off". Idempotent — calling while already
|
||||
active is a no-op.
|
||||
|
||||
``on_status`` is called with ``"listening"`` / ``"transcribing"`` /
|
||||
``"idle"`` so the UI can show a live indicator.
|
||||
"""
|
||||
global _continuous_active, _continuous_recorder
|
||||
global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
|
||||
global _continuous_no_speech_count
|
||||
|
||||
with _continuous_lock:
|
||||
if _continuous_active:
|
||||
_debug("start_continuous: already active — no-op")
|
||||
return
|
||||
_continuous_active = True
|
||||
_continuous_on_transcript = on_transcript
|
||||
_continuous_on_status = on_status
|
||||
_continuous_on_silent_limit = on_silent_limit
|
||||
_continuous_no_speech_count = 0
|
||||
|
||||
if _continuous_recorder is None:
|
||||
_continuous_recorder = create_audio_recorder()
|
||||
|
||||
_continuous_recorder._silence_threshold = silence_threshold
|
||||
_continuous_recorder._silence_duration = silence_duration
|
||||
rec = _continuous_recorder
|
||||
|
||||
_debug(
|
||||
f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
|
||||
)
|
||||
|
||||
# CLI parity: single 880 Hz beep *before* opening the stream — placing
|
||||
# the beep after stream.start() on macOS triggers a CoreAudio conflict
|
||||
# (cli.py:7528 comment).
|
||||
_play_beep(frequency=880, count=1)
|
||||
|
||||
try:
|
||||
rec.start(on_silence_stop=_continuous_on_silence)
|
||||
except Exception as e:
|
||||
logger.error("failed to start continuous recording: %s", e)
|
||||
_debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
raise
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("listening")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def stop_continuous() -> None:
|
||||
"""Stop the active continuous loop and release the microphone.
|
||||
|
||||
Idempotent — calling while not active is a no-op. Any in-flight
|
||||
transcription completes but its result is discarded (the callback
|
||||
checks ``_continuous_active`` before firing).
|
||||
"""
|
||||
global _continuous_active, _continuous_on_transcript
|
||||
global _continuous_on_status, _continuous_on_silent_limit
|
||||
global _continuous_recorder, _continuous_no_speech_count
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
return
|
||||
_continuous_active = False
|
||||
rec = _continuous_recorder
|
||||
on_status = _continuous_on_status
|
||||
_continuous_on_transcript = None
|
||||
_continuous_on_status = None
|
||||
_continuous_on_silent_limit = None
|
||||
_continuous_no_speech_count = 0
|
||||
|
||||
if rec is not None:
|
||||
try:
|
||||
# cancel() (not stop()) discards buffered frames — the loop
|
||||
# is over, we don't want to transcribe a half-captured turn.
|
||||
rec.cancel()
|
||||
except Exception as e:
|
||||
logger.warning("failed to cancel recorder: %s", e)
|
||||
|
||||
# Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the
|
||||
# silence-auto-stop path plays).
|
||||
_play_beep(frequency=660, count=2)
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("idle")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def is_continuous_active() -> bool:
|
||||
"""Whether a continuous voice loop is currently running."""
|
||||
with _continuous_lock:
|
||||
return _continuous_active
|
||||
|
||||
|
||||
def _continuous_on_silence() -> None:
|
||||
"""AudioRecorder silence callback — runs in a daemon thread.
|
||||
|
||||
Stops the current capture, transcribes, delivers the text via
|
||||
``on_transcript``, and — if the loop is still active — starts the
|
||||
next capture. Three consecutive silent cycles end the loop.
|
||||
"""
|
||||
global _continuous_active, _continuous_no_speech_count
|
||||
|
||||
_debug("_continuous_on_silence: fired")
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
_debug("_continuous_on_silence: loop inactive — abort")
|
||||
return
|
||||
rec = _continuous_recorder
|
||||
on_transcript = _continuous_on_transcript
|
||||
on_status = _continuous_on_status
|
||||
on_silent_limit = _continuous_on_silent_limit
|
||||
|
||||
if rec is None:
|
||||
_debug("_continuous_on_silence: no recorder — abort")
|
||||
return
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("transcribing")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
wav_path = rec.stop()
|
||||
# Peak RMS is the critical diagnostic when stop() returns None despite
|
||||
# the VAD firing — tells us at a glance whether the mic was too quiet
|
||||
# for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree.
|
||||
peak_rms = getattr(rec, "_peak_rms", -1)
|
||||
_debug(
|
||||
f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
|
||||
)
|
||||
|
||||
# CLI parity: double 660 Hz beep after the stream stops (safe from the
|
||||
# CoreAudio conflict that blocks pre-start beeps).
|
||||
_play_beep(frequency=660, count=2)
|
||||
|
||||
transcript: Optional[str] = None
|
||||
|
||||
if wav_path:
|
||||
try:
|
||||
result = transcribe_recording(wav_path)
|
||||
# transcribe_recording returns {"success": bool, "transcript": str,
|
||||
# "error": str?} — NOT {"text": str}. Using the wrong key silently
|
||||
# produced empty transcripts even when Groq/local STT returned fine,
|
||||
# which masqueraded as "not hearing the user" to the caller.
|
||||
success = bool(result.get("success"))
|
||||
text = (result.get("transcript") or "").strip()
|
||||
err = result.get("error")
|
||||
_debug(
|
||||
f"_continuous_on_silence: transcribe -> success={success} "
|
||||
f"text={text!r} err={err!r}"
|
||||
)
|
||||
if success and text and not is_whisper_hallucination(text):
|
||||
transcript = text
|
||||
except Exception as e:
|
||||
logger.warning("continuous transcription failed: %s", e)
|
||||
_debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
|
||||
finally:
|
||||
try:
|
||||
if os.path.isfile(wav_path):
|
||||
os.unlink(wav_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
# User stopped us while we were transcribing — discard.
|
||||
_debug("_continuous_on_silence: stopped during transcribe — no restart")
|
||||
return
|
||||
if transcript:
|
||||
_continuous_no_speech_count = 0
|
||||
else:
|
||||
_continuous_no_speech_count += 1
|
||||
should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
|
||||
no_speech = _continuous_no_speech_count
|
||||
|
||||
if transcript and on_transcript:
|
||||
try:
|
||||
on_transcript(transcript)
|
||||
except Exception as e:
|
||||
logger.warning("on_transcript callback raised: %s", e)
|
||||
|
||||
if should_halt:
|
||||
_debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
_continuous_no_speech_count = 0
|
||||
if on_silent_limit:
|
||||
try:
|
||||
on_silent_limit()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
rec.cancel()
|
||||
except Exception:
|
||||
pass
|
||||
if on_status:
|
||||
try:
|
||||
on_status("idle")
|
||||
except Exception:
|
||||
pass
|
||||
return
|
||||
|
||||
# CLI parity (cli.py:10619-10621): wait for any in-flight TTS to
|
||||
# finish before re-arming the mic, then leave a small gap to avoid
|
||||
# catching the tail of the speaker output. Without this the voice
|
||||
# loop becomes a feedback loop — the agent's spoken reply lands
|
||||
# back in the mic and gets re-submitted.
|
||||
if not _tts_playing.is_set():
|
||||
_debug("_continuous_on_silence: waiting for TTS to finish")
|
||||
_tts_playing.wait(timeout=60)
|
||||
import time as _time
|
||||
_time.sleep(0.3)
|
||||
|
||||
# User may have stopped the loop during the wait.
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
_debug("_continuous_on_silence: stopped while waiting for TTS")
|
||||
return
|
||||
|
||||
# Restart for the next turn.
|
||||
_debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
|
||||
_play_beep(frequency=880, count=1)
|
||||
try:
|
||||
rec.start(on_silence_stop=_continuous_on_silence)
|
||||
except Exception as e:
|
||||
logger.error("failed to restart continuous recording: %s", e)
|
||||
_debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
return
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("listening")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# ── TTS API ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def speak_text(text: str) -> None:
|
||||
"""Synthesize ``text`` with the configured TTS provider and play it.
|
||||
|
||||
Mirrors cli.py:_voice_speak_response exactly — same markdown strip
|
||||
pipeline, same 4000-char cap, same explicit mp3 output path, same
|
||||
MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
|
||||
of both extensions. Keeping these in sync means a voice-mode TTS
|
||||
session in the TUI sounds identical to one in the classic CLI.
|
||||
|
||||
While playback is in flight the module-level _tts_playing Event is
|
||||
cleared so the continuous-recording loop knows to wait before
|
||||
re-arming the mic (otherwise the agent's spoken reply feedback-loops
|
||||
through the microphone and the agent ends up replying to itself).
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return
|
||||
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
# Cancel any live capture before we open the speakers — otherwise the
|
||||
# last ~200ms of the user's turn tail + the first syllables of our TTS
|
||||
# both end up in the next recording window. The continuous loop will
|
||||
# re-arm itself after _tts_playing flips back (see _continuous_on_silence).
|
||||
paused_recording = False
|
||||
with _continuous_lock:
|
||||
if (
|
||||
_continuous_active
|
||||
and _continuous_recorder is not None
|
||||
and getattr(_continuous_recorder, "is_recording", False)
|
||||
):
|
||||
try:
|
||||
_continuous_recorder.cancel()
|
||||
paused_recording = True
|
||||
except Exception as e:
|
||||
logger.warning("failed to pause recorder for TTS: %s", e)
|
||||
|
||||
_tts_playing.clear()
|
||||
_debug(f"speak_text: TTS begin (paused_recording={paused_recording})")
|
||||
|
||||
try:
|
||||
from tools.tts_tool import text_to_speech_tool
|
||||
|
||||
tts_text = text[:4000] if len(text) > 4000 else text
|
||||
tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text) # fenced code blocks
|
||||
tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) → text
|
||||
tts_text = re.sub(r'https?://\S+', '', tts_text) # bare URLs
|
||||
tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold
|
||||
tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic
|
||||
tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # inline code
|
||||
tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers
|
||||
tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list bullets
|
||||
tts_text = re.sub(r'---+', '', tts_text) # horizontal rules
|
||||
tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excess newlines
|
||||
tts_text = tts_text.strip()
|
||||
if not tts_text:
|
||||
return
|
||||
|
||||
# MP3 output path, pre-chosen so we can play the MP3 directly even
|
||||
# when text_to_speech_tool auto-converts to OGG for messaging
|
||||
# platforms. afplay's OGG support is flaky, MP3 always works.
|
||||
os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True)
|
||||
mp3_path = os.path.join(
|
||||
tempfile.gettempdir(),
|
||||
"hermes_voice",
|
||||
f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3",
|
||||
)
|
||||
|
||||
_debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}")
|
||||
text_to_speech_tool(text=tts_text, output_path=mp3_path)
|
||||
|
||||
if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0:
|
||||
_debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)")
|
||||
play_audio_file(mp3_path)
|
||||
try:
|
||||
os.unlink(mp3_path)
|
||||
ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
|
||||
if os.path.isfile(ogg_path):
|
||||
os.unlink(ogg_path)
|
||||
except OSError:
|
||||
pass
|
||||
else:
|
||||
_debug(f"speak_text: TTS tool produced no audio at {mp3_path}")
|
||||
except Exception as e:
|
||||
logger.warning("Voice TTS playback failed: %s", e)
|
||||
_debug(f"speak_text raised {type(e).__name__}: {e}")
|
||||
finally:
|
||||
_tts_playing.set()
|
||||
_debug("speak_text: TTS done")
|
||||
|
||||
# Re-arm the mic so the user can answer without pressing Ctrl+B.
|
||||
# Small delay lets the OS flush speaker output and afplay fully
|
||||
# release the audio device before sounddevice re-opens the input.
|
||||
if paused_recording:
|
||||
time.sleep(0.3)
|
||||
with _continuous_lock:
|
||||
if _continuous_active and _continuous_recorder is not None:
|
||||
try:
|
||||
_continuous_recorder.start(
|
||||
on_silence_stop=_continuous_on_silence
|
||||
)
|
||||
_debug("speak_text: recording resumed after TTS")
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"failed to resume recorder after TTS: %s", e
|
||||
)
|
||||
+52
-23
@@ -288,30 +288,34 @@ def get_tool_definitions(
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic_schema}
|
||||
break
|
||||
|
||||
# Rebuild discord_server schema based on the bot's privileged intents
|
||||
# (detected from GET /applications/@me) and the user's action allowlist
|
||||
# in config. Hides actions the bot's intents don't support so the
|
||||
# model never attempts them, and annotates fetch_messages when the
|
||||
# Rebuild discord / discord_admin schemas based on the bot's privileged
|
||||
# intents (detected from GET /applications/@me) and the user's action
|
||||
# allowlist in config. Hides actions the bot's intents don't support so
|
||||
# the model never attempts them, and annotates fetch_messages when the
|
||||
# MESSAGE_CONTENT intent is missing.
|
||||
if "discord_server" in available_tool_names:
|
||||
try:
|
||||
from tools.discord_tool import get_dynamic_schema
|
||||
dynamic = get_dynamic_schema()
|
||||
except Exception: # pragma: no cover — defensive, fall back to static
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
# Tool filtered out entirely (empty allowlist or detection disabled
|
||||
# the only remaining actions). Drop it from the schema list.
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != "discord_server"
|
||||
]
|
||||
available_tool_names.discard("discord_server")
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == "discord_server":
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
_discord_schema_fns = {
|
||||
"discord": "get_dynamic_schema_core",
|
||||
"discord_admin": "get_dynamic_schema_admin",
|
||||
}
|
||||
for discord_tool_name in _discord_schema_fns:
|
||||
if discord_tool_name in available_tool_names:
|
||||
try:
|
||||
from tools import discord_tool as _dt
|
||||
schema_fn = getattr(_dt, _discord_schema_fns[discord_tool_name])
|
||||
dynamic = schema_fn()
|
||||
except Exception:
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != discord_tool_name
|
||||
]
|
||||
available_tool_names.discard(discord_tool_name)
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == discord_tool_name:
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
|
||||
# Strip web tool cross-references from browser_navigate description when
|
||||
# web_search / web_extract are not available. The static schema says
|
||||
@@ -418,6 +422,31 @@ def _coerce_value(value: str, expected_type):
|
||||
return _coerce_number(value, integer_only=(expected_type == "integer"))
|
||||
if expected_type == "boolean":
|
||||
return _coerce_boolean(value)
|
||||
if expected_type == "array":
|
||||
return _coerce_json(value, list)
|
||||
if expected_type == "object":
|
||||
return _coerce_json(value, dict)
|
||||
return value
|
||||
|
||||
|
||||
def _coerce_json(value: str, expected_python_type: type):
|
||||
"""Parse *value* as JSON when the schema expects an array or object.
|
||||
|
||||
Handles model output drift where a complex oneOf/discriminated-union schema
|
||||
causes the LLM to emit the array/object as a JSON string instead of a native
|
||||
structure. Returns the original string if parsing fails or yields the wrong
|
||||
Python type.
|
||||
"""
|
||||
try:
|
||||
parsed = json.loads(value)
|
||||
except (ValueError, TypeError):
|
||||
return value
|
||||
if isinstance(parsed, expected_python_type):
|
||||
logger.debug(
|
||||
"coerce_tool_args: coerced string to %s via json.loads",
|
||||
expected_python_type.__name__,
|
||||
)
|
||||
return parsed
|
||||
return value
|
||||
|
||||
|
||||
|
||||
@@ -167,6 +167,8 @@ AUTHOR_MAP = {
|
||||
"socrates1024@gmail.com": "socrates1024",
|
||||
"seanalt555@gmail.com": "Salt-555",
|
||||
"satelerd@gmail.com": "satelerd",
|
||||
"dan@danlynn.com": "danklynn",
|
||||
"mattmaximo@hotmail.com": "MattMaximo",
|
||||
"numman.ali@gmail.com": "nummanali",
|
||||
"rohithsaimidigudla@gmail.com": "whitehatjr1001",
|
||||
"0xNyk@users.noreply.github.com": "0xNyk",
|
||||
|
||||
@@ -0,0 +1,196 @@
|
||||
---
|
||||
name: design-md
|
||||
description: Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system, porting style rules between projects, generating UI with consistent brand, or auditing accessibility/contrast.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [design, design-system, tokens, ui, accessibility, wcag, tailwind, dtcg, google]
|
||||
related_skills: [popular-web-designs, excalidraw, architecture-diagram]
|
||||
---
|
||||
|
||||
# DESIGN.md Skill
|
||||
|
||||
DESIGN.md is Google's open spec (Apache-2.0, `google-labs-code/design.md`) for
|
||||
describing a visual identity to coding agents. One file combines:
|
||||
|
||||
- **YAML front matter** — machine-readable design tokens (normative values)
|
||||
- **Markdown body** — human-readable rationale, organized into canonical sections
|
||||
|
||||
Tokens give exact values. Prose tells agents *why* those values exist and how to
|
||||
apply them. The CLI (`npx @google/design.md`) lints structure + WCAG contrast,
|
||||
diffs versions for regressions, and exports to Tailwind or W3C DTCG JSON.
|
||||
|
||||
## When to use this skill
|
||||
|
||||
- User asks for a DESIGN.md file, design tokens, or a design system spec
|
||||
- User wants consistent UI/brand across multiple projects or tools
|
||||
- User pastes an existing DESIGN.md and asks to lint, diff, export, or extend it
|
||||
- User asks to port a style guide into a format agents can consume
|
||||
- User wants contrast / WCAG accessibility validation on their color palette
|
||||
|
||||
For purely visual inspiration or layout examples, use `popular-web-designs`
|
||||
instead. This skill is for the *formal spec file* itself.
|
||||
|
||||
## File anatomy
|
||||
|
||||
```md
|
||||
---
|
||||
version: alpha
|
||||
name: Heritage
|
||||
description: Architectural minimalism meets journalistic gravitas.
|
||||
colors:
|
||||
primary: "#1A1C1E"
|
||||
secondary: "#6C7278"
|
||||
tertiary: "#B8422E"
|
||||
neutral: "#F7F5F2"
|
||||
typography:
|
||||
h1:
|
||||
fontFamily: Public Sans
|
||||
fontSize: 3rem
|
||||
fontWeight: 700
|
||||
lineHeight: 1.1
|
||||
letterSpacing: "-0.02em"
|
||||
body-md:
|
||||
fontFamily: Public Sans
|
||||
fontSize: 1rem
|
||||
rounded:
|
||||
sm: 4px
|
||||
md: 8px
|
||||
lg: 16px
|
||||
spacing:
|
||||
sm: 8px
|
||||
md: 16px
|
||||
lg: 24px
|
||||
components:
|
||||
button-primary:
|
||||
backgroundColor: "{colors.tertiary}"
|
||||
textColor: "#FFFFFF"
|
||||
rounded: "{rounded.sm}"
|
||||
padding: 12px
|
||||
button-primary-hover:
|
||||
backgroundColor: "{colors.primary}"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Architectural Minimalism meets Journalistic Gravitas...
|
||||
|
||||
## Colors
|
||||
|
||||
- **Primary (#1A1C1E):** Deep ink for headlines and core text.
|
||||
- **Tertiary (#B8422E):** "Boston Clay" — the sole driver for interaction.
|
||||
|
||||
## Typography
|
||||
|
||||
Public Sans for everything except small all-caps labels...
|
||||
|
||||
## Components
|
||||
|
||||
`button-primary` is the only high-emphasis action on a page...
|
||||
```
|
||||
|
||||
## Token types
|
||||
|
||||
| Type | Format | Example |
|
||||
|------|--------|---------|
|
||||
| Color | `#` + hex (sRGB) | `"#1A1C1E"` |
|
||||
| Dimension | number + unit (`px`, `em`, `rem`) | `48px`, `-0.02em` |
|
||||
| Token reference | `{path.to.token}` | `{colors.primary}` |
|
||||
| Typography | object with `fontFamily`, `fontSize`, `fontWeight`, `lineHeight`, `letterSpacing`, `fontFeature`, `fontVariation` | see above |
|
||||
|
||||
Component property whitelist: `backgroundColor`, `textColor`, `typography`,
|
||||
`rounded`, `padding`, `size`, `height`, `width`. Variants (hover, active,
|
||||
pressed) are **separate component entries** with related key names
|
||||
(`button-primary-hover`), not nested.
|
||||
|
||||
## Canonical section order
|
||||
|
||||
Sections are optional, but present ones MUST appear in this order. Duplicate
|
||||
headings reject the file.
|
||||
|
||||
1. Overview (alias: Brand & Style)
|
||||
2. Colors
|
||||
3. Typography
|
||||
4. Layout (alias: Layout & Spacing)
|
||||
5. Elevation & Depth (alias: Elevation)
|
||||
6. Shapes
|
||||
7. Components
|
||||
8. Do's and Don'ts
|
||||
|
||||
Unknown sections are preserved, not errored. Unknown token names are accepted
|
||||
if the value type is valid. Unknown component properties produce a warning.
|
||||
|
||||
## Workflow: authoring a new DESIGN.md
|
||||
|
||||
1. **Ask the user** (or infer) the brand tone, accent color, and typography
|
||||
direction. If they provided a site, image, or vibe, translate it to the
|
||||
token shape above.
|
||||
2. **Write `DESIGN.md`** in their project root using `write_file`. Always
|
||||
include `name:` and `colors:`; other sections optional but encouraged.
|
||||
3. **Use token references** (`{colors.primary}`) in the `components:` section
|
||||
instead of re-typing hex values. Keeps the palette single-source.
|
||||
4. **Lint it** (see below). Fix any broken references or WCAG failures
|
||||
before returning.
|
||||
5. **If the user has an existing project**, also write Tailwind or DTCG
|
||||
exports next to the file (`tailwind.theme.json`, `tokens.json`).
|
||||
|
||||
## Workflow: lint / diff / export
|
||||
|
||||
The CLI is `@google/design.md` (Node). Use `npx` — no global install needed.
|
||||
|
||||
```bash
|
||||
# Validate structure + token references + WCAG contrast
|
||||
npx -y @google/design.md lint DESIGN.md
|
||||
|
||||
# Compare two versions, fail on regression (exit 1 = regression)
|
||||
npx -y @google/design.md diff DESIGN.md DESIGN-v2.md
|
||||
|
||||
# Export to Tailwind theme JSON
|
||||
npx -y @google/design.md export --format tailwind DESIGN.md > tailwind.theme.json
|
||||
|
||||
# Export to W3C DTCG (Design Tokens Format Module) JSON
|
||||
npx -y @google/design.md export --format dtcg DESIGN.md > tokens.json
|
||||
|
||||
# Print the spec itself — useful when injecting into an agent prompt
|
||||
npx -y @google/design.md spec --rules-only --format json
|
||||
```
|
||||
|
||||
All commands accept `-` for stdin. `lint` returns exit 1 on errors. Use the
|
||||
`--format json` flag and parse the output if you need to report findings
|
||||
structurally.
|
||||
|
||||
### Lint rule reference (what the 7 rules catch)
|
||||
|
||||
- `broken-ref` (error) — `{colors.missing}` points at a non-existent token
|
||||
- `duplicate-section` (error) — same `## Heading` appears twice
|
||||
- `invalid-color`, `invalid-dimension`, `invalid-typography` (error)
|
||||
- `wcag-contrast` (warning/info) — component `textColor` vs `backgroundColor`
|
||||
ratio against WCAG AA (4.5:1) and AAA (7:1)
|
||||
- `unknown-component-property` (warning) — outside the whitelist above
|
||||
|
||||
When the user cares about accessibility, call this out explicitly in your
|
||||
summary — WCAG findings are the most load-bearing reason to use the CLI.
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Don't nest component variants.** `button-primary.hover` is wrong;
|
||||
`button-primary-hover` as a sibling key is right.
|
||||
- **Hex colors must be quoted strings.** YAML will otherwise choke on `#` or
|
||||
truncate values like `#1A1C1E` oddly.
|
||||
- **Negative dimensions need quotes too.** `letterSpacing: -0.02em` parses as
|
||||
a YAML flow — write `letterSpacing: "-0.02em"`.
|
||||
- **Section order is enforced.** If the user gives you prose in a random order,
|
||||
reorder it to match the canonical list before saving.
|
||||
- **`version: alpha` is the current spec version** (as of Apr 2026). The spec
|
||||
is marked alpha — watch for breaking changes.
|
||||
- **Token references resolve by dotted path.** `{colors.primary}` works;
|
||||
`{primary}` does not.
|
||||
|
||||
## Spec source of truth
|
||||
|
||||
- Repo: https://github.com/google-labs-code/design.md (Apache-2.0)
|
||||
- CLI: `@google/design.md` on npm
|
||||
- License of generated DESIGN.md files: whatever the user's project uses;
|
||||
the spec itself is Apache-2.0.
|
||||
@@ -0,0 +1,99 @@
|
||||
---
|
||||
version: alpha
|
||||
name: MyBrand
|
||||
description: One-sentence description of the visual identity.
|
||||
colors:
|
||||
primary: "#0F172A"
|
||||
secondary: "#64748B"
|
||||
tertiary: "#2563EB"
|
||||
neutral: "#F8FAFC"
|
||||
on-primary: "#FFFFFF"
|
||||
on-tertiary: "#FFFFFF"
|
||||
typography:
|
||||
h1:
|
||||
fontFamily: Inter
|
||||
fontSize: 3rem
|
||||
fontWeight: 700
|
||||
lineHeight: 1.1
|
||||
letterSpacing: "-0.02em"
|
||||
h2:
|
||||
fontFamily: Inter
|
||||
fontSize: 2rem
|
||||
fontWeight: 600
|
||||
lineHeight: 1.2
|
||||
body-md:
|
||||
fontFamily: Inter
|
||||
fontSize: 1rem
|
||||
lineHeight: 1.5
|
||||
label-caps:
|
||||
fontFamily: Inter
|
||||
fontSize: 0.75rem
|
||||
fontWeight: 600
|
||||
letterSpacing: "0.08em"
|
||||
rounded:
|
||||
sm: 4px
|
||||
md: 8px
|
||||
lg: 16px
|
||||
full: 9999px
|
||||
spacing:
|
||||
xs: 4px
|
||||
sm: 8px
|
||||
md: 16px
|
||||
lg: 24px
|
||||
xl: 48px
|
||||
components:
|
||||
button-primary:
|
||||
backgroundColor: "{colors.tertiary}"
|
||||
textColor: "{colors.on-tertiary}"
|
||||
rounded: "{rounded.sm}"
|
||||
padding: 12px
|
||||
button-primary-hover:
|
||||
backgroundColor: "{colors.primary}"
|
||||
textColor: "{colors.on-primary}"
|
||||
card:
|
||||
backgroundColor: "{colors.neutral}"
|
||||
textColor: "{colors.primary}"
|
||||
rounded: "{rounded.md}"
|
||||
padding: 24px
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Describe the voice and feel of the brand in one or two paragraphs. What mood
|
||||
does it evoke? What emotional response should a user have on first impression?
|
||||
|
||||
## Colors
|
||||
|
||||
- **Primary ({colors.primary}):** Core text, headlines, high-emphasis surfaces.
|
||||
- **Secondary ({colors.secondary}):** Supporting text, borders, metadata.
|
||||
- **Tertiary ({colors.tertiary}):** Interaction driver — buttons, links,
|
||||
selected states. Use sparingly to preserve its signal.
|
||||
- **Neutral ({colors.neutral}):** Page background and surface fills.
|
||||
|
||||
## Typography
|
||||
|
||||
Inter for everything. Weight and size carry hierarchy, not font family. Tight
|
||||
letter-spacing on display sizes; default tracking on body.
|
||||
|
||||
## Layout
|
||||
|
||||
Spacing scale is a 4px baseline. Use `md` (16px) for intra-component gaps,
|
||||
`lg` (24px) for inter-component gaps, `xl` (48px) for section breaks.
|
||||
|
||||
## Shapes
|
||||
|
||||
Rounded corners are modest — `sm` on interactive elements, `md` on cards.
|
||||
`full` is reserved for avatars and pill badges.
|
||||
|
||||
## Components
|
||||
|
||||
- `button-primary` is the only high-emphasis action per screen.
|
||||
- `card` is the default surface for grouped content. No shadow by default.
|
||||
|
||||
## Do's and Don'ts
|
||||
|
||||
- **Do** use token references (`{colors.primary}`) instead of literal hex in
|
||||
component definitions.
|
||||
- **Don't** introduce colors outside the palette — extend the palette first.
|
||||
- **Don't** nest component variants. `button-primary-hover` is a sibling,
|
||||
not a child.
|
||||
@@ -56,6 +56,7 @@ class TestFailoverReason:
|
||||
"overloaded", "server_error", "timeout",
|
||||
"context_overflow", "payload_too_large",
|
||||
"model_not_found", "format_error",
|
||||
"provider_policy_blocked",
|
||||
"thinking_signature", "long_context_tier", "unknown",
|
||||
}
|
||||
actual = {r.value for r in FailoverReason}
|
||||
@@ -308,6 +309,59 @@ class TestClassifyApiError:
|
||||
assert result.retryable is True
|
||||
assert result.should_fallback is False
|
||||
|
||||
# ── Provider policy-block (OpenRouter privacy/guardrail) ──
|
||||
|
||||
def test_404_openrouter_policy_blocked(self):
|
||||
# Real OpenRouter error when the user's account privacy setting
|
||||
# excludes the only endpoint serving a model (e.g. DeepSeek V4 Pro
|
||||
# which is hosted only by DeepSeek, and their endpoint may log
|
||||
# inputs). Must NOT classify as model_not_found — the model
|
||||
# exists, falling back won't help (same account setting applies),
|
||||
# and the error body already tells the user where to fix it.
|
||||
e = MockAPIError(
|
||||
"No endpoints available matching your guardrail restrictions "
|
||||
"and data policy. Configure: https://openrouter.ai/settings/privacy",
|
||||
status_code=404,
|
||||
)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.provider_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is False
|
||||
|
||||
def test_400_openrouter_policy_blocked(self):
|
||||
# Defense-in-depth: if OpenRouter ever returns this as 400 instead
|
||||
# of 404, still classify it distinctly rather than as format_error
|
||||
# or model_not_found.
|
||||
e = MockAPIError(
|
||||
"No endpoints available matching your data policy",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.provider_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is False
|
||||
|
||||
def test_message_only_openrouter_policy_blocked(self):
|
||||
# No status code — classifier should still catch the fingerprint
|
||||
# via the message-pattern fallback.
|
||||
e = Exception(
|
||||
"No endpoints available matching your guardrail restrictions "
|
||||
"and data policy"
|
||||
)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.provider_policy_blocked
|
||||
|
||||
def test_404_model_not_found_still_works(self):
|
||||
# Regression guard: the new policy-block check must not swallow
|
||||
# genuine model_not_found 404s.
|
||||
e = MockAPIError(
|
||||
"openrouter/nonexistent-model is not a valid model ID",
|
||||
status_code=404,
|
||||
)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.model_not_found
|
||||
assert result.should_fallback is True
|
||||
|
||||
# ── Payload too large ──
|
||||
|
||||
def test_413_payload_too_large(self):
|
||||
|
||||
@@ -200,6 +200,126 @@ class TestDefaultContextLengths:
|
||||
assert len(DEFAULT_CONTEXT_LENGTHS) >= 10
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Codex OAuth context-window resolution (provider="openai-codex")
|
||||
# =========================================================================
|
||||
|
||||
class TestCodexOAuthContextLength:
|
||||
"""ChatGPT Codex OAuth imposes lower context limits than the direct
|
||||
OpenAI API for the same slugs. Verified Apr 2026 via live probe of
|
||||
chatgpt.com/backend-api/codex/models: every model returns 272k, while
|
||||
models.dev reports 1.05M for gpt-5.5/gpt-5.4 and 400k for the rest.
|
||||
"""
|
||||
|
||||
def setup_method(self):
|
||||
import agent.model_metadata as mm
|
||||
mm._codex_oauth_context_cache = {}
|
||||
mm._codex_oauth_context_cache_time = 0.0
|
||||
|
||||
def test_fallback_table_used_without_token(self):
|
||||
"""With no access token, the hardcoded Codex fallback table wins
|
||||
over models.dev (which reports 1.05M for gpt-5.5 but Codex is 272k).
|
||||
"""
|
||||
from agent.model_metadata import get_model_context_length
|
||||
|
||||
with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
|
||||
patch("agent.model_metadata.save_context_length"):
|
||||
for model in (
|
||||
"gpt-5.5",
|
||||
"gpt-5.4",
|
||||
"gpt-5.4-mini",
|
||||
"gpt-5.3-codex",
|
||||
"gpt-5.2-codex",
|
||||
"gpt-5.1-codex-max",
|
||||
"gpt-5.1-codex-mini",
|
||||
):
|
||||
ctx = get_model_context_length(
|
||||
model=model,
|
||||
base_url="https://chatgpt.com/backend-api/codex",
|
||||
api_key="",
|
||||
provider="openai-codex",
|
||||
)
|
||||
assert ctx == 272_000, (
|
||||
f"Codex {model}: expected 272000 fallback, got {ctx} "
|
||||
"(models.dev leakage?)"
|
||||
)
|
||||
|
||||
def test_live_probe_overrides_fallback(self):
|
||||
"""When a token is provided, the live /models probe is preferred
|
||||
and its context_window drives the result."""
|
||||
from agent.model_metadata import get_model_context_length
|
||||
|
||||
fake_response = MagicMock()
|
||||
fake_response.status_code = 200
|
||||
fake_response.json.return_value = {
|
||||
"models": [
|
||||
{"slug": "gpt-5.5", "context_window": 300_000},
|
||||
{"slug": "gpt-5.4", "context_window": 400_000},
|
||||
]
|
||||
}
|
||||
|
||||
with patch("agent.model_metadata.requests.get", return_value=fake_response), \
|
||||
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
|
||||
patch("agent.model_metadata.save_context_length"):
|
||||
ctx_55 = get_model_context_length(
|
||||
model="gpt-5.5",
|
||||
base_url="https://chatgpt.com/backend-api/codex",
|
||||
api_key="fake-token",
|
||||
provider="openai-codex",
|
||||
)
|
||||
ctx_54 = get_model_context_length(
|
||||
model="gpt-5.4",
|
||||
base_url="https://chatgpt.com/backend-api/codex",
|
||||
api_key="fake-token",
|
||||
provider="openai-codex",
|
||||
)
|
||||
assert ctx_55 == 300_000
|
||||
assert ctx_54 == 400_000
|
||||
|
||||
def test_probe_failure_falls_back_to_hardcoded(self):
|
||||
"""If the probe fails (non-200 / network error), we still return
|
||||
the hardcoded 272k rather than leaking through to models.dev 1.05M."""
|
||||
from agent.model_metadata import get_model_context_length
|
||||
|
||||
fake_response = MagicMock()
|
||||
fake_response.status_code = 401
|
||||
fake_response.json.return_value = {}
|
||||
|
||||
with patch("agent.model_metadata.requests.get", return_value=fake_response), \
|
||||
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
|
||||
patch("agent.model_metadata.save_context_length"):
|
||||
ctx = get_model_context_length(
|
||||
model="gpt-5.5",
|
||||
base_url="https://chatgpt.com/backend-api/codex",
|
||||
api_key="expired-token",
|
||||
provider="openai-codex",
|
||||
)
|
||||
assert ctx == 272_000
|
||||
|
||||
def test_non_codex_providers_unaffected(self):
|
||||
"""Resolving gpt-5.5 on non-Codex providers must NOT use the Codex
|
||||
272k override — OpenRouter / direct OpenAI API have different limits.
|
||||
"""
|
||||
from agent.model_metadata import get_model_context_length
|
||||
|
||||
# OpenRouter — should hit its own catalog path first; when mocked
|
||||
# empty, falls through to hardcoded DEFAULT_CONTEXT_LENGTHS (400k).
|
||||
with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
|
||||
patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
|
||||
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
|
||||
patch("agent.models_dev.lookup_models_dev_context", return_value=None):
|
||||
ctx = get_model_context_length(
|
||||
model="openai/gpt-5.5",
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
api_key="",
|
||||
provider="openrouter",
|
||||
)
|
||||
assert ctx == 400_000, (
|
||||
f"Non-Codex gpt-5.5 resolved to {ctx}; Codex 272k override "
|
||||
"leaked outside openai-codex provider"
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# get_model_context_length — resolution order
|
||||
# =========================================================================
|
||||
|
||||
@@ -0,0 +1,254 @@
|
||||
"""Tests for Moonshot/Kimi flavored-JSON-Schema sanitizer.
|
||||
|
||||
Moonshot's tool-parameter validator rejects several shapes that the rest of
|
||||
the JSON Schema ecosystem accepts:
|
||||
|
||||
1. Properties without ``type`` — Moonshot requires ``type`` on every node.
|
||||
2. ``type`` at the parent of ``anyOf`` — Moonshot requires it only inside
|
||||
``anyOf`` children.
|
||||
|
||||
These tests cover the repairs applied by ``agent/moonshot_schema.py``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.moonshot_schema import (
|
||||
is_moonshot_model,
|
||||
sanitize_moonshot_tool_parameters,
|
||||
sanitize_moonshot_tools,
|
||||
)
|
||||
|
||||
|
||||
class TestMoonshotModelDetection:
|
||||
"""is_moonshot_model() must match across aggregator prefixes."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"kimi-k2.6",
|
||||
"kimi-k2-thinking",
|
||||
"moonshotai/Kimi-K2.6",
|
||||
"moonshotai/kimi-k2.6",
|
||||
"nous/moonshotai/kimi-k2.6",
|
||||
"openrouter/moonshotai/kimi-k2-thinking",
|
||||
"MOONSHOTAI/KIMI-K2.6",
|
||||
],
|
||||
)
|
||||
def test_positive_matches(self, model):
|
||||
assert is_moonshot_model(model) is True
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"",
|
||||
None,
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
"openai/gpt-5.4",
|
||||
"google/gemini-3-flash-preview",
|
||||
"deepseek-chat",
|
||||
],
|
||||
)
|
||||
def test_negative_matches(self, model):
|
||||
assert is_moonshot_model(model) is False
|
||||
|
||||
|
||||
class TestMissingTypeFilled:
|
||||
"""Rule 1: every property must carry a type."""
|
||||
|
||||
def test_property_without_type_gets_string(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {"query": {"description": "a bare property"}},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["properties"]["query"]["type"] == "string"
|
||||
|
||||
def test_property_with_enum_infers_type_from_first_value(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {"flag": {"enum": [True, False]}},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["properties"]["flag"]["type"] == "boolean"
|
||||
|
||||
def test_nested_properties_are_repaired(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filter": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"field": {"description": "no type"},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["properties"]["filter"]["properties"]["field"]["type"] == "string"
|
||||
|
||||
def test_array_items_without_type_get_repaired(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {"description": "tag entry"},
|
||||
},
|
||||
},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["properties"]["tags"]["items"]["type"] == "string"
|
||||
|
||||
def test_ref_node_is_not_given_synthetic_type(self):
|
||||
"""$ref nodes should NOT get a synthetic type — the referenced
|
||||
definition supplies it, and Moonshot would reject the conflict."""
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {"payload": {"$ref": "#/$defs/Payload"}},
|
||||
"$defs": {"Payload": {"type": "object", "properties": {}}},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert "type" not in out["properties"]["payload"]
|
||||
assert out["properties"]["payload"]["$ref"] == "#/$defs/Payload"
|
||||
|
||||
|
||||
class TestAnyOfParentType:
|
||||
"""Rule 2: type must not appear at the anyOf parent level."""
|
||||
|
||||
def test_parent_type_stripped_when_anyof_present(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from_format": {
|
||||
"type": "string",
|
||||
"anyOf": [
|
||||
{"type": "string"},
|
||||
{"type": "null"},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
from_format = out["properties"]["from_format"]
|
||||
assert "type" not in from_format
|
||||
assert "anyOf" in from_format
|
||||
|
||||
def test_anyof_children_missing_type_get_filled(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": {
|
||||
"anyOf": [
|
||||
{"type": "string"},
|
||||
{"description": "A typeless option"},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
children = out["properties"]["value"]["anyOf"]
|
||||
assert children[0]["type"] == "string"
|
||||
assert "type" in children[1]
|
||||
|
||||
|
||||
class TestTopLevelGuarantees:
|
||||
"""The returned top-level schema is always a well-formed object."""
|
||||
|
||||
def test_non_dict_input_returns_empty_object(self):
|
||||
assert sanitize_moonshot_tool_parameters(None) == {"type": "object", "properties": {}}
|
||||
assert sanitize_moonshot_tool_parameters("garbage") == {"type": "object", "properties": {}}
|
||||
assert sanitize_moonshot_tool_parameters([]) == {"type": "object", "properties": {}}
|
||||
|
||||
def test_non_object_top_level_coerced(self):
|
||||
params = {"type": "string"}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["type"] == "object"
|
||||
assert "properties" in out
|
||||
|
||||
def test_does_not_mutate_input(self):
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {"q": {"description": "no type"}},
|
||||
}
|
||||
snapshot = {
|
||||
"type": params["type"],
|
||||
"properties": {"q": dict(params["properties"]["q"])},
|
||||
}
|
||||
sanitize_moonshot_tool_parameters(params)
|
||||
assert params["type"] == snapshot["type"]
|
||||
assert "type" not in params["properties"]["q"]
|
||||
|
||||
|
||||
class TestToolListSanitizer:
|
||||
"""sanitize_moonshot_tools() walks an OpenAI-format tool list."""
|
||||
|
||||
def test_applies_per_tool(self):
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search",
|
||||
"description": "Search",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"q": {"description": "query"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "noop",
|
||||
"description": "Does nothing",
|
||||
"parameters": {"type": "object", "properties": {}},
|
||||
},
|
||||
},
|
||||
]
|
||||
out = sanitize_moonshot_tools(tools)
|
||||
assert out[0]["function"]["parameters"]["properties"]["q"]["type"] == "string"
|
||||
# Second tool already clean — should be structurally equivalent
|
||||
assert out[1]["function"]["parameters"] == {"type": "object", "properties": {}}
|
||||
|
||||
def test_empty_list_is_passthrough(self):
|
||||
assert sanitize_moonshot_tools([]) == []
|
||||
assert sanitize_moonshot_tools(None) is None
|
||||
|
||||
def test_skips_malformed_entries(self):
|
||||
"""Entries without a function dict are passed through untouched."""
|
||||
tools = [{"type": "function"}, {"not": "a tool"}]
|
||||
out = sanitize_moonshot_tools(tools)
|
||||
assert out == tools
|
||||
|
||||
|
||||
class TestRealWorldMCPShape:
|
||||
"""End-to-end: a realistic MCP-style schema that used to 400 on Moonshot."""
|
||||
|
||||
def test_combined_rewrites(self):
|
||||
# Shape: missing type on a property, anyOf with parent type, array
|
||||
# items without type — all in one tool.
|
||||
params = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"description": "search text"},
|
||||
"filter": {
|
||||
"type": "string",
|
||||
"anyOf": [
|
||||
{"type": "string"},
|
||||
{"type": "null"},
|
||||
],
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {"description": "tag"},
|
||||
},
|
||||
},
|
||||
"required": ["query"],
|
||||
}
|
||||
out = sanitize_moonshot_tool_parameters(params)
|
||||
assert out["properties"]["query"]["type"] == "string"
|
||||
assert "type" not in out["properties"]["filter"]
|
||||
assert out["properties"]["filter"]["anyOf"][0]["type"] == "string"
|
||||
assert out["properties"]["tags"]["items"]["type"] == "string"
|
||||
assert out["required"] == ["query"]
|
||||
@@ -238,6 +238,56 @@ class TestChatCompletionsKimi:
|
||||
)
|
||||
assert kw["extra_body"]["thinking"] == {"type": "disabled"}
|
||||
|
||||
def test_moonshot_tool_schemas_are_sanitized_by_model_name(self, transport):
|
||||
"""Aggregator routes (Nous, OpenRouter) hit Moonshot by model name, not base URL."""
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search",
|
||||
"description": "Search",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"q": {"description": "query"}, # missing type
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
kw = transport.build_kwargs(
|
||||
model="moonshotai/kimi-k2.6",
|
||||
messages=[{"role": "user", "content": "Hi"}],
|
||||
tools=tools,
|
||||
max_tokens_param_fn=lambda n: {"max_tokens": n},
|
||||
)
|
||||
assert kw["tools"][0]["function"]["parameters"]["properties"]["q"]["type"] == "string"
|
||||
|
||||
def test_non_moonshot_tools_are_not_mutated(self, transport):
|
||||
"""Other models don't go through the Moonshot sanitizer."""
|
||||
original_params = {
|
||||
"type": "object",
|
||||
"properties": {"q": {"description": "query"}}, # missing type
|
||||
}
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search",
|
||||
"description": "Search",
|
||||
"parameters": original_params,
|
||||
},
|
||||
},
|
||||
]
|
||||
kw = transport.build_kwargs(
|
||||
model="anthropic/claude-sonnet-4.6",
|
||||
messages=[{"role": "user", "content": "Hi"}],
|
||||
tools=tools,
|
||||
max_tokens_param_fn=lambda n: {"max_tokens": n},
|
||||
)
|
||||
# The parameters dict is passed through untouched (no synthetic type)
|
||||
assert "type" not in kw["tools"][0]["function"]["parameters"]["properties"]["q"]
|
||||
|
||||
|
||||
class TestChatCompletionsValidate:
|
||||
|
||||
|
||||
@@ -710,7 +710,15 @@ class TestRunJobSessionPersistence:
|
||||
kwargs = mock_agent_cls.call_args.kwargs
|
||||
assert kwargs["enabled_toolsets"] == ["web", "terminal", "file"]
|
||||
|
||||
def test_run_job_enabled_toolsets_none_when_not_set(self, tmp_path):
|
||||
def test_run_job_enabled_toolsets_resolves_from_platform_config_when_not_set(self, tmp_path):
|
||||
"""When a job has no explicit enabled_toolsets, the scheduler now
|
||||
resolves them from ``hermes tools`` platform config for ``cron``
|
||||
(PR #14xxx — blanket fix for Norbert's surprise ``moa`` run).
|
||||
|
||||
The legacy "pass None → AIAgent loads full default" path is still
|
||||
reachable, but only when ``_get_platform_tools`` raises (safety net
|
||||
for any unexpected config shape).
|
||||
"""
|
||||
job = {
|
||||
"id": "no-toolset-job",
|
||||
"name": "test",
|
||||
@@ -725,7 +733,39 @@ class TestRunJobSessionPersistence:
|
||||
run_job(job)
|
||||
|
||||
kwargs = mock_agent_cls.call_args.kwargs
|
||||
assert kwargs["enabled_toolsets"] is None
|
||||
# Resolution happened — not None, is a list.
|
||||
assert isinstance(kwargs["enabled_toolsets"], list)
|
||||
# The cron default is _HERMES_CORE_TOOLS with _DEFAULT_OFF_TOOLSETS
|
||||
# (``moa``, ``homeassistant``, ``rl``) removed. The most important
|
||||
# invariant: ``moa`` is NOT in the default cron toolset, so a cron
|
||||
# run cannot accidentally spin up frontier models.
|
||||
assert "moa" not in kwargs["enabled_toolsets"]
|
||||
|
||||
def test_run_job_per_job_toolsets_win_over_platform_config(self, tmp_path):
|
||||
"""Per-job enabled_toolsets (via cronjob tool) always take precedence
|
||||
over the platform-level ``hermes tools`` config."""
|
||||
job = {
|
||||
"id": "override-job",
|
||||
"name": "test",
|
||||
"prompt": "hello",
|
||||
"enabled_toolsets": ["terminal"],
|
||||
}
|
||||
fake_db, patches = self._make_run_job_patches(tmp_path)
|
||||
# Even if the user has ``hermes tools`` configured to enable web+file
|
||||
# for cron, the per-job override wins.
|
||||
with patches[0], patches[1], patches[2], patches[3], patches[4], \
|
||||
patch("run_agent.AIAgent") as mock_agent_cls, \
|
||||
patch(
|
||||
"hermes_cli.tools_config._get_platform_tools",
|
||||
return_value={"web", "file"},
|
||||
):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run_conversation.return_value = {"final_response": "ok"}
|
||||
mock_agent_cls.return_value = mock_agent
|
||||
run_job(job)
|
||||
|
||||
kwargs = mock_agent_cls.call_args.kwargs
|
||||
assert kwargs["enabled_toolsets"] == ["terminal"]
|
||||
|
||||
def test_run_job_empty_response_returns_empty_not_placeholder(self, tmp_path):
|
||||
"""Empty final_response should stay empty for delivery logic (issue #2234).
|
||||
|
||||
@@ -1,22 +1,28 @@
|
||||
"""Regression tests for the TUI gateway's `complete.path` handler.
|
||||
|
||||
Reported during the TUI v2 blitz retest: typing `@folder:` (and `@folder`
|
||||
with no colon yet) still surfaced files alongside directories in the
|
||||
TUI composer, because the gateway-side completion lives in
|
||||
`tui_gateway/server.py` and was never touched by the earlier fix to
|
||||
`hermes_cli/commands.py`.
|
||||
Reported during the TUI v2 blitz retest:
|
||||
- typing `@folder:` (and `@folder` with no colon yet) surfaced files
|
||||
alongside directories — the gateway-side completion lives in
|
||||
`tui_gateway/server.py` and was never touched by the earlier fix to
|
||||
`hermes_cli/commands.py`.
|
||||
- typing `@appChrome` required the full `@ui-tui/src/components/app…`
|
||||
path to find the file — users expect Cmd-P-style fuzzy basename
|
||||
matching across the repo, not a strict directory prefix filter.
|
||||
|
||||
Covers:
|
||||
- `@folder:` only yields directories
|
||||
- `@file:` only yields regular files
|
||||
- Bare `@folder` / `@file` (no colon) lists cwd directly
|
||||
- Explicit prefix is preserved in the completion text
|
||||
- `@<name>` with no slash fuzzy-matches basenames anywhere in the tree
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tui_gateway import server
|
||||
|
||||
|
||||
@@ -33,6 +39,15 @@ def _items(word: str):
|
||||
return [(it["text"], it["display"], it.get("meta", "")) for it in resp["result"]["items"]]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset_fuzzy_cache(monkeypatch):
|
||||
# Each test walks a fresh tmp dir; clear the cached listing so prior
|
||||
# roots can't leak through the TTL window.
|
||||
server._fuzzy_cache.clear()
|
||||
yield
|
||||
server._fuzzy_cache.clear()
|
||||
|
||||
|
||||
def test_at_folder_colon_only_dirs(tmp_path, monkeypatch):
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_fixture(tmp_path)
|
||||
@@ -89,3 +104,176 @@ def test_bare_at_still_shows_static_refs(tmp_path, monkeypatch):
|
||||
|
||||
for expected in ("@diff", "@staged", "@file:", "@folder:", "@url:", "@git:"):
|
||||
assert expected in texts, f"missing static ref {expected!r} in {texts!r}"
|
||||
|
||||
|
||||
# ── Fuzzy basename matching ──────────────────────────────────────────────
|
||||
# Users shouldn't have to know the full path — typing `@appChrome` should
|
||||
# find `ui-tui/src/components/appChrome.tsx`.
|
||||
|
||||
|
||||
def _nested_fixture(tmp_path: Path):
|
||||
(tmp_path / "readme.md").write_text("x")
|
||||
(tmp_path / ".env").write_text("x")
|
||||
(tmp_path / "ui-tui/src/components").mkdir(parents=True)
|
||||
(tmp_path / "ui-tui/src/components/appChrome.tsx").write_text("x")
|
||||
(tmp_path / "ui-tui/src/components/appLayout.tsx").write_text("x")
|
||||
(tmp_path / "ui-tui/src/components/thinking.tsx").write_text("x")
|
||||
(tmp_path / "ui-tui/src/hooks").mkdir(parents=True)
|
||||
(tmp_path / "ui-tui/src/hooks/useCompletion.ts").write_text("x")
|
||||
(tmp_path / "tui_gateway").mkdir()
|
||||
(tmp_path / "tui_gateway/server.py").write_text("x")
|
||||
|
||||
|
||||
def test_fuzzy_at_finds_file_without_directory_prefix(tmp_path, monkeypatch):
|
||||
"""`@appChrome` — with no slash — should surface the nested file."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
entries = _items("@appChrome")
|
||||
texts = [t for t, _, _ in entries]
|
||||
|
||||
assert "@file:ui-tui/src/components/appChrome.tsx" in texts, texts
|
||||
|
||||
# Display is the basename, meta is the containing directory, so the
|
||||
# picker can show `appChrome.tsx ui-tui/src/components` on one row.
|
||||
row = next(r for r in entries if r[0] == "@file:ui-tui/src/components/appChrome.tsx")
|
||||
assert row[1] == "appChrome.tsx"
|
||||
assert row[2] == "ui-tui/src/components"
|
||||
|
||||
|
||||
def test_fuzzy_ranks_exact_before_prefix_before_subseq(tmp_path, monkeypatch):
|
||||
"""Better matches sort before weaker matches regardless of path depth."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
(tmp_path / "server.py").write_text("x") # exact basename match at root
|
||||
|
||||
texts = [t for t, _, _ in _items("@server")]
|
||||
|
||||
# Exact `server.py` beats `tui_gateway/server.py` (prefix match) — both
|
||||
# rank 1 on basename but exact basename wins on the sort key; shorter
|
||||
# rel path breaks ties.
|
||||
assert texts[0] == "@file:server.py", texts
|
||||
assert "@file:tui_gateway/server.py" in texts
|
||||
|
||||
|
||||
def test_fuzzy_camelcase_word_boundary(tmp_path, monkeypatch):
|
||||
"""Mid-basename camelCase pieces match without substring scanning."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
texts = [t for t, _, _ in _items("@Chrome")]
|
||||
|
||||
# `Chrome` starts a camelCase word inside `appChrome.tsx`.
|
||||
assert "@file:ui-tui/src/components/appChrome.tsx" in texts, texts
|
||||
|
||||
|
||||
def test_fuzzy_subsequence_catches_sparse_queries(tmp_path, monkeypatch):
|
||||
"""`@uCo` → `useCompletion.ts` via subsequence, last-resort tier."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
texts = [t for t, _, _ in _items("@uCo")]
|
||||
|
||||
assert "@file:ui-tui/src/hooks/useCompletion.ts" in texts, texts
|
||||
|
||||
|
||||
def test_fuzzy_at_file_prefix_preserved(tmp_path, monkeypatch):
|
||||
"""Explicit `@file:` prefix still wins the completion tag."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
texts = [t for t, _, _ in _items("@file:appChrome")]
|
||||
|
||||
assert "@file:ui-tui/src/components/appChrome.tsx" in texts, texts
|
||||
|
||||
|
||||
def test_fuzzy_skipped_when_path_has_slash(tmp_path, monkeypatch):
|
||||
"""Any `/` in the query = user is navigating; keep directory listing."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
texts = [t for t, _, _ in _items("@ui-tui/src/components/app")]
|
||||
|
||||
# Directory-listing mode prefixes with `@file:` / `@folder:` per entry.
|
||||
# It should only surface direct children of the named dir — not the
|
||||
# nested `useCompletion.ts`.
|
||||
assert any("appChrome.tsx" in t for t in texts), texts
|
||||
assert not any("useCompletion.ts" in t for t in texts), texts
|
||||
|
||||
|
||||
def test_fuzzy_skipped_when_folder_tag(tmp_path, monkeypatch):
|
||||
"""`@folder:<name>` still lists directories — fuzzy scanner only walks
|
||||
files (git-tracked + untracked), so defer to the dir-listing path."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
texts = [t for t, _, _ in _items("@folder:ui")]
|
||||
|
||||
# Root has `ui-tui/` as a directory; the listing branch should surface it.
|
||||
assert any(t.startswith("@folder:ui-tui") for t in texts), texts
|
||||
|
||||
|
||||
def test_fuzzy_hides_dotfiles_unless_asked(tmp_path, monkeypatch):
|
||||
"""`.env` doesn't leak into `@env` but does show for `@.env`."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
_nested_fixture(tmp_path)
|
||||
|
||||
assert not any(".env" in t for t, _, _ in _items("@env"))
|
||||
assert any(t.endswith(".env") for t, _, _ in _items("@.env"))
|
||||
|
||||
|
||||
def test_fuzzy_caps_results(tmp_path, monkeypatch):
|
||||
"""The 30-item cap survives a big tree."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
for i in range(60):
|
||||
(tmp_path / f"mod_{i:03d}.py").write_text("x")
|
||||
|
||||
items = _items("@mod")
|
||||
|
||||
assert len(items) == 30
|
||||
|
||||
|
||||
def test_fuzzy_paths_relative_to_cwd_inside_subdir(tmp_path, monkeypatch):
|
||||
"""When the gateway runs from a subdirectory of a git repo, fuzzy
|
||||
completion paths must resolve under that cwd — not under the repo root.
|
||||
|
||||
Without this, `@appChrome` from inside `apps/web/` would suggest
|
||||
`@file:apps/web/src/foo.tsx` but the agent (resolving from cwd) would
|
||||
look for `apps/web/apps/web/src/foo.tsx` and fail. We translate every
|
||||
`git ls-files` result back to a `relpath(root)` and drop anything
|
||||
outside `root` so the completion contract stays "paths are cwd-relative".
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
subprocess.run(["git", "init", "-q"], cwd=tmp_path, check=True)
|
||||
subprocess.run(["git", "config", "user.email", "test@example.com"], cwd=tmp_path, check=True)
|
||||
subprocess.run(["git", "config", "user.name", "test"], cwd=tmp_path, check=True)
|
||||
|
||||
(tmp_path / "apps" / "web" / "src").mkdir(parents=True)
|
||||
(tmp_path / "apps" / "web" / "src" / "appChrome.tsx").write_text("x")
|
||||
(tmp_path / "apps" / "api" / "src").mkdir(parents=True)
|
||||
(tmp_path / "apps" / "api" / "src" / "server.ts").write_text("x")
|
||||
(tmp_path / "README.md").write_text("x")
|
||||
|
||||
subprocess.run(["git", "add", "."], cwd=tmp_path, check=True)
|
||||
subprocess.run(["git", "commit", "-q", "-m", "init"], cwd=tmp_path, check=True)
|
||||
|
||||
# Run from `apps/web/` — completions should be relative to here, and
|
||||
# files outside this subtree (apps/api, README.md at root) shouldn't
|
||||
# appear at all.
|
||||
monkeypatch.chdir(tmp_path / "apps" / "web")
|
||||
|
||||
texts = [t for t, _, _ in _items("@appChrome")]
|
||||
|
||||
assert "@file:src/appChrome.tsx" in texts, texts
|
||||
assert not any("apps/web/" in t for t in texts), texts
|
||||
|
||||
server._fuzzy_cache.clear()
|
||||
other_texts = [t for t, _, _ in _items("@server")]
|
||||
|
||||
assert not any("server.ts" in t for t in other_texts), other_texts
|
||||
|
||||
server._fuzzy_cache.clear()
|
||||
readme_texts = [t for t, _, _ in _items("@README")]
|
||||
|
||||
assert not any("README.md" in t for t in readme_texts), readme_texts
|
||||
|
||||
@@ -68,3 +68,68 @@ def test_build_welcome_banner_uses_normalized_toolset_names():
|
||||
assert "homeassistant_tools:" not in output
|
||||
assert "honcho_tools:" not in output
|
||||
assert "web_tools:" not in output
|
||||
|
||||
|
||||
def test_build_welcome_banner_title_is_hyperlinked_to_release():
|
||||
"""Panel title (version label) is wrapped in an OSC-8 hyperlink to the GitHub release."""
|
||||
import io
|
||||
from unittest.mock import patch as _patch
|
||||
import hermes_cli.banner as _banner
|
||||
import model_tools as _mt
|
||||
import tools.mcp_tool as _mcp
|
||||
|
||||
_banner._latest_release_cache = None
|
||||
tag_url = ("v2026.4.23", "https://github.com/NousResearch/hermes-agent/releases/tag/v2026.4.23")
|
||||
|
||||
buf = io.StringIO()
|
||||
with (
|
||||
_patch.object(_mt, "check_tool_availability", return_value=(["web"], [])),
|
||||
_patch.object(_banner, "get_available_skills", return_value={}),
|
||||
_patch.object(_banner, "get_update_result", return_value=None),
|
||||
_patch.object(_mcp, "get_mcp_status", return_value=[]),
|
||||
_patch.object(_banner, "get_latest_release_tag", return_value=tag_url),
|
||||
):
|
||||
console = Console(file=buf, force_terminal=True, color_system="truecolor", width=160)
|
||||
_banner.build_welcome_banner(
|
||||
console=console, model="x", cwd="/tmp",
|
||||
session_id="abc123",
|
||||
tools=[{"function": {"name": "read_file"}}],
|
||||
get_toolset_for_tool=lambda n: "file",
|
||||
)
|
||||
|
||||
raw = buf.getvalue()
|
||||
# The existing version label must still be present in the title
|
||||
assert "Hermes Agent v" in raw, "Version label missing from title"
|
||||
# OSC-8 hyperlink escape sequence present with the release URL
|
||||
assert "\x1b]8;" in raw, "OSC-8 hyperlink not emitted"
|
||||
assert "releases/tag/v2026.4.23" in raw, "Release URL missing from banner output"
|
||||
|
||||
|
||||
def test_build_welcome_banner_title_falls_back_when_no_tag():
|
||||
"""Without a resolvable tag, the panel title renders as plain text (no hyperlink escape)."""
|
||||
import io
|
||||
from unittest.mock import patch as _patch
|
||||
import hermes_cli.banner as _banner
|
||||
import model_tools as _mt
|
||||
import tools.mcp_tool as _mcp
|
||||
|
||||
_banner._latest_release_cache = None
|
||||
buf = io.StringIO()
|
||||
with (
|
||||
_patch.object(_mt, "check_tool_availability", return_value=(["web"], [])),
|
||||
_patch.object(_banner, "get_available_skills", return_value={}),
|
||||
_patch.object(_banner, "get_update_result", return_value=None),
|
||||
_patch.object(_mcp, "get_mcp_status", return_value=[]),
|
||||
_patch.object(_banner, "get_latest_release_tag", return_value=None),
|
||||
):
|
||||
console = Console(file=buf, force_terminal=True, color_system="truecolor", width=160)
|
||||
_banner.build_welcome_banner(
|
||||
console=console, model="x", cwd="/tmp",
|
||||
session_id="abc123",
|
||||
tools=[{"function": {"name": "read_file"}}],
|
||||
get_toolset_for_tool=lambda n: "file",
|
||||
)
|
||||
|
||||
raw = buf.getvalue()
|
||||
assert "Hermes Agent v" in raw, "Version label missing from title"
|
||||
assert "\x1b]8;" not in raw, "OSC-8 hyperlink should not be emitted without a tag"
|
||||
|
||||
@@ -463,7 +463,7 @@ class TestPlatformToolsetConsistency:
|
||||
|
||||
gateway_includes = set(TOOLSETS["hermes-gateway"]["includes"])
|
||||
# Exclude non-messaging platforms from the check
|
||||
non_messaging = {"cli", "api_server"}
|
||||
non_messaging = {"cli", "api_server", "cron"}
|
||||
for platform, meta in PLATFORMS.items():
|
||||
if platform in non_messaging:
|
||||
continue
|
||||
@@ -601,3 +601,122 @@ class TestImagegenModelPicker:
|
||||
_configure_imagegen_model("fal", config)
|
||||
assert isinstance(config["image_gen"], dict)
|
||||
assert config["image_gen"]["model"] == "fal-ai/flux-2/klein/9b"
|
||||
|
||||
|
||||
def test_get_platform_tools_recovers_non_configurable_toolsets_from_composite():
|
||||
"""Non-configurable toolsets whose tools are in the composite but not in
|
||||
CONFIGURABLE_TOOLSETS should still appear in the result.
|
||||
"""
|
||||
from toolsets import TOOLSETS
|
||||
from hermes_cli.tools_config import PLATFORMS
|
||||
from unittest.mock import patch as mock_patch
|
||||
|
||||
fake_toolsets = dict(TOOLSETS)
|
||||
fake_toolsets["_test_platform_tool"] = {
|
||||
"description": "test",
|
||||
"tools": ["_test_special_tool"],
|
||||
"includes": [],
|
||||
}
|
||||
fake_toolsets["hermes-_test_platform"] = {
|
||||
"description": "test composite",
|
||||
"tools": ["web_search", "web_extract", "terminal", "process", "_test_special_tool"],
|
||||
"includes": [],
|
||||
}
|
||||
|
||||
test_platforms = {
|
||||
"_test_platform": {"label": "Test", "default_toolset": "hermes-_test_platform"},
|
||||
}
|
||||
|
||||
with mock_patch("hermes_cli.tools_config.PLATFORMS", {**PLATFORMS, **test_platforms}):
|
||||
with mock_patch("toolsets.TOOLSETS", fake_toolsets):
|
||||
enabled = _get_platform_tools({}, "_test_platform")
|
||||
|
||||
assert "_test_platform_tool" in enabled
|
||||
assert "web" in enabled
|
||||
assert "terminal" in enabled
|
||||
|
||||
|
||||
def test_get_platform_tools_second_pass_skips_fully_claimed_toolsets():
|
||||
"""Toolsets whose tools are fully covered by configurable keys should NOT
|
||||
be added by the second pass (prevents 'search', 'hermes-acp' noise).
|
||||
"""
|
||||
enabled = _get_platform_tools({}, "cli")
|
||||
|
||||
assert "search" not in enabled
|
||||
|
||||
|
||||
def test_get_platform_tools_discord_includes_discord_not_admin():
|
||||
enabled = _get_platform_tools({}, "discord")
|
||||
assert "discord" in enabled
|
||||
assert "discord_admin" not in enabled
|
||||
|
||||
|
||||
def test_discord_admin_in_configurable_toolsets():
|
||||
assert any(ts_key == "discord_admin" for ts_key, _, _ in CONFIGURABLE_TOOLSETS)
|
||||
|
||||
|
||||
def test_discord_admin_in_default_off():
|
||||
assert "discord_admin" in _DEFAULT_OFF_TOOLSETS
|
||||
|
||||
|
||||
def test_get_platform_tools_feishu_includes_doc_and_drive():
|
||||
enabled = _get_platform_tools({}, "feishu")
|
||||
assert "feishu_doc" in enabled
|
||||
assert "feishu_drive" in enabled
|
||||
|
||||
|
||||
def test_get_platform_tools_feishu_tools_not_on_other_platforms():
|
||||
for plat in ["cli", "telegram", "discord"]:
|
||||
enabled = _get_platform_tools({}, plat)
|
||||
assert "feishu_doc" not in enabled, f"feishu_doc leaked onto {plat}"
|
||||
assert "feishu_drive" not in enabled, f"feishu_drive leaked onto {plat}"
|
||||
|
||||
|
||||
def test_save_platform_tools_normalizes_numeric_entries():
|
||||
"""YAML may parse bare numeric toolset names as int. They should be
|
||||
normalized to str so they survive the save round-trip.
|
||||
"""
|
||||
config = {
|
||||
"platform_toolsets": {
|
||||
"cli": ["web", "terminal", 12306, "custom-mcp"]
|
||||
}
|
||||
}
|
||||
|
||||
with patch("hermes_cli.tools_config.save_config"):
|
||||
_save_platform_tools(config, "cli", {"web", "browser"})
|
||||
|
||||
saved = config["platform_toolsets"]["cli"]
|
||||
assert "12306" in saved
|
||||
assert 12306 not in saved
|
||||
|
||||
|
||||
def test_save_platform_tools_clears_stale_no_mcp():
|
||||
"""When the new selection doesn't include no_mcp, the sentinel should
|
||||
be stripped from preserved entries so MCP servers are re-enabled.
|
||||
"""
|
||||
config = {
|
||||
"platform_toolsets": {
|
||||
"cli": ["web", "terminal", "no_mcp"]
|
||||
}
|
||||
}
|
||||
|
||||
with patch("hermes_cli.tools_config.save_config"):
|
||||
_save_platform_tools(config, "cli", {"web", "browser"})
|
||||
|
||||
saved = config["platform_toolsets"]["cli"]
|
||||
assert "no_mcp" not in saved
|
||||
|
||||
|
||||
def test_save_platform_tools_preserves_explicit_no_mcp():
|
||||
"""When the new selection explicitly includes no_mcp, it should be kept."""
|
||||
config = {
|
||||
"platform_toolsets": {
|
||||
"cli": ["web", "no_mcp"]
|
||||
}
|
||||
}
|
||||
|
||||
with patch("hermes_cli.tools_config.save_config"):
|
||||
_save_platform_tools(config, "cli", {"web", "no_mcp"})
|
||||
|
||||
saved = config["platform_toolsets"]["cli"]
|
||||
assert "no_mcp" in saved
|
||||
|
||||
@@ -0,0 +1,255 @@
|
||||
"""Tests for ``hermes_cli.voice`` — the TUI gateway's voice wrapper.
|
||||
|
||||
The module is imported *lazily* by ``tui_gateway/server.py`` so that a
|
||||
box with missing audio deps fails at call time (returning a clean RPC
|
||||
error) rather than at gateway startup. These tests therefore only
|
||||
assert the public contract the gateway depends on: the three symbols
|
||||
exist, ``stop_and_transcribe`` is a no-op when nothing is recording,
|
||||
and ``speak_text`` tolerates empty input without touching the provider
|
||||
stack.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
|
||||
class TestPublicAPI:
|
||||
def test_gateway_symbols_importable(self):
|
||||
"""Match the exact import shape tui_gateway/server.py uses."""
|
||||
from hermes_cli.voice import (
|
||||
speak_text,
|
||||
start_recording,
|
||||
stop_and_transcribe,
|
||||
)
|
||||
|
||||
assert callable(start_recording)
|
||||
assert callable(stop_and_transcribe)
|
||||
assert callable(speak_text)
|
||||
|
||||
|
||||
class TestStopWithoutStart:
|
||||
def test_returns_none_when_no_recording_active(self, monkeypatch):
|
||||
"""Idempotent no-op: stop before start must not raise or touch state."""
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
monkeypatch.setattr(voice, "_recorder", None)
|
||||
|
||||
assert voice.stop_and_transcribe() is None
|
||||
|
||||
|
||||
class TestSpeakTextGuards:
|
||||
@pytest.mark.parametrize("text", ["", " ", "\n\t "])
|
||||
def test_empty_text_is_noop(self, text):
|
||||
"""Empty / whitespace-only text must return without importing tts_tool
|
||||
(the gateway spawns a thread per call, so a no-op on empty input
|
||||
keeps the thread pool from churning on trivial inputs)."""
|
||||
from hermes_cli.voice import speak_text
|
||||
|
||||
# Should simply return None without raising.
|
||||
assert speak_text(text) is None
|
||||
|
||||
|
||||
class TestContinuousAPI:
|
||||
"""Continuous (VAD) mode API — CLI-parity loop entry points."""
|
||||
|
||||
def test_continuous_exports(self):
|
||||
from hermes_cli.voice import (
|
||||
is_continuous_active,
|
||||
start_continuous,
|
||||
stop_continuous,
|
||||
)
|
||||
|
||||
assert callable(start_continuous)
|
||||
assert callable(stop_continuous)
|
||||
assert callable(is_continuous_active)
|
||||
|
||||
def test_not_active_by_default(self, monkeypatch):
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
# Isolate from any state left behind by other tests in the session.
|
||||
monkeypatch.setattr(voice, "_continuous_active", False)
|
||||
monkeypatch.setattr(voice, "_continuous_recorder", None)
|
||||
|
||||
assert voice.is_continuous_active() is False
|
||||
|
||||
def test_stop_continuous_idempotent_when_inactive(self, monkeypatch):
|
||||
"""stop_continuous must not raise when no loop is active — the
|
||||
gateway's voice.toggle off path calls it unconditionally."""
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
monkeypatch.setattr(voice, "_continuous_active", False)
|
||||
monkeypatch.setattr(voice, "_continuous_recorder", None)
|
||||
|
||||
# Should return cleanly without exceptions
|
||||
assert voice.stop_continuous() is None
|
||||
assert voice.is_continuous_active() is False
|
||||
|
||||
def test_double_start_is_idempotent(self, monkeypatch):
|
||||
"""A second start_continuous while already active is a no-op — prevents
|
||||
two overlapping capture threads fighting over the microphone when the
|
||||
UI double-fires (e.g. both /voice on and Ctrl+B within the same tick)."""
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
monkeypatch.setattr(voice, "_continuous_active", True)
|
||||
called = {"n": 0}
|
||||
|
||||
class FakeRecorder:
|
||||
def start(self, on_silence_stop=None):
|
||||
called["n"] += 1
|
||||
|
||||
def cancel(self):
|
||||
pass
|
||||
|
||||
monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder())
|
||||
|
||||
voice.start_continuous(on_transcript=lambda _t: None)
|
||||
|
||||
# The guard inside start_continuous short-circuits before rec.start()
|
||||
assert called["n"] == 0
|
||||
|
||||
|
||||
class TestContinuousLoopSimulation:
|
||||
"""End-to-end simulation of the VAD loop with a fake recorder.
|
||||
|
||||
Proves auto-restart works: the silence callback must trigger transcribe →
|
||||
on_transcript → re-call rec.start(on_silence_stop=same_cb). Also covers
|
||||
the 3-strikes no-speech halt.
|
||||
"""
|
||||
|
||||
@pytest.fixture
|
||||
def fake_recorder(self, monkeypatch):
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
# Reset module state between tests.
|
||||
monkeypatch.setattr(voice, "_continuous_active", False)
|
||||
monkeypatch.setattr(voice, "_continuous_recorder", None)
|
||||
monkeypatch.setattr(voice, "_continuous_no_speech_count", 0)
|
||||
monkeypatch.setattr(voice, "_continuous_on_transcript", None)
|
||||
monkeypatch.setattr(voice, "_continuous_on_status", None)
|
||||
monkeypatch.setattr(voice, "_continuous_on_silent_limit", None)
|
||||
|
||||
class FakeRecorder:
|
||||
_silence_threshold = 200
|
||||
_silence_duration = 3.0
|
||||
is_recording = False
|
||||
|
||||
def __init__(self):
|
||||
self.start_calls = 0
|
||||
self.last_callback = None
|
||||
self.stopped = 0
|
||||
self.cancelled = 0
|
||||
# Preset WAV path returned by stop()
|
||||
self.next_stop_wav = "/tmp/fake.wav"
|
||||
|
||||
def start(self, on_silence_stop=None):
|
||||
self.start_calls += 1
|
||||
self.last_callback = on_silence_stop
|
||||
self.is_recording = True
|
||||
|
||||
def stop(self):
|
||||
self.stopped += 1
|
||||
self.is_recording = False
|
||||
return self.next_stop_wav
|
||||
|
||||
def cancel(self):
|
||||
self.cancelled += 1
|
||||
self.is_recording = False
|
||||
|
||||
rec = FakeRecorder()
|
||||
monkeypatch.setattr(voice, "create_audio_recorder", lambda: rec)
|
||||
# Skip real file ops in the silence callback.
|
||||
monkeypatch.setattr(voice.os.path, "isfile", lambda _p: False)
|
||||
return rec
|
||||
|
||||
def test_loop_auto_restarts_after_transcript(self, fake_recorder, monkeypatch):
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
monkeypatch.setattr(
|
||||
voice,
|
||||
"transcribe_recording",
|
||||
lambda _p: {"success": True, "transcript": "hello world"},
|
||||
)
|
||||
monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
|
||||
|
||||
transcripts = []
|
||||
statuses = []
|
||||
|
||||
voice.start_continuous(
|
||||
on_transcript=lambda t: transcripts.append(t),
|
||||
on_status=lambda s: statuses.append(s),
|
||||
)
|
||||
|
||||
assert fake_recorder.start_calls == 1
|
||||
assert statuses == ["listening"]
|
||||
|
||||
# Simulate AudioRecorder's silence detector firing.
|
||||
fake_recorder.last_callback()
|
||||
|
||||
assert transcripts == ["hello world"]
|
||||
assert fake_recorder.start_calls == 2 # auto-restarted
|
||||
assert statuses == ["listening", "transcribing", "listening"]
|
||||
assert voice.is_continuous_active() is True
|
||||
|
||||
voice.stop_continuous()
|
||||
|
||||
def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch):
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
# Transcription returns no speech — fake_recorder.stop() returns the
|
||||
# path, but transcribe returns empty text, counting as silence.
|
||||
monkeypatch.setattr(
|
||||
voice,
|
||||
"transcribe_recording",
|
||||
lambda _p: {"success": True, "transcript": ""},
|
||||
)
|
||||
monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
|
||||
|
||||
transcripts = []
|
||||
silent_limit_fired = []
|
||||
|
||||
voice.start_continuous(
|
||||
on_transcript=lambda t: transcripts.append(t),
|
||||
on_silent_limit=lambda: silent_limit_fired.append(True),
|
||||
)
|
||||
|
||||
# Fire silence callback 3 times
|
||||
for _ in range(3):
|
||||
fake_recorder.last_callback()
|
||||
|
||||
assert transcripts == []
|
||||
assert silent_limit_fired == [True]
|
||||
assert voice.is_continuous_active() is False
|
||||
assert fake_recorder.cancelled >= 1
|
||||
|
||||
def test_stop_during_transcription_discards_restart(self, fake_recorder, monkeypatch):
|
||||
"""User hits Ctrl+B mid-transcription: the in-flight transcript must
|
||||
still fire (it's a real utterance), but the loop must NOT restart."""
|
||||
import hermes_cli.voice as voice
|
||||
|
||||
stop_triggered = {"flag": False}
|
||||
|
||||
def late_transcribe(_p):
|
||||
# Simulate stop_continuous arriving while we're inside transcribe
|
||||
voice.stop_continuous()
|
||||
stop_triggered["flag"] = True
|
||||
return {"success": True, "transcript": "final word"}
|
||||
|
||||
monkeypatch.setattr(voice, "transcribe_recording", late_transcribe)
|
||||
monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
|
||||
|
||||
transcripts = []
|
||||
voice.start_continuous(on_transcript=lambda t: transcripts.append(t))
|
||||
|
||||
initial_starts = fake_recorder.start_calls # 1
|
||||
fake_recorder.last_callback()
|
||||
|
||||
assert stop_triggered["flag"] is True
|
||||
# Loop is stopped — no auto-restart
|
||||
assert fake_recorder.start_calls == initial_starts
|
||||
# The in-flight transcript was suppressed because we stopped mid-flight
|
||||
assert transcripts == []
|
||||
assert voice.is_continuous_active() is False
|
||||
@@ -134,6 +134,31 @@ class TestCoerceValue:
|
||||
"""A non-numeric string in [number, string] should stay a string."""
|
||||
assert _coerce_value("hello", ["number", "string"]) == "hello"
|
||||
|
||||
def test_array_type_parsed_from_json_string(self):
|
||||
"""Stringified JSON arrays are parsed into native lists."""
|
||||
assert _coerce_value('["a", "b"]', "array") == ["a", "b"]
|
||||
assert _coerce_value("[1, 2, 3]", "array") == [1, 2, 3]
|
||||
|
||||
def test_object_type_parsed_from_json_string(self):
|
||||
"""Stringified JSON objects are parsed into native dicts."""
|
||||
assert _coerce_value('{"k": "v"}', "object") == {"k": "v"}
|
||||
assert _coerce_value('{"n": 1}', "object") == {"n": 1}
|
||||
|
||||
def test_array_invalid_json_preserved(self):
|
||||
"""Unparseable strings are returned unchanged."""
|
||||
assert _coerce_value("not-json", "array") == "not-json"
|
||||
|
||||
def test_object_invalid_json_preserved(self):
|
||||
assert _coerce_value("not-json", "object") == "not-json"
|
||||
|
||||
def test_array_type_wrong_shape_preserved(self):
|
||||
"""A JSON object passed for an 'array' slot is preserved as a string."""
|
||||
assert _coerce_value('{"k": "v"}', "array") == '{"k": "v"}'
|
||||
|
||||
def test_object_type_wrong_shape_preserved(self):
|
||||
"""A JSON array passed for an 'object' slot is preserved as a string."""
|
||||
assert _coerce_value('["a"]', "object") == '["a"]'
|
||||
|
||||
|
||||
# ── Full coerce_tool_args with registry ───────────────────────────────────
|
||||
|
||||
@@ -212,6 +237,32 @@ class TestCoerceToolArgs:
|
||||
assert result["items"] == [1, 2, 3]
|
||||
assert result["config"] == {"key": "val"}
|
||||
|
||||
def test_coerces_stringified_array_arg(self):
|
||||
"""Regression for #3947 — MCP servers using z.array() expect lists, not strings."""
|
||||
schema = self._mock_schema({
|
||||
"messageIds": {"type": "array", "items": {"type": "string"}},
|
||||
})
|
||||
with patch("model_tools.registry.get_schema", return_value=schema):
|
||||
args = {"messageIds": '["abc", "def"]'}
|
||||
result = coerce_tool_args("test_tool", args)
|
||||
assert result["messageIds"] == ["abc", "def"]
|
||||
|
||||
def test_coerces_stringified_object_arg(self):
|
||||
"""Stringified JSON objects get parsed into dicts."""
|
||||
schema = self._mock_schema({"config": {"type": "object"}})
|
||||
with patch("model_tools.registry.get_schema", return_value=schema):
|
||||
args = {"config": '{"max": 50}'}
|
||||
result = coerce_tool_args("test_tool", args)
|
||||
assert result["config"] == {"max": 50}
|
||||
|
||||
def test_invalid_json_array_preserved_as_string(self):
|
||||
"""If the string isn't valid JSON, pass it through — let the tool decide."""
|
||||
schema = self._mock_schema({"items": {"type": "array"}})
|
||||
with patch("model_tools.registry.get_schema", return_value=schema):
|
||||
args = {"items": "not-json"}
|
||||
result = coerce_tool_args("test_tool", args)
|
||||
assert result["items"] == "not-json"
|
||||
|
||||
def test_extra_args_without_schema_left_alone(self):
|
||||
"""Args not in the schema properties are not touched."""
|
||||
schema = self._mock_schema({"limit": {"type": "integer"}})
|
||||
|
||||
@@ -200,8 +200,8 @@ class TestToolsetConsistency:
|
||||
def test_hermes_platforms_share_core_tools(self):
|
||||
"""All hermes-* platform toolsets share the same core tools.
|
||||
|
||||
Platform-specific additions (e.g. ``discord_server`` on
|
||||
hermes-discord, gated on DISCORD_BOT_TOKEN) are allowed on top —
|
||||
Platform-specific additions (e.g. ``discord`` / ``discord_admin``
|
||||
on hermes-discord, gated on DISCORD_BOT_TOKEN) are allowed on top —
|
||||
the invariant is that the core set is identical across platforms.
|
||||
"""
|
||||
platforms = ["hermes-cli", "hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack", "hermes-signal", "hermes-homeassistant"]
|
||||
|
||||
@@ -0,0 +1,563 @@
|
||||
"""Integration tests for tools.browser_supervisor.
|
||||
|
||||
Exercises the supervisor end-to-end against a real local Chrome
|
||||
(``--remote-debugging-port``). Skipped when Chrome is not installed
|
||||
— these are the tests that actually verify the CDP wire protocol
|
||||
works, since mock-CDP unit tests can only prove the happy paths we
|
||||
thought to model.
|
||||
|
||||
Run manually:
|
||||
scripts/run_tests.sh tests/tools/test_browser_supervisor.py
|
||||
|
||||
Automated: skipped in CI unless ``HERMES_E2E_BROWSER=1`` is set.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not shutil.which("google-chrome") and not shutil.which("chromium"),
|
||||
reason="Chrome/Chromium not installed",
|
||||
)
|
||||
|
||||
|
||||
def _find_chrome() -> str:
|
||||
for candidate in ("google-chrome", "chromium", "chromium-browser"):
|
||||
path = shutil.which(candidate)
|
||||
if path:
|
||||
return path
|
||||
pytest.skip("no Chrome binary found")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def chrome_cdp(worker_id):
|
||||
"""Start a headless Chrome with --remote-debugging-port, yield its WS URL.
|
||||
|
||||
Uses a unique port per xdist worker to avoid cross-worker collisions.
|
||||
Always launches with ``--site-per-process`` so cross-origin iframes
|
||||
become real OOPIFs (needed by the iframe interaction tests).
|
||||
"""
|
||||
import socket
|
||||
|
||||
# xdist worker_id is "master" in single-process mode or "gw0".."gwN" otherwise.
|
||||
if worker_id == "master":
|
||||
port_offset = 0
|
||||
else:
|
||||
port_offset = int(worker_id.lstrip("gw"))
|
||||
port = 9225 + port_offset
|
||||
profile = tempfile.mkdtemp(prefix="hermes-supervisor-test-")
|
||||
proc = subprocess.Popen(
|
||||
[
|
||||
_find_chrome(),
|
||||
f"--remote-debugging-port={port}",
|
||||
f"--user-data-dir={profile}",
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--headless=new",
|
||||
"--disable-gpu",
|
||||
"--site-per-process", # force OOPIFs for cross-origin iframes
|
||||
],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
ws_url = None
|
||||
deadline = time.monotonic() + 15
|
||||
while time.monotonic() < deadline:
|
||||
try:
|
||||
import urllib.request
|
||||
with urllib.request.urlopen(
|
||||
f"http://127.0.0.1:{port}/json/version", timeout=1
|
||||
) as r:
|
||||
info = json.loads(r.read().decode())
|
||||
ws_url = info["webSocketDebuggerUrl"]
|
||||
break
|
||||
except Exception:
|
||||
time.sleep(0.25)
|
||||
if ws_url is None:
|
||||
proc.terminate()
|
||||
proc.wait(timeout=5)
|
||||
shutil.rmtree(profile, ignore_errors=True)
|
||||
pytest.skip("Chrome didn't expose CDP in time")
|
||||
|
||||
yield ws_url, port
|
||||
|
||||
proc.terminate()
|
||||
try:
|
||||
proc.wait(timeout=3)
|
||||
except Exception:
|
||||
proc.kill()
|
||||
shutil.rmtree(profile, ignore_errors=True)
|
||||
|
||||
|
||||
def _test_page_url() -> str:
|
||||
html = """<!doctype html>
|
||||
<html><head><title>Supervisor pytest</title></head><body>
|
||||
<h1>Supervisor pytest</h1>
|
||||
<iframe id="inner" srcdoc="<body><h2>frame-marker</h2></body>" width="400" height="100"></iframe>
|
||||
</body></html>"""
|
||||
return "data:text/html;base64," + base64.b64encode(html.encode()).decode()
|
||||
|
||||
|
||||
def _fire_on_page(cdp_url: str, expression: str) -> None:
|
||||
"""Navigate the first page target to a data URL and fire `expression`."""
|
||||
import asyncio
|
||||
import websockets as _ws_mod
|
||||
|
||||
async def run():
|
||||
async with _ws_mod.connect(cdp_url, max_size=50 * 1024 * 1024) as ws:
|
||||
next_id = [1]
|
||||
|
||||
async def call(method, params=None, session_id=None):
|
||||
cid = next_id[0]
|
||||
next_id[0] += 1
|
||||
p = {"id": cid, "method": method}
|
||||
if params:
|
||||
p["params"] = params
|
||||
if session_id:
|
||||
p["sessionId"] = session_id
|
||||
await ws.send(json.dumps(p))
|
||||
async for raw in ws:
|
||||
m = json.loads(raw)
|
||||
if m.get("id") == cid:
|
||||
return m
|
||||
|
||||
targets = (await call("Target.getTargets"))["result"]["targetInfos"]
|
||||
page = next(t for t in targets if t.get("type") == "page")
|
||||
attach = await call(
|
||||
"Target.attachToTarget", {"targetId": page["targetId"], "flatten": True}
|
||||
)
|
||||
sid = attach["result"]["sessionId"]
|
||||
await call("Page.navigate", {"url": _test_page_url()}, session_id=sid)
|
||||
await asyncio.sleep(1.5) # let the page load
|
||||
await call(
|
||||
"Runtime.evaluate",
|
||||
{"expression": expression, "returnByValue": True},
|
||||
session_id=sid,
|
||||
)
|
||||
|
||||
asyncio.run(run())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def supervisor_registry():
|
||||
"""Yield the global registry and tear down any supervisors after the test."""
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY
|
||||
|
||||
yield SUPERVISOR_REGISTRY
|
||||
SUPERVISOR_REGISTRY.stop_all()
|
||||
|
||||
|
||||
def _wait_for_dialog(supervisor, timeout: float = 5.0):
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
snap = supervisor.snapshot()
|
||||
if snap.pending_dialogs:
|
||||
return snap.pending_dialogs
|
||||
time.sleep(0.1)
|
||||
return ()
|
||||
|
||||
|
||||
def test_supervisor_start_and_snapshot(chrome_cdp, supervisor_registry):
|
||||
"""Supervisor attaches, exposes an active snapshot with a top frame."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-1", cdp_url=cdp_url)
|
||||
|
||||
# Navigate so the frame tree populates.
|
||||
_fire_on_page(cdp_url, "/* no dialog */ void 0")
|
||||
|
||||
# Give a moment for frame events to propagate
|
||||
time.sleep(1.0)
|
||||
snap = supervisor.snapshot()
|
||||
assert snap.active is True
|
||||
assert snap.task_id == "pytest-1"
|
||||
assert snap.pending_dialogs == ()
|
||||
# At minimum a top frame should exist after the navigate.
|
||||
assert snap.frame_tree.get("top") is not None
|
||||
|
||||
|
||||
def test_main_frame_alert_detection_and_dismiss(chrome_cdp, supervisor_registry):
|
||||
"""alert() in the main frame surfaces and can be dismissed via the sync API."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-2", cdp_url=cdp_url)
|
||||
|
||||
_fire_on_page(cdp_url, "setTimeout(() => alert('PYTEST-MAIN-ALERT'), 50)")
|
||||
dialogs = _wait_for_dialog(supervisor)
|
||||
assert dialogs, "no dialog detected"
|
||||
d = dialogs[0]
|
||||
assert d.type == "alert"
|
||||
assert "PYTEST-MAIN-ALERT" in d.message
|
||||
|
||||
result = supervisor.respond_to_dialog("dismiss")
|
||||
assert result["ok"] is True
|
||||
# State cleared after dismiss
|
||||
time.sleep(0.3)
|
||||
assert supervisor.snapshot().pending_dialogs == ()
|
||||
|
||||
|
||||
def test_iframe_contentwindow_alert(chrome_cdp, supervisor_registry):
|
||||
"""alert() fired from inside a same-origin iframe surfaces too."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-3", cdp_url=cdp_url)
|
||||
|
||||
_fire_on_page(
|
||||
cdp_url,
|
||||
"setTimeout(() => document.querySelector('#inner').contentWindow.alert('PYTEST-IFRAME'), 50)",
|
||||
)
|
||||
dialogs = _wait_for_dialog(supervisor)
|
||||
assert dialogs, "no iframe dialog detected"
|
||||
assert any("PYTEST-IFRAME" in d.message for d in dialogs)
|
||||
|
||||
result = supervisor.respond_to_dialog("accept")
|
||||
assert result["ok"] is True
|
||||
|
||||
|
||||
def test_prompt_dialog_with_response_text(chrome_cdp, supervisor_registry):
|
||||
"""prompt() gets our prompt_text back inside the page."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-4", cdp_url=cdp_url)
|
||||
|
||||
# Fire a prompt and stash the answer on window
|
||||
_fire_on_page(
|
||||
cdp_url,
|
||||
"setTimeout(() => { window.__promptResult = prompt('give me a token', 'default-x'); }, 50)",
|
||||
)
|
||||
dialogs = _wait_for_dialog(supervisor)
|
||||
assert dialogs
|
||||
d = dialogs[0]
|
||||
assert d.type == "prompt"
|
||||
assert d.default_prompt == "default-x"
|
||||
|
||||
result = supervisor.respond_to_dialog("accept", prompt_text="PYTEST-PROMPT-REPLY")
|
||||
assert result["ok"] is True
|
||||
|
||||
|
||||
def test_respond_with_no_pending_dialog_errors_cleanly(chrome_cdp, supervisor_registry):
|
||||
"""Calling respond_to_dialog when nothing is pending returns a clean error, not an exception."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-5", cdp_url=cdp_url)
|
||||
|
||||
result = supervisor.respond_to_dialog("accept")
|
||||
assert result["ok"] is False
|
||||
assert "no dialog" in result["error"].lower()
|
||||
|
||||
|
||||
def test_auto_dismiss_policy(chrome_cdp, supervisor_registry):
|
||||
"""auto_dismiss policy clears dialogs without the agent responding."""
|
||||
from tools.browser_supervisor import DIALOG_POLICY_AUTO_DISMISS
|
||||
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(
|
||||
task_id="pytest-6",
|
||||
cdp_url=cdp_url,
|
||||
dialog_policy=DIALOG_POLICY_AUTO_DISMISS,
|
||||
)
|
||||
|
||||
_fire_on_page(cdp_url, "setTimeout(() => alert('PYTEST-AUTO-DISMISS'), 50)")
|
||||
# Give the supervisor a moment to see + auto-dismiss
|
||||
time.sleep(2.0)
|
||||
snap = supervisor.snapshot()
|
||||
# Nothing pending because auto-dismiss cleared it immediately
|
||||
assert snap.pending_dialogs == ()
|
||||
|
||||
|
||||
def test_registry_idempotent_get_or_start(chrome_cdp, supervisor_registry):
|
||||
"""Calling get_or_start twice with the same (task, url) returns the same instance."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
a = supervisor_registry.get_or_start(task_id="pytest-idem", cdp_url=cdp_url)
|
||||
b = supervisor_registry.get_or_start(task_id="pytest-idem", cdp_url=cdp_url)
|
||||
assert a is b
|
||||
|
||||
|
||||
def test_registry_stop(chrome_cdp, supervisor_registry):
|
||||
"""stop() tears down the supervisor and snapshot reports inactive."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-stop", cdp_url=cdp_url)
|
||||
assert supervisor.snapshot().active is True
|
||||
supervisor_registry.stop("pytest-stop")
|
||||
# Post-stop snapshot reports inactive; supervisor obj may still exist
|
||||
assert supervisor.snapshot().active is False
|
||||
|
||||
|
||||
def test_browser_dialog_tool_no_supervisor():
|
||||
"""browser_dialog returns a clear error when no supervisor is attached."""
|
||||
from tools.browser_dialog_tool import browser_dialog
|
||||
|
||||
r = json.loads(browser_dialog(action="accept", task_id="nonexistent-task"))
|
||||
assert r["success"] is False
|
||||
assert "No CDP supervisor" in r["error"]
|
||||
|
||||
|
||||
def test_browser_dialog_invalid_action(chrome_cdp, supervisor_registry):
|
||||
"""browser_dialog rejects actions that aren't accept/dismiss."""
|
||||
from tools.browser_dialog_tool import browser_dialog
|
||||
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor_registry.get_or_start(task_id="pytest-bad-action", cdp_url=cdp_url)
|
||||
|
||||
r = json.loads(browser_dialog(action="eat", task_id="pytest-bad-action"))
|
||||
assert r["success"] is False
|
||||
assert "accept" in r["error"] and "dismiss" in r["error"]
|
||||
|
||||
|
||||
def test_recent_dialogs_ring_buffer(chrome_cdp, supervisor_registry):
|
||||
"""Closed dialogs show up in recent_dialogs with a closed_by tag."""
|
||||
from tools.browser_supervisor import DIALOG_POLICY_AUTO_DISMISS
|
||||
|
||||
cdp_url, _port = chrome_cdp
|
||||
sv = supervisor_registry.get_or_start(
|
||||
task_id="pytest-recent",
|
||||
cdp_url=cdp_url,
|
||||
dialog_policy=DIALOG_POLICY_AUTO_DISMISS,
|
||||
)
|
||||
|
||||
_fire_on_page(cdp_url, "setTimeout(() => alert('PYTEST-RECENT'), 50)")
|
||||
# Wait for auto-dismiss to cycle the dialog through
|
||||
deadline = time.time() + 5
|
||||
while time.time() < deadline:
|
||||
recent = sv.snapshot().recent_dialogs
|
||||
if recent and any("PYTEST-RECENT" in r.message for r in recent):
|
||||
break
|
||||
time.sleep(0.1)
|
||||
|
||||
recent = sv.snapshot().recent_dialogs
|
||||
assert recent, "recent_dialogs should contain the auto-dismissed dialog"
|
||||
match = next((r for r in recent if "PYTEST-RECENT" in r.message), None)
|
||||
assert match is not None
|
||||
assert match.type == "alert"
|
||||
assert match.closed_by == "auto_policy"
|
||||
assert match.closed_at >= match.opened_at
|
||||
|
||||
|
||||
def test_browser_dialog_tool_end_to_end(chrome_cdp, supervisor_registry):
|
||||
"""Full agent-path check: fire an alert, call the tool handler directly."""
|
||||
from tools.browser_dialog_tool import browser_dialog
|
||||
|
||||
cdp_url, _port = chrome_cdp
|
||||
supervisor = supervisor_registry.get_or_start(task_id="pytest-tool", cdp_url=cdp_url)
|
||||
|
||||
_fire_on_page(cdp_url, "setTimeout(() => alert('PYTEST-TOOL-END2END'), 50)")
|
||||
assert _wait_for_dialog(supervisor), "no dialog detected via wait_for_dialog"
|
||||
|
||||
r = json.loads(browser_dialog(action="dismiss", task_id="pytest-tool"))
|
||||
assert r["success"] is True
|
||||
assert r["action"] == "dismiss"
|
||||
assert "PYTEST-TOOL-END2END" in r["dialog"]["message"]
|
||||
|
||||
|
||||
def test_browser_cdp_frame_id_routes_via_supervisor(chrome_cdp, supervisor_registry, monkeypatch):
|
||||
"""browser_cdp(frame_id=...) routes Runtime.evaluate through supervisor.
|
||||
|
||||
Mocks the supervisor with a known frame and verifies browser_cdp sends
|
||||
the call via the supervisor's loop rather than opening a stateless
|
||||
WebSocket. This is the path that makes cross-origin iframe eval work
|
||||
on Browserbase.
|
||||
"""
|
||||
cdp_url, _port = chrome_cdp
|
||||
sv = supervisor_registry.get_or_start(task_id="frame-id-test", cdp_url=cdp_url)
|
||||
assert sv.snapshot().active
|
||||
|
||||
# Inject a fake OOPIF frame pointing at the SUPERVISOR's own page session
|
||||
# so we can verify routing. We fake is_oopif=True so the code path
|
||||
# treats it as an OOPIF child.
|
||||
import tools.browser_supervisor as _bs
|
||||
with sv._state_lock:
|
||||
fake_frame_id = "FAKE-FRAME-001"
|
||||
sv._frames[fake_frame_id] = _bs.FrameInfo(
|
||||
frame_id=fake_frame_id,
|
||||
url="fake://",
|
||||
origin="",
|
||||
parent_frame_id=None,
|
||||
is_oopif=True,
|
||||
cdp_session_id=sv._page_session_id, # route at page scope
|
||||
)
|
||||
|
||||
# Route the tool through the supervisor. Should succeed and return
|
||||
# something that clearly came from CDP.
|
||||
from tools.browser_cdp_tool import browser_cdp
|
||||
result = browser_cdp(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression": "1 + 1", "returnByValue": True},
|
||||
frame_id=fake_frame_id,
|
||||
task_id="frame-id-test",
|
||||
)
|
||||
r = json.loads(result)
|
||||
assert r.get("success") is True, f"expected success, got: {r}"
|
||||
assert r.get("frame_id") == fake_frame_id
|
||||
assert r.get("session_id") == sv._page_session_id
|
||||
value = r.get("result", {}).get("result", {}).get("value")
|
||||
assert value == 2, f"expected 2, got {value!r}"
|
||||
|
||||
|
||||
def test_browser_cdp_frame_id_real_oopif_smoke_documented():
|
||||
"""Document that real-OOPIF E2E was manually verified — see PR #14540.
|
||||
|
||||
A pytest version of this hits an asyncio version-quirk in the venv
|
||||
(3.11) that doesn't show up in standalone scripts (3.13 + system
|
||||
websockets). The mechanism IS verified end-to-end by two separate
|
||||
smoke scripts in /tmp/dialog-iframe-test/:
|
||||
|
||||
* smoke_local_oopif.py — local Chrome + 2 http servers on
|
||||
different hostnames + --site-per-process. Outer page on
|
||||
localhost:18905, iframe src=http://127.0.0.1:18906. Calls
|
||||
browser_cdp(method='Runtime.evaluate', frame_id=<OOPIF>) and
|
||||
verifies inner page's title comes back from the OOPIF session.
|
||||
PASSED on 2026-04-23: iframe document.title = 'INNER-FRAME-XYZ'
|
||||
|
||||
* smoke_bb_iframe_agent_path.py — Browserbase + real cross-origin
|
||||
iframe (src=https://example.com/). Same browser_cdp(frame_id=)
|
||||
path. PASSED on 2026-04-23: iframe document.title =
|
||||
'Example Domain'
|
||||
|
||||
The test_browser_cdp_frame_id_routes_via_supervisor pytest covers
|
||||
the supervisor-routing plumbing with a fake injected OOPIF.
|
||||
"""
|
||||
pytest.skip(
|
||||
"Real-OOPIF E2E verified manually with smoke_local_oopif.py and "
|
||||
"smoke_bb_iframe_agent_path.py — pytest version hits an asyncio "
|
||||
"version quirk between venv (3.11) and standalone (3.13). "
|
||||
"Smoke logs preserved in /tmp/dialog-iframe-test/."
|
||||
)
|
||||
|
||||
|
||||
def test_browser_cdp_frame_id_missing_supervisor():
|
||||
"""browser_cdp(frame_id=...) errors cleanly when no supervisor is attached."""
|
||||
from tools.browser_cdp_tool import browser_cdp
|
||||
result = browser_cdp(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression": "1"},
|
||||
frame_id="any-frame-id",
|
||||
task_id="no-such-task",
|
||||
)
|
||||
r = json.loads(result)
|
||||
assert r.get("success") is not True
|
||||
assert "supervisor" in (r.get("error") or "").lower()
|
||||
|
||||
|
||||
def test_browser_cdp_frame_id_not_in_frame_tree(chrome_cdp, supervisor_registry):
|
||||
"""browser_cdp(frame_id=...) errors when the frame_id isn't known."""
|
||||
cdp_url, _port = chrome_cdp
|
||||
sv = supervisor_registry.get_or_start(task_id="bad-frame-test", cdp_url=cdp_url)
|
||||
assert sv.snapshot().active
|
||||
|
||||
from tools.browser_cdp_tool import browser_cdp
|
||||
result = browser_cdp(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression": "1"},
|
||||
frame_id="nonexistent-frame",
|
||||
task_id="bad-frame-test",
|
||||
)
|
||||
r = json.loads(result)
|
||||
assert r.get("success") is not True
|
||||
assert "not found" in (r.get("error") or "").lower()
|
||||
|
||||
|
||||
def test_bridge_captures_prompt_and_returns_reply_text(chrome_cdp, supervisor_registry):
|
||||
"""End-to-end: agent's prompt_text round-trips INTO the page's JS.
|
||||
|
||||
Proves the bridge isn't just catching dialogs — it's properly round-
|
||||
tripping our reply back into the page via Fetch.fulfillRequest, so
|
||||
``prompt()`` actually returns the agent-supplied string to the page.
|
||||
"""
|
||||
import base64 as _b64
|
||||
|
||||
cdp_url, _port = chrome_cdp
|
||||
sv = supervisor_registry.get_or_start(task_id="pytest-bridge-prompt", cdp_url=cdp_url)
|
||||
|
||||
# Page fires prompt and stashes the return value on window.
|
||||
html = """<!doctype html><html><body><script>
|
||||
window.__ret = null;
|
||||
setTimeout(() => { window.__ret = prompt('PROMPT-MSG', 'default'); }, 50);
|
||||
</script></body></html>"""
|
||||
url = "data:text/html;base64," + _b64.b64encode(html.encode()).decode()
|
||||
|
||||
import asyncio as _asyncio
|
||||
import websockets as _ws_mod
|
||||
|
||||
async def nav_and_read():
|
||||
async with _ws_mod.connect(cdp_url, max_size=50 * 1024 * 1024) as ws:
|
||||
nid = [1]
|
||||
pending: dict = {}
|
||||
|
||||
async def reader_fn():
|
||||
try:
|
||||
async for raw in ws:
|
||||
m = json.loads(raw)
|
||||
if "id" in m:
|
||||
fut = pending.pop(m["id"], None)
|
||||
if fut and not fut.done():
|
||||
fut.set_result(m)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
rd = _asyncio.create_task(reader_fn())
|
||||
|
||||
async def call(method, params=None, sid=None):
|
||||
c = nid[0]; nid[0] += 1
|
||||
p = {"id": c, "method": method}
|
||||
if params: p["params"] = params
|
||||
if sid: p["sessionId"] = sid
|
||||
fut = _asyncio.get_event_loop().create_future()
|
||||
pending[c] = fut
|
||||
await ws.send(json.dumps(p))
|
||||
return await _asyncio.wait_for(fut, timeout=20)
|
||||
|
||||
try:
|
||||
t = (await call("Target.getTargets"))["result"]["targetInfos"]
|
||||
pg = next(x for x in t if x.get("type") == "page")
|
||||
a = await call("Target.attachToTarget", {"targetId": pg["targetId"], "flatten": True})
|
||||
sid = a["result"]["sessionId"]
|
||||
|
||||
# Fire navigate but don't await — prompt() blocks the page
|
||||
nav_id = nid[0]; nid[0] += 1
|
||||
nav_fut = _asyncio.get_event_loop().create_future()
|
||||
pending[nav_id] = nav_fut
|
||||
await ws.send(json.dumps({"id": nav_id, "method": "Page.navigate", "params": {"url": url}, "sessionId": sid}))
|
||||
|
||||
# Wait for supervisor to see the prompt
|
||||
deadline = time.monotonic() + 10
|
||||
dialog = None
|
||||
while time.monotonic() < deadline:
|
||||
snap = sv.snapshot()
|
||||
if snap.pending_dialogs:
|
||||
dialog = snap.pending_dialogs[0]
|
||||
break
|
||||
await _asyncio.sleep(0.05)
|
||||
assert dialog is not None, "no dialog captured"
|
||||
assert dialog.bridge_request_id is not None, "expected bridge path"
|
||||
assert dialog.type == "prompt"
|
||||
|
||||
# Agent responds
|
||||
resp = sv.respond_to_dialog("accept", prompt_text="AGENT-SUPPLIED-REPLY")
|
||||
assert resp["ok"] is True
|
||||
|
||||
# Wait for nav to complete + read back
|
||||
try:
|
||||
await _asyncio.wait_for(nav_fut, timeout=10)
|
||||
except Exception:
|
||||
pass
|
||||
await _asyncio.sleep(0.5)
|
||||
r = await call(
|
||||
"Runtime.evaluate",
|
||||
{"expression": "window.__ret", "returnByValue": True},
|
||||
sid=sid,
|
||||
)
|
||||
return r.get("result", {}).get("result", {}).get("value")
|
||||
finally:
|
||||
rd.cancel()
|
||||
try: await rd
|
||||
except BaseException: pass
|
||||
|
||||
value = asyncio.run(nav_and_read())
|
||||
assert value == "AGENT-SUPPLIED-REPLY", f"expected AGENT-SUPPLIED-REPLY, got {value!r}"
|
||||
@@ -11,6 +11,8 @@ import pytest
|
||||
from tools.discord_tool import (
|
||||
DiscordAPIError,
|
||||
_ACTIONS,
|
||||
_ADMIN_ACTIONS,
|
||||
_CORE_ACTIONS,
|
||||
_available_actions,
|
||||
_build_schema,
|
||||
_channel_type_name,
|
||||
@@ -21,8 +23,11 @@ from tools.discord_tool import (
|
||||
_load_allowed_actions_config,
|
||||
_reset_capability_cache,
|
||||
check_discord_tool_requirements,
|
||||
discord_server,
|
||||
discord_admin_handler,
|
||||
discord_core,
|
||||
get_dynamic_schema,
|
||||
get_dynamic_schema_admin,
|
||||
get_dynamic_schema_core,
|
||||
)
|
||||
|
||||
|
||||
@@ -147,32 +152,32 @@ class TestDiscordRequest:
|
||||
class TestDiscordServerValidation:
|
||||
def test_no_token(self, monkeypatch):
|
||||
monkeypatch.delenv("DISCORD_BOT_TOKEN", raising=False)
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert "error" in result
|
||||
assert "DISCORD_BOT_TOKEN" in result["error"]
|
||||
|
||||
def test_unknown_action(self, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
result = json.loads(discord_server(action="bad_action"))
|
||||
result = json.loads(discord_core(action="bad_action"))
|
||||
assert "error" in result
|
||||
assert "Unknown action" in result["error"]
|
||||
assert "available_actions" in result
|
||||
|
||||
def test_missing_required_guild_id(self, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
result = json.loads(discord_server(action="list_channels"))
|
||||
result = json.loads(discord_admin_handler(action="list_channels"))
|
||||
assert "error" in result
|
||||
assert "guild_id" in result["error"]
|
||||
|
||||
def test_missing_required_channel_id(self, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
result = json.loads(discord_server(action="fetch_messages"))
|
||||
result = json.loads(discord_core(action="fetch_messages"))
|
||||
assert "error" in result
|
||||
assert "channel_id" in result["error"]
|
||||
|
||||
def test_missing_multiple_params(self, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
result = json.loads(discord_server(action="add_role"))
|
||||
result = json.loads(discord_admin_handler(action="add_role"))
|
||||
assert "error" in result
|
||||
assert "guild_id" in result["error"]
|
||||
assert "user_id" in result["error"]
|
||||
@@ -191,7 +196,7 @@ class TestListGuilds:
|
||||
{"id": "111", "name": "Test Server", "icon": "abc", "owner": True, "permissions": "123"},
|
||||
{"id": "222", "name": "Other Server", "icon": None, "owner": False, "permissions": "456"},
|
||||
]
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert result["count"] == 2
|
||||
assert result["guilds"][0]["name"] == "Test Server"
|
||||
assert result["guilds"][1]["id"] == "222"
|
||||
@@ -219,7 +224,7 @@ class TestServerInfo:
|
||||
"premium_subscription_count": 5,
|
||||
"verification_level": 1,
|
||||
}
|
||||
result = json.loads(discord_server(action="server_info", guild_id="111"))
|
||||
result = json.loads(discord_admin_handler(action="server_info", guild_id="111"))
|
||||
assert result["name"] == "My Server"
|
||||
assert result["member_count"] == 42
|
||||
assert result["online_count"] == 10
|
||||
@@ -242,7 +247,7 @@ class TestListChannels:
|
||||
{"id": "12", "name": "voice", "type": 2, "position": 1, "parent_id": "10", "topic": None, "nsfw": False},
|
||||
{"id": "13", "name": "no-category", "type": 0, "position": 0, "parent_id": None, "topic": None, "nsfw": False},
|
||||
]
|
||||
result = json.loads(discord_server(action="list_channels", guild_id="111"))
|
||||
result = json.loads(discord_admin_handler(action="list_channels", guild_id="111"))
|
||||
assert result["total_channels"] == 3 # excludes the category itself
|
||||
groups = result["channel_groups"]
|
||||
# Uncategorized first
|
||||
@@ -257,7 +262,7 @@ class TestListChannels:
|
||||
def test_empty_guild(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = []
|
||||
result = json.loads(discord_server(action="list_channels", guild_id="111"))
|
||||
result = json.loads(discord_admin_handler(action="list_channels", guild_id="111"))
|
||||
assert result["total_channels"] == 0
|
||||
|
||||
|
||||
@@ -274,7 +279,7 @@ class TestChannelInfo:
|
||||
"topic": "Welcome!", "nsfw": False, "position": 0,
|
||||
"parent_id": "10", "rate_limit_per_user": 0, "last_message_id": "999",
|
||||
}
|
||||
result = json.loads(discord_server(action="channel_info", channel_id="11"))
|
||||
result = json.loads(discord_admin_handler(action="channel_info", channel_id="11"))
|
||||
assert result["name"] == "general"
|
||||
assert result["type"] == "text"
|
||||
assert result["guild_id"] == "111"
|
||||
@@ -293,7 +298,7 @@ class TestListRoles:
|
||||
{"id": "2", "name": "Admin", "position": 2, "color": 16711680, "mentionable": True, "managed": False, "hoist": True},
|
||||
{"id": "3", "name": "Mod", "position": 1, "color": 255, "mentionable": True, "managed": False, "hoist": True},
|
||||
]
|
||||
result = json.loads(discord_server(action="list_roles", guild_id="111"))
|
||||
result = json.loads(discord_admin_handler(action="list_roles", guild_id="111"))
|
||||
assert result["count"] == 3
|
||||
# Should be sorted by position descending
|
||||
assert result["roles"][0]["name"] == "Admin"
|
||||
@@ -317,7 +322,7 @@ class TestMemberInfo:
|
||||
"joined_at": "2024-01-01T00:00:00Z",
|
||||
"premium_since": None,
|
||||
}
|
||||
result = json.loads(discord_server(action="member_info", guild_id="111", user_id="42"))
|
||||
result = json.loads(discord_admin_handler(action="member_info", guild_id="111", user_id="42"))
|
||||
assert result["username"] == "testuser"
|
||||
assert result["nickname"] == "Testy"
|
||||
assert result["roles"] == ["2", "3"]
|
||||
@@ -334,7 +339,7 @@ class TestSearchMembers:
|
||||
mock_req.return_value = [
|
||||
{"user": {"id": "42", "username": "testuser", "global_name": "Test", "bot": False}, "nick": None, "roles": []},
|
||||
]
|
||||
result = json.loads(discord_server(action="search_members", guild_id="111", query="test"))
|
||||
result = json.loads(discord_core(action="search_members", guild_id="111", query="test"))
|
||||
assert result["count"] == 1
|
||||
assert result["members"][0]["username"] == "testuser"
|
||||
mock_req.assert_called_once_with(
|
||||
@@ -346,7 +351,7 @@ class TestSearchMembers:
|
||||
def test_search_members_limit_capped(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = []
|
||||
discord_server(action="search_members", guild_id="111", query="x", limit=200)
|
||||
discord_core(action="search_members", guild_id="111", query="x", limit=200)
|
||||
call_params = mock_req.call_args[1]["params"]
|
||||
assert call_params["limit"] == "100" # Capped at 100
|
||||
|
||||
@@ -370,7 +375,7 @@ class TestFetchMessages:
|
||||
"pinned": False,
|
||||
},
|
||||
]
|
||||
result = json.loads(discord_server(action="fetch_messages", channel_id="11"))
|
||||
result = json.loads(discord_core(action="fetch_messages", channel_id="11"))
|
||||
assert result["count"] == 1
|
||||
assert result["messages"][0]["content"] == "Hello world"
|
||||
assert result["messages"][0]["author"]["username"] == "user1"
|
||||
@@ -379,7 +384,7 @@ class TestFetchMessages:
|
||||
def test_fetch_messages_with_pagination(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = []
|
||||
discord_server(action="fetch_messages", channel_id="11", before="999", limit=10)
|
||||
discord_core(action="fetch_messages", channel_id="11", before="999", limit=10)
|
||||
call_params = mock_req.call_args[1]["params"]
|
||||
assert call_params["before"] == "999"
|
||||
assert call_params["limit"] == "10"
|
||||
@@ -396,7 +401,7 @@ class TestListPins:
|
||||
mock_req.return_value = [
|
||||
{"id": "500", "content": "Important announcement", "author": {"username": "admin"}, "timestamp": "2024-01-01T00:00:00Z"},
|
||||
]
|
||||
result = json.loads(discord_server(action="list_pins", channel_id="11"))
|
||||
result = json.loads(discord_admin_handler(action="list_pins", channel_id="11"))
|
||||
assert result["count"] == 1
|
||||
assert result["pinned_messages"][0]["content"] == "Important announcement"
|
||||
|
||||
@@ -410,7 +415,7 @@ class TestPinUnpin:
|
||||
def test_pin_message(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = None # 204
|
||||
result = json.loads(discord_server(action="pin_message", channel_id="11", message_id="500"))
|
||||
result = json.loads(discord_admin_handler(action="pin_message", channel_id="11", message_id="500"))
|
||||
assert result["success"] is True
|
||||
mock_req.assert_called_once_with("PUT", "/channels/11/pins/500", "test-token")
|
||||
|
||||
@@ -418,7 +423,7 @@ class TestPinUnpin:
|
||||
def test_unpin_message(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = None
|
||||
result = json.loads(discord_server(action="unpin_message", channel_id="11", message_id="500"))
|
||||
result = json.loads(discord_admin_handler(action="unpin_message", channel_id="11", message_id="500"))
|
||||
assert result["success"] is True
|
||||
|
||||
|
||||
@@ -431,7 +436,7 @@ class TestCreateThread:
|
||||
def test_create_standalone_thread(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = {"id": "800", "name": "New Thread"}
|
||||
result = json.loads(discord_server(action="create_thread", channel_id="11", name="New Thread"))
|
||||
result = json.loads(discord_core(action="create_thread", channel_id="11", name="New Thread"))
|
||||
assert result["success"] is True
|
||||
assert result["thread_id"] == "800"
|
||||
# Verify the API call
|
||||
@@ -444,7 +449,7 @@ class TestCreateThread:
|
||||
def test_create_thread_from_message(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = {"id": "801", "name": "Discussion"}
|
||||
result = json.loads(discord_server(
|
||||
result = json.loads(discord_core(
|
||||
action="create_thread", channel_id="11", name="Discussion", message_id="1001",
|
||||
))
|
||||
assert result["success"] is True
|
||||
@@ -463,7 +468,7 @@ class TestRoleManagement:
|
||||
def test_add_role(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = None
|
||||
result = json.loads(discord_server(
|
||||
result = json.loads(discord_admin_handler(
|
||||
action="add_role", guild_id="111", user_id="42", role_id="2",
|
||||
))
|
||||
assert result["success"] is True
|
||||
@@ -475,7 +480,7 @@ class TestRoleManagement:
|
||||
def test_remove_role(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.return_value = None
|
||||
result = json.loads(discord_server(
|
||||
result = json.loads(discord_admin_handler(
|
||||
action="remove_role", guild_id="111", user_id="42", role_id="2",
|
||||
))
|
||||
assert result["success"] is True
|
||||
@@ -490,15 +495,23 @@ class TestErrorHandling:
|
||||
def test_api_error_handled(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.side_effect = DiscordAPIError(403, '{"message": "Missing Access"}')
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert "error" in result
|
||||
assert "403" in result["error"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_unexpected_error_handled(self, mock_req, monkeypatch):
|
||||
def test_unexpected_error_handled_admin(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.side_effect = RuntimeError("something broke")
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert "error" in result
|
||||
assert "something broke" in result["error"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_unexpected_error_handled_core(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "test-token")
|
||||
mock_req.side_effect = RuntimeError("something broke")
|
||||
result = json.loads(discord_core(action="fetch_messages", channel_id="11"))
|
||||
assert "error" in result
|
||||
assert "something broke" in result["error"]
|
||||
|
||||
@@ -508,79 +521,109 @@ class TestErrorHandling:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRegistration:
|
||||
def test_tool_registered(self):
|
||||
def test_core_tool_registered(self):
|
||||
from tools.registry import registry
|
||||
entry = registry._tools.get("discord_server")
|
||||
entry = registry._tools.get("discord")
|
||||
assert entry is not None
|
||||
assert entry.schema["name"] == "discord_server"
|
||||
assert entry.schema["name"] == "discord"
|
||||
assert entry.toolset == "discord"
|
||||
assert entry.check_fn is not None
|
||||
assert entry.requires_env == ["DISCORD_BOT_TOKEN"]
|
||||
|
||||
def test_schema_actions(self):
|
||||
"""Static schema should list all actions (the model_tools post-processing
|
||||
narrows this per-session; static registration is the superset)."""
|
||||
def test_admin_tool_registered(self):
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_server"]
|
||||
actions = entry.schema["parameters"]["properties"]["action"]["enum"]
|
||||
expected = [
|
||||
"list_guilds", "server_info", "list_channels", "channel_info",
|
||||
"list_roles", "member_info", "search_members", "fetch_messages",
|
||||
"list_pins", "pin_message", "unpin_message", "create_thread",
|
||||
"add_role", "remove_role",
|
||||
]
|
||||
assert set(actions) == set(expected)
|
||||
assert set(_ACTIONS.keys()) == set(expected)
|
||||
entry = registry._tools.get("discord_admin")
|
||||
assert entry is not None
|
||||
assert entry.schema["name"] == "discord_admin"
|
||||
assert entry.toolset == "discord_admin"
|
||||
assert entry.check_fn is not None
|
||||
assert entry.requires_env == ["DISCORD_BOT_TOKEN"]
|
||||
|
||||
def test_core_schema_actions(self):
|
||||
"""Core static schema should list only core actions."""
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord"]
|
||||
actions = set(entry.schema["parameters"]["properties"]["action"]["enum"])
|
||||
assert actions == {"fetch_messages", "search_members", "create_thread"}
|
||||
|
||||
def test_admin_schema_actions(self):
|
||||
"""Admin static schema should list only admin actions."""
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_admin"]
|
||||
actions = set(entry.schema["parameters"]["properties"]["action"]["enum"])
|
||||
expected_admin = set(_ACTIONS.keys()) - {"fetch_messages", "search_members", "create_thread"}
|
||||
assert actions == expected_admin
|
||||
|
||||
def test_all_actions_covered(self):
|
||||
"""Core + admin actions should cover all known actions."""
|
||||
assert set(_CORE_ACTIONS.keys()) | set(_ADMIN_ACTIONS.keys()) == set(_ACTIONS.keys())
|
||||
assert set(_CORE_ACTIONS.keys()) & set(_ADMIN_ACTIONS.keys()) == set()
|
||||
|
||||
def test_schema_parameter_bounds(self):
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_server"]
|
||||
entry = registry._tools["discord"]
|
||||
props = entry.schema["parameters"]["properties"]
|
||||
assert props["limit"]["minimum"] == 1
|
||||
assert props["limit"]["maximum"] == 100
|
||||
assert props["auto_archive_duration"]["enum"] == [60, 1440, 4320, 10080]
|
||||
|
||||
def test_schema_description_is_action_manifest(self):
|
||||
"""The top-level description should include the action manifest
|
||||
(one-line signatures per action) so the model can find required
|
||||
params without re-reading every parameter description."""
|
||||
def test_core_schema_description(self):
|
||||
"""Core schema description should mention core actions."""
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_server"]
|
||||
entry = registry._tools["discord"]
|
||||
desc = entry.schema["description"]
|
||||
# Spot-check a few entries
|
||||
assert "list_guilds()" in desc
|
||||
assert "fetch_messages(channel_id)" in desc
|
||||
assert "search_members(guild_id, query)" in desc
|
||||
assert "create_thread(channel_id, name)" in desc
|
||||
# Admin actions should NOT be in core description
|
||||
assert "list_guilds()" not in desc
|
||||
assert "add_role(" not in desc
|
||||
|
||||
def test_admin_schema_description(self):
|
||||
"""Admin schema description should mention admin actions."""
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_admin"]
|
||||
desc = entry.schema["description"]
|
||||
assert "list_guilds()" in desc
|
||||
assert "add_role(guild_id, user_id, role_id)" in desc
|
||||
# Core actions should NOT be in admin description
|
||||
assert "fetch_messages(" not in desc
|
||||
assert "create_thread(" not in desc
|
||||
|
||||
def test_handler_callable(self):
|
||||
from tools.registry import registry
|
||||
entry = registry._tools["discord_server"]
|
||||
entry = registry._tools["discord"]
|
||||
assert callable(entry.handler)
|
||||
entry_admin = registry._tools["discord_admin"]
|
||||
assert callable(entry_admin.handler)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Toolset: discord_server only in hermes-discord
|
||||
# Toolset: discord / discord_admin only in hermes-discord
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestToolsetInclusion:
|
||||
def test_discord_server_in_hermes_discord_toolset(self):
|
||||
def test_discord_tools_in_hermes_discord_toolset(self):
|
||||
from toolsets import TOOLSETS
|
||||
assert "discord_server" in TOOLSETS["hermes-discord"]["tools"]
|
||||
assert "discord" in TOOLSETS["hermes-discord"]["tools"]
|
||||
assert "discord_admin" in TOOLSETS["hermes-discord"]["tools"]
|
||||
|
||||
def test_discord_server_not_in_core_tools(self):
|
||||
def test_discord_tools_not_in_core_tools(self):
|
||||
from toolsets import _HERMES_CORE_TOOLS
|
||||
assert "discord_server" not in _HERMES_CORE_TOOLS
|
||||
assert "discord" not in _HERMES_CORE_TOOLS
|
||||
assert "discord_admin" not in _HERMES_CORE_TOOLS
|
||||
|
||||
def test_discord_server_not_in_other_toolsets(self):
|
||||
def test_discord_tools_not_in_other_toolsets(self):
|
||||
from toolsets import TOOLSETS
|
||||
for name, ts in TOOLSETS.items():
|
||||
if name == "hermes-discord":
|
||||
if name in ("hermes-discord", "hermes-gateway", "discord", "discord_admin"):
|
||||
continue
|
||||
# The gateway toolset might include it if it unions all platform tools
|
||||
if name == "hermes-gateway":
|
||||
continue
|
||||
assert "discord_server" not in ts.get("tools", []), (
|
||||
f"discord_server should not be in toolset '{name}'"
|
||||
tools = ts.get("tools", [])
|
||||
assert "discord" not in tools or name == "discord", (
|
||||
f"discord tool should not be in toolset '{name}'"
|
||||
)
|
||||
assert "discord_admin" not in tools or name == "discord_admin", (
|
||||
f"discord_admin tool should not be in toolset '{name}'"
|
||||
)
|
||||
|
||||
|
||||
@@ -798,40 +841,69 @@ class TestDynamicSchema:
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_no_token_returns_none(self, mock_req, monkeypatch):
|
||||
monkeypatch.delenv("DISCORD_BOT_TOKEN", raising=False)
|
||||
assert get_dynamic_schema() is None
|
||||
assert get_dynamic_schema_core() is None
|
||||
assert get_dynamic_schema_admin() is None
|
||||
mock_req.assert_not_called()
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_full_intents_full_schema(self, mock_req, monkeypatch):
|
||||
def test_full_intents_core_schema(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": (1 << 14) | (1 << 18)}
|
||||
schema = get_dynamic_schema()
|
||||
actions = schema["parameters"]["properties"]["action"]["enum"]
|
||||
assert set(actions) == set(_ACTIONS.keys())
|
||||
# No content warning
|
||||
schema = get_dynamic_schema_core()
|
||||
actions = set(schema["parameters"]["properties"]["action"]["enum"])
|
||||
assert actions == set(_CORE_ACTIONS.keys())
|
||||
assert schema["name"] == "discord"
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_full_intents_admin_schema(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": (1 << 14) | (1 << 18)}
|
||||
schema = get_dynamic_schema_admin()
|
||||
actions = set(schema["parameters"]["properties"]["action"]["enum"])
|
||||
assert actions == set(_ADMIN_ACTIONS.keys())
|
||||
assert schema["name"] == "discord_admin"
|
||||
# No content warning when MESSAGE_CONTENT is enabled
|
||||
assert "MESSAGE_CONTENT" not in schema["description"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_no_members_intent_removes_member_actions_from_schema(
|
||||
def test_no_members_intent_removes_member_actions_from_admin_schema(
|
||||
self, mock_req, monkeypatch,
|
||||
):
|
||||
"""member_info is an admin action; it should be hidden when
|
||||
GUILD_MEMBERS intent is missing."""
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": 1 << 18} # only MESSAGE_CONTENT
|
||||
schema = get_dynamic_schema()
|
||||
schema = get_dynamic_schema_admin()
|
||||
actions = schema["parameters"]["properties"]["action"]["enum"]
|
||||
assert "member_info" not in actions
|
||||
assert "member_info" not in schema["description"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_no_members_intent_hides_search_members_from_core(
|
||||
self, mock_req, monkeypatch,
|
||||
):
|
||||
"""search_members is a core action gated by GUILD_MEMBERS intent."""
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": 1 << 18} # only MESSAGE_CONTENT
|
||||
schema = get_dynamic_schema_core()
|
||||
actions = schema["parameters"]["properties"]["action"]["enum"]
|
||||
assert "search_members" not in actions
|
||||
assert "member_info" not in actions
|
||||
# Manifest description should also not advertise them
|
||||
assert "search_members" not in schema["description"]
|
||||
assert "member_info" not in schema["description"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_no_message_content_adds_warning_note(self, mock_req, monkeypatch):
|
||||
@@ -841,41 +913,53 @@ class TestDynamicSchema:
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": 1 << 14} # only GUILD_MEMBERS
|
||||
schema = get_dynamic_schema()
|
||||
schema = get_dynamic_schema_core()
|
||||
assert "MESSAGE_CONTENT" in schema["description"]
|
||||
# But fetch_messages is still available
|
||||
actions = schema["parameters"]["properties"]["action"]["enum"]
|
||||
assert "fetch_messages" in actions
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_config_allowlist_narrows_schema(self, mock_req, monkeypatch):
|
||||
def test_config_allowlist_narrows_admin_schema(self, mock_req, monkeypatch):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": "list_guilds,list_channels"}},
|
||||
)
|
||||
mock_req.return_value = {"flags": (1 << 14) | (1 << 18)}
|
||||
schema = get_dynamic_schema()
|
||||
schema = get_dynamic_schema_admin()
|
||||
actions = schema["parameters"]["properties"]["action"]["enum"]
|
||||
assert actions == ["list_guilds", "list_channels"]
|
||||
# Manifest description should only show allowed ones (check for
|
||||
# the signature marker, which is specific to manifest lines)
|
||||
assert "list_guilds()" in schema["description"]
|
||||
assert "add_role(" not in schema["description"]
|
||||
assert "create_thread(" not in schema["description"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_empty_allowlist_with_valid_values_hides_tool(self, mock_req, monkeypatch):
|
||||
def test_empty_allowlist_with_valid_values_hides_tools(self, mock_req, monkeypatch):
|
||||
"""If the allowlist resolves to zero valid actions (e.g. all names
|
||||
were typos), get_dynamic_schema returns None so the tool is dropped
|
||||
entirely rather than showing an empty enum."""
|
||||
were typos), get_dynamic_schema returns None so the tool is dropped."""
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": "typo_one,typo_two"}},
|
||||
)
|
||||
mock_req.return_value = {"flags": (1 << 14) | (1 << 18)}
|
||||
assert get_dynamic_schema() is None
|
||||
assert get_dynamic_schema_core() is None
|
||||
assert get_dynamic_schema_admin() is None
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_backward_compat_wrapper(self, mock_req, monkeypatch):
|
||||
"""get_dynamic_schema() should delegate to get_dynamic_schema_core()."""
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.return_value = {"flags": (1 << 14) | (1 << 18)}
|
||||
schema = get_dynamic_schema()
|
||||
assert schema is not None
|
||||
assert schema["name"] == "discord"
|
||||
actions = set(schema["parameters"]["properties"]["action"]["enum"])
|
||||
assert actions == set(_CORE_ACTIONS.keys())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -890,7 +974,7 @@ class TestRuntimeAllowlistEnforcement:
|
||||
"hermes_cli.config.load_config",
|
||||
lambda: {"discord": {"server_actions": "list_guilds"}},
|
||||
)
|
||||
result = json.loads(discord_server(action="add_role", guild_id="1", user_id="2", role_id="3"))
|
||||
result = json.loads(discord_admin_handler(action="add_role", guild_id="1", user_id="2", role_id="3"))
|
||||
assert "error" in result
|
||||
assert "disabled by config" in result["error"]
|
||||
mock_req.assert_not_called()
|
||||
@@ -903,7 +987,7 @@ class TestRuntimeAllowlistEnforcement:
|
||||
lambda: {"discord": {"server_actions": "list_guilds"}},
|
||||
)
|
||||
mock_req.return_value = []
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert "guilds" in result
|
||||
|
||||
|
||||
@@ -930,7 +1014,7 @@ class Test403Enrichment:
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.side_effect = DiscordAPIError(403, '{"message":"Missing Permissions"}')
|
||||
result = json.loads(discord_server(
|
||||
result = json.loads(discord_admin_handler(
|
||||
action="add_role", guild_id="1", user_id="2", role_id="3",
|
||||
))
|
||||
assert "error" in result
|
||||
@@ -944,7 +1028,7 @@ class Test403Enrichment:
|
||||
lambda: {"discord": {"server_actions": ""}},
|
||||
)
|
||||
mock_req.side_effect = DiscordAPIError(500, "server error")
|
||||
result = json.loads(discord_server(action="list_guilds"))
|
||||
result = json.loads(discord_admin_handler(action="list_guilds"))
|
||||
assert "500" in result["error"]
|
||||
assert "MANAGE_ROLES" not in result["error"]
|
||||
|
||||
@@ -961,10 +1045,10 @@ class TestModelToolsIntegration:
|
||||
_reset_capability_cache()
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_discord_server_schema_rebuilt_by_get_tool_definitions(
|
||||
def test_discord_admin_schema_rebuilt_by_get_tool_definitions(
|
||||
self, mock_req, monkeypatch,
|
||||
):
|
||||
"""When model_tools.get_tool_definitions runs with discord_server
|
||||
"""When model_tools.get_tool_definitions runs with discord_admin
|
||||
available, it should replace the static schema with the dynamic one."""
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
monkeypatch.setattr(
|
||||
@@ -976,16 +1060,16 @@ class TestModelToolsIntegration:
|
||||
|
||||
from model_tools import get_tool_definitions
|
||||
tools = get_tool_definitions(enabled_toolsets=["hermes-discord"], quiet_mode=True)
|
||||
discord_tool = next(
|
||||
(t for t in tools if t.get("function", {}).get("name") == "discord_server"),
|
||||
discord_admin_tool = next(
|
||||
(t for t in tools if t.get("function", {}).get("name") == "discord_admin"),
|
||||
None,
|
||||
)
|
||||
assert discord_tool is not None, "discord_server should be in the schema"
|
||||
actions = discord_tool["function"]["parameters"]["properties"]["action"]["enum"]
|
||||
assert discord_admin_tool is not None, "discord_admin should be in the schema"
|
||||
actions = discord_admin_tool["function"]["parameters"]["properties"]["action"]["enum"]
|
||||
assert actions == ["list_guilds", "server_info"]
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_discord_server_dropped_when_allowlist_empties_it(
|
||||
def test_discord_tools_dropped_when_allowlist_empties_them(
|
||||
self, mock_req, monkeypatch,
|
||||
):
|
||||
monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok")
|
||||
@@ -998,4 +1082,6 @@ class TestModelToolsIntegration:
|
||||
from model_tools import get_tool_definitions
|
||||
tools = get_tool_definitions(enabled_toolsets=["hermes-discord"], quiet_mode=True)
|
||||
names = [t.get("function", {}).get("name") for t in tools]
|
||||
assert "discord" not in names
|
||||
assert "discord_admin" not in names
|
||||
assert "discord_server" not in names
|
||||
|
||||
@@ -120,6 +120,177 @@ class TestSchemaConversion:
|
||||
|
||||
assert schema["parameters"] == {"type": "object", "properties": {}}
|
||||
|
||||
def test_definitions_refs_are_rewritten_to_defs(self):
|
||||
from tools.mcp_tool import _convert_mcp_schema
|
||||
|
||||
mcp_tool = _make_mcp_tool(
|
||||
name="submit",
|
||||
description="Submit a payload",
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input": {"$ref": "#/definitions/Payload"},
|
||||
},
|
||||
"required": ["input"],
|
||||
"definitions": {
|
||||
"Payload": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string"},
|
||||
},
|
||||
"required": ["query"],
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
schema = _convert_mcp_schema("forms", mcp_tool)
|
||||
|
||||
assert schema["parameters"]["properties"]["input"]["$ref"] == "#/$defs/Payload"
|
||||
assert "$defs" in schema["parameters"]
|
||||
assert "definitions" not in schema["parameters"]
|
||||
|
||||
def test_nested_definition_refs_are_rewritten_recursively(self):
|
||||
from tools.mcp_tool import _convert_mcp_schema
|
||||
|
||||
mcp_tool = _make_mcp_tool(
|
||||
name="nested",
|
||||
description="Nested schema",
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {"$ref": "#/definitions/Entry"},
|
||||
},
|
||||
},
|
||||
"definitions": {
|
||||
"Entry": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"child": {"$ref": "#/definitions/Child"},
|
||||
},
|
||||
},
|
||||
"Child": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": {"type": "string"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
schema = _convert_mcp_schema("forms", mcp_tool)
|
||||
|
||||
assert schema["parameters"]["properties"]["items"]["items"]["$ref"] == "#/$defs/Entry"
|
||||
assert schema["parameters"]["$defs"]["Entry"]["properties"]["child"]["$ref"] == "#/$defs/Child"
|
||||
|
||||
def test_missing_type_on_object_is_coerced(self):
|
||||
"""Schemas that describe an object but omit ``type`` get type='object'."""
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"properties": {"q": {"type": "string"}},
|
||||
"required": ["q"],
|
||||
})
|
||||
|
||||
assert schema["type"] == "object"
|
||||
assert schema["properties"]["q"]["type"] == "string"
|
||||
assert schema["required"] == ["q"]
|
||||
|
||||
def test_null_type_on_object_is_coerced(self):
|
||||
"""type: None should be treated like missing type (common MCP server bug)."""
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"type": None,
|
||||
"properties": {"x": {"type": "integer"}},
|
||||
})
|
||||
|
||||
assert schema["type"] == "object"
|
||||
|
||||
def test_required_pruned_when_property_missing(self):
|
||||
"""Gemini 400s on required names that don't exist in properties."""
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"type": "object",
|
||||
"properties": {"a": {"type": "string"}},
|
||||
"required": ["a", "ghost", "phantom"],
|
||||
})
|
||||
|
||||
assert schema["required"] == ["a"]
|
||||
|
||||
def test_required_removed_when_all_names_dangle(self):
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": ["ghost"],
|
||||
})
|
||||
|
||||
assert "required" not in schema
|
||||
|
||||
def test_required_pruning_applies_recursively_inside_nested_objects(self):
|
||||
"""Nested object schemas also get required pruning."""
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filter": {
|
||||
"type": "object",
|
||||
"properties": {"field": {"type": "string"}},
|
||||
"required": ["field", "missing"],
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
assert schema["properties"]["filter"]["required"] == ["field"]
|
||||
|
||||
def test_object_in_array_items_gets_properties_filled(self):
|
||||
"""Array-item object schemas without properties get an empty dict."""
|
||||
from tools.mcp_tool import _normalize_mcp_input_schema
|
||||
|
||||
schema = _normalize_mcp_input_schema({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {"type": "object"},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
assert schema["properties"]["items"]["items"]["properties"] == {}
|
||||
|
||||
def test_convert_mcp_schema_survives_missing_inputschema_attribute(self):
|
||||
"""A Tool object without .inputSchema must not crash registration."""
|
||||
import types
|
||||
|
||||
from tools.mcp_tool import _convert_mcp_schema
|
||||
|
||||
bare_tool = types.SimpleNamespace(name="probe", description="Probe")
|
||||
schema = _convert_mcp_schema("srv", bare_tool)
|
||||
|
||||
assert schema["name"] == "mcp_srv_probe"
|
||||
assert schema["parameters"] == {"type": "object", "properties": {}}
|
||||
|
||||
def test_convert_mcp_schema_with_none_inputschema(self):
|
||||
"""Tool with inputSchema=None produces a valid empty object schema."""
|
||||
import types
|
||||
|
||||
from tools.mcp_tool import _convert_mcp_schema
|
||||
|
||||
# Note: _make_mcp_tool(input_schema=None) falls back to a default —
|
||||
# build the namespace directly so .inputSchema really is None.
|
||||
mcp_tool = types.SimpleNamespace(name="probe", description="Probe", inputSchema=None)
|
||||
schema = _convert_mcp_schema("srv", mcp_tool)
|
||||
|
||||
assert schema["parameters"] == {"type": "object", "properties": {}}
|
||||
|
||||
def test_tool_name_prefix_format(self):
|
||||
from tools.mcp_tool import _convert_mcp_schema
|
||||
|
||||
@@ -1029,6 +1200,92 @@ class TestHTTPConfig:
|
||||
|
||||
asyncio.run(_test())
|
||||
|
||||
def test_http_seeds_initial_protocol_header(self):
|
||||
from tools.mcp_tool import LATEST_PROTOCOL_VERSION, MCPServerTask
|
||||
|
||||
server = MCPServerTask("remote")
|
||||
captured = {}
|
||||
|
||||
class DummyAsyncClient:
|
||||
def __init__(self, **kwargs):
|
||||
captured.update(kwargs)
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
class DummyTransportCtx:
|
||||
async def __aenter__(self):
|
||||
return MagicMock(), MagicMock(), (lambda: None)
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
class DummySession:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
async def initialize(self):
|
||||
return None
|
||||
|
||||
class DummyLegacyTransportCtx:
|
||||
def __init__(self, **kwargs):
|
||||
captured["legacy_headers"] = kwargs.get("headers")
|
||||
|
||||
async def __aenter__(self):
|
||||
return MagicMock(), MagicMock(), (lambda: None)
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
async def _discover_tools(self):
|
||||
self._shutdown_event.set()
|
||||
|
||||
async def _run(config, *, new_http):
|
||||
captured.clear()
|
||||
with patch("tools.mcp_tool._MCP_HTTP_AVAILABLE", True), \
|
||||
patch("tools.mcp_tool._MCP_NEW_HTTP", new_http), \
|
||||
patch("httpx.AsyncClient", DummyAsyncClient), \
|
||||
patch("tools.mcp_tool.streamable_http_client", return_value=DummyTransportCtx()), \
|
||||
patch("tools.mcp_tool.streamablehttp_client", side_effect=lambda url, **kwargs: DummyLegacyTransportCtx(**kwargs)), \
|
||||
patch("tools.mcp_tool.ClientSession", DummySession), \
|
||||
patch.object(MCPServerTask, "_discover_tools", _discover_tools):
|
||||
await server._run_http(config)
|
||||
|
||||
asyncio.run(_run({"url": "https://example.com/mcp"}, new_http=True))
|
||||
assert captured["headers"]["mcp-protocol-version"] == LATEST_PROTOCOL_VERSION
|
||||
|
||||
asyncio.run(_run({
|
||||
"url": "https://example.com/mcp",
|
||||
"headers": {"mcp-protocol-version": "custom-version"},
|
||||
}, new_http=True))
|
||||
assert captured["headers"]["mcp-protocol-version"] == "custom-version"
|
||||
|
||||
asyncio.run(_run({
|
||||
"url": "https://example.com/mcp",
|
||||
"headers": {"MCP-Protocol-Version": "custom-version"},
|
||||
}, new_http=True))
|
||||
assert captured["headers"]["MCP-Protocol-Version"] == "custom-version"
|
||||
assert "mcp-protocol-version" not in captured["headers"]
|
||||
|
||||
asyncio.run(_run({"url": "https://example.com/mcp"}, new_http=False))
|
||||
assert captured["legacy_headers"]["mcp-protocol-version"] == LATEST_PROTOCOL_VERSION
|
||||
|
||||
asyncio.run(_run({
|
||||
"url": "https://example.com/mcp",
|
||||
"headers": {"MCP-Protocol-Version": "custom-version"},
|
||||
}, new_http=False))
|
||||
assert captured["legacy_headers"]["MCP-Protocol-Version"] == "custom-version"
|
||||
assert "mcp-protocol-version" not in captured["legacy_headers"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reconnection logic
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
"""Tests for tools.tool_output_limits.
|
||||
|
||||
Covers:
|
||||
1. Default values when no config is provided.
|
||||
2. Config override picks up user-supplied max_bytes / max_lines /
|
||||
max_line_length.
|
||||
3. Malformed values (None, negative, wrong type) fall back to defaults
|
||||
rather than raising.
|
||||
4. Integration: the helpers return what the terminal_tool and
|
||||
file_operations call paths will actually consume.
|
||||
|
||||
Port-tracking: anomalyco/opencode PR #23770
|
||||
(feat(truncate): allow configuring tool output truncation limits).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools import tool_output_limits as tol
|
||||
|
||||
|
||||
class TestDefaults:
|
||||
def test_defaults_match_previous_hardcoded_values(self):
|
||||
assert tol.DEFAULT_MAX_BYTES == 50_000
|
||||
assert tol.DEFAULT_MAX_LINES == 2000
|
||||
assert tol.DEFAULT_MAX_LINE_LENGTH == 2000
|
||||
|
||||
def test_get_limits_returns_defaults_when_config_missing(self):
|
||||
with patch("hermes_cli.config.load_config", return_value={}):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits == {
|
||||
"max_bytes": tol.DEFAULT_MAX_BYTES,
|
||||
"max_lines": tol.DEFAULT_MAX_LINES,
|
||||
"max_line_length": tol.DEFAULT_MAX_LINE_LENGTH,
|
||||
}
|
||||
|
||||
def test_get_limits_returns_defaults_when_config_not_a_dict(self):
|
||||
# load_config should always return a dict but be defensive anyway.
|
||||
with patch("hermes_cli.config.load_config", return_value="not a dict"):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_bytes"] == tol.DEFAULT_MAX_BYTES
|
||||
|
||||
def test_get_limits_returns_defaults_when_load_config_raises(self):
|
||||
def _boom():
|
||||
raise RuntimeError("boom")
|
||||
|
||||
with patch("hermes_cli.config.load_config", side_effect=_boom):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_lines"] == tol.DEFAULT_MAX_LINES
|
||||
|
||||
|
||||
class TestOverrides:
|
||||
def test_user_config_overrides_all_three(self):
|
||||
cfg = {
|
||||
"tool_output": {
|
||||
"max_bytes": 100_000,
|
||||
"max_lines": 5000,
|
||||
"max_line_length": 4096,
|
||||
}
|
||||
}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits == {
|
||||
"max_bytes": 100_000,
|
||||
"max_lines": 5000,
|
||||
"max_line_length": 4096,
|
||||
}
|
||||
|
||||
def test_partial_override_preserves_other_defaults(self):
|
||||
cfg = {"tool_output": {"max_bytes": 200_000}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_bytes"] == 200_000
|
||||
assert limits["max_lines"] == tol.DEFAULT_MAX_LINES
|
||||
assert limits["max_line_length"] == tol.DEFAULT_MAX_LINE_LENGTH
|
||||
|
||||
def test_section_not_a_dict_falls_back(self):
|
||||
cfg = {"tool_output": "nonsense"}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_bytes"] == tol.DEFAULT_MAX_BYTES
|
||||
|
||||
|
||||
class TestCoercion:
|
||||
@pytest.mark.parametrize("bad", [None, "not a number", -1, 0, [], {}])
|
||||
def test_invalid_values_fall_back_to_defaults(self, bad):
|
||||
cfg = {"tool_output": {"max_bytes": bad, "max_lines": bad, "max_line_length": bad}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_bytes"] == tol.DEFAULT_MAX_BYTES
|
||||
assert limits["max_lines"] == tol.DEFAULT_MAX_LINES
|
||||
assert limits["max_line_length"] == tol.DEFAULT_MAX_LINE_LENGTH
|
||||
|
||||
def test_string_integer_is_coerced(self):
|
||||
cfg = {"tool_output": {"max_bytes": "75000"}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
limits = tol.get_tool_output_limits()
|
||||
assert limits["max_bytes"] == 75_000
|
||||
|
||||
|
||||
class TestShortcuts:
|
||||
def test_individual_accessors_delegate_to_get_tool_output_limits(self):
|
||||
cfg = {
|
||||
"tool_output": {
|
||||
"max_bytes": 111,
|
||||
"max_lines": 222,
|
||||
"max_line_length": 333,
|
||||
}
|
||||
}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
assert tol.get_max_bytes() == 111
|
||||
assert tol.get_max_lines() == 222
|
||||
assert tol.get_max_line_length() == 333
|
||||
|
||||
|
||||
class TestDefaultConfigHasSection:
|
||||
"""The DEFAULT_CONFIG in hermes_cli.config must expose tool_output so
|
||||
that ``hermes setup`` and default installs stay in sync with the
|
||||
helpers here."""
|
||||
|
||||
def test_default_config_contains_tool_output_section(self):
|
||||
from hermes_cli.config import DEFAULT_CONFIG
|
||||
assert "tool_output" in DEFAULT_CONFIG
|
||||
section = DEFAULT_CONFIG["tool_output"]
|
||||
assert isinstance(section, dict)
|
||||
assert section["max_bytes"] == tol.DEFAULT_MAX_BYTES
|
||||
assert section["max_lines"] == tol.DEFAULT_MAX_LINES
|
||||
assert section["max_line_length"] == tol.DEFAULT_MAX_LINE_LENGTH
|
||||
|
||||
|
||||
class TestIntegrationReadPagination:
|
||||
"""normalize_read_pagination uses get_max_lines() — verify the plumbing."""
|
||||
|
||||
def test_pagination_limit_clamped_by_config_value(self):
|
||||
from tools.file_operations import normalize_read_pagination
|
||||
cfg = {"tool_output": {"max_lines": 50}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
offset, limit = normalize_read_pagination(offset=1, limit=1000)
|
||||
# limit should have been clamped to 50 (the configured max_lines)
|
||||
assert limit == 50
|
||||
assert offset == 1
|
||||
|
||||
def test_pagination_default_when_config_missing(self):
|
||||
from tools.file_operations import normalize_read_pagination
|
||||
with patch("hermes_cli.config.load_config", return_value={}):
|
||||
offset, limit = normalize_read_pagination(offset=10, limit=100000)
|
||||
# Clamped to default MAX_LINES (2000).
|
||||
assert limit == tol.DEFAULT_MAX_LINES
|
||||
assert offset == 10
|
||||
+157
-10
@@ -188,10 +188,116 @@ async def _cdp_call(
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _browser_cdp_via_supervisor(
|
||||
task_id: str,
|
||||
frame_id: str,
|
||||
method: str,
|
||||
params: Optional[Dict[str, Any]],
|
||||
timeout: float,
|
||||
) -> str:
|
||||
"""Route a CDP call through the live supervisor session for an OOPIF frame.
|
||||
|
||||
Looks up the frame in the supervisor's snapshot, extracts its child
|
||||
``cdp_session_id``, and dispatches ``method`` with that sessionId via
|
||||
the supervisor's already-connected WebSocket (using
|
||||
``asyncio.run_coroutine_threadsafe`` onto the supervisor loop).
|
||||
"""
|
||||
try:
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY # type: ignore[import-not-found]
|
||||
except Exception as exc: # pragma: no cover — defensive
|
||||
return tool_error(
|
||||
f"CDP supervisor is not available: {exc}. frame_id routing requires "
|
||||
f"a running supervisor attached via /browser connect or an active "
|
||||
f"Browserbase session."
|
||||
)
|
||||
|
||||
supervisor = SUPERVISOR_REGISTRY.get(task_id)
|
||||
if supervisor is None:
|
||||
return tool_error(
|
||||
f"No CDP supervisor is attached for task={task_id!r}. Call "
|
||||
f"browser_navigate or /browser connect first so the supervisor "
|
||||
f"can attach. Once attached, browser_snapshot will populate "
|
||||
f"frame_tree with frame_ids you can pass here."
|
||||
)
|
||||
|
||||
snap = supervisor.snapshot()
|
||||
# Search both the top frame and the children for the requested id.
|
||||
top = snap.frame_tree.get("top")
|
||||
frame_info: Optional[Dict[str, Any]] = None
|
||||
if top and top.get("frame_id") == frame_id:
|
||||
frame_info = top
|
||||
else:
|
||||
for child in snap.frame_tree.get("children", []) or []:
|
||||
if child.get("frame_id") == frame_id:
|
||||
frame_info = child
|
||||
break
|
||||
if frame_info is None:
|
||||
# Check the raw frames dict too (frame_tree is capped at 30 entries)
|
||||
with supervisor._state_lock: # type: ignore[attr-defined]
|
||||
raw = supervisor._frames.get(frame_id) # type: ignore[attr-defined]
|
||||
if raw is not None:
|
||||
frame_info = raw.to_dict()
|
||||
|
||||
if frame_info is None:
|
||||
return tool_error(
|
||||
f"frame_id {frame_id!r} not found in supervisor state. "
|
||||
f"Call browser_snapshot to see current frame_tree."
|
||||
)
|
||||
|
||||
child_sid = frame_info.get("session_id")
|
||||
if not child_sid:
|
||||
# Not an OOPIF — fall back to top-level session (evaluating at page
|
||||
# scope). Same-origin iframes don't get their own sessionId; the
|
||||
# agent can still use contentWindow/contentDocument from the parent.
|
||||
return tool_error(
|
||||
f"frame_id {frame_id!r} is not an out-of-process iframe (no "
|
||||
f"dedicated CDP session). For same-origin iframes, use "
|
||||
f"`browser_cdp(method='Runtime.evaluate', params={{'expression': "
|
||||
f"\"document.querySelector('iframe').contentDocument.title\"}})` "
|
||||
f"at the top-level page instead."
|
||||
)
|
||||
|
||||
# Dispatch onto the supervisor's loop.
|
||||
import asyncio as _asyncio
|
||||
loop = supervisor._loop # type: ignore[attr-defined]
|
||||
if loop is None or not loop.is_running():
|
||||
return tool_error(
|
||||
"CDP supervisor loop is not running. Try reconnecting with "
|
||||
"/browser connect."
|
||||
)
|
||||
|
||||
async def _do_cdp():
|
||||
return await supervisor._cdp( # type: ignore[attr-defined]
|
||||
method,
|
||||
params or {},
|
||||
session_id=child_sid,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
try:
|
||||
fut = _asyncio.run_coroutine_threadsafe(_do_cdp(), loop)
|
||||
result_msg = fut.result(timeout=timeout + 2)
|
||||
except Exception as exc:
|
||||
return tool_error(
|
||||
f"CDP call via supervisor failed: {type(exc).__name__}: {exc}",
|
||||
cdp_docs=CDP_DOCS_URL,
|
||||
)
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"success": True,
|
||||
"method": method,
|
||||
"frame_id": frame_id,
|
||||
"session_id": child_sid,
|
||||
"result": result_msg.get("result", {}),
|
||||
}
|
||||
return json.dumps(payload, ensure_ascii=False)
|
||||
|
||||
|
||||
def browser_cdp(
|
||||
method: str,
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
target_id: Optional[str] = None,
|
||||
frame_id: Optional[str] = None,
|
||||
timeout: float = 30.0,
|
||||
task_id: Optional[str] = None,
|
||||
) -> str:
|
||||
@@ -202,16 +308,34 @@ def browser_cdp(
|
||||
params: Method-specific parameters; defaults to ``{}``.
|
||||
target_id: Optional target/tab ID for page-level methods. When set,
|
||||
we first attach to the target (``flatten=True``) and send
|
||||
``method`` with the resulting ``sessionId``.
|
||||
``method`` with the resulting ``sessionId``. Uses a fresh
|
||||
stateless CDP connection.
|
||||
frame_id: Optional cross-origin (OOPIF) iframe ``frame_id`` from
|
||||
``browser_snapshot.frame_tree.children[]``. When set (and the
|
||||
frame is an OOPIF with a live session tracked by the CDP
|
||||
supervisor), routes the call through the supervisor's existing
|
||||
WebSocket — which is how you Runtime.evaluate *inside* an
|
||||
iframe on backends where per-call fresh CDP connections would
|
||||
hit signed-URL expiry (Browserbase) or expensive reattach.
|
||||
timeout: Seconds to wait for the call to complete.
|
||||
task_id: Unused (tool is stateless) — accepted for uniformity with
|
||||
other browser tools.
|
||||
task_id: Task identifier for supervisor lookup. When ``frame_id``
|
||||
is set, this identifies which task's supervisor to use; the
|
||||
handler will default to ``"default"`` otherwise.
|
||||
|
||||
Returns:
|
||||
JSON string ``{"success": True, "method": ..., "result": {...}}`` on
|
||||
success, or ``{"error": "..."}`` on failure.
|
||||
"""
|
||||
del task_id # unused — stateless
|
||||
# --- Route iframe-scoped calls through the supervisor ---------------
|
||||
if frame_id:
|
||||
return _browser_cdp_via_supervisor(
|
||||
task_id=task_id or "default",
|
||||
frame_id=frame_id,
|
||||
method=method,
|
||||
params=params,
|
||||
timeout=timeout,
|
||||
)
|
||||
del task_id # stateless path below
|
||||
|
||||
if not method or not isinstance(method, str):
|
||||
return tool_error(
|
||||
@@ -324,12 +448,18 @@ BROWSER_CDP_SCHEMA: Dict[str, Any] = {
|
||||
"'mobile': false}, target_id=<tabId>\n\n"
|
||||
"**Usage rules:**\n"
|
||||
"- Browser-level methods (Target.*, Browser.*, Storage.*): omit "
|
||||
"target_id.\n"
|
||||
"target_id and frame_id.\n"
|
||||
"- Page-level methods (Page.*, Runtime.*, DOM.*, Emulation.*, "
|
||||
"Network.* scoped to a tab): pass target_id from Target.getTargets.\n"
|
||||
"- Each call is independent — sessions and event subscriptions do "
|
||||
"not persist between calls. For stateful workflows, prefer the "
|
||||
"dedicated browser tools."
|
||||
"- **Cross-origin iframe scope** (Runtime.evaluate inside an OOPIF, "
|
||||
"Page.* targeting a frame target, etc.): pass frame_id from the "
|
||||
"browser_snapshot frame_tree output. This routes through the CDP "
|
||||
"supervisor's live connection — the only reliable way on "
|
||||
"Browserbase where stateless CDP calls hit signed-URL expiry.\n"
|
||||
"- Each stateless call (without frame_id) is independent — sessions "
|
||||
"and event subscriptions do not persist between calls. For stateful "
|
||||
"workflows, prefer the dedicated browser tools or use frame_id "
|
||||
"routing."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
@@ -353,8 +483,24 @@ BROWSER_CDP_SCHEMA: Dict[str, Any] = {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Optional. Target/tab ID from Target.getTargets result "
|
||||
"(each entry's 'targetId'). Required for page-level "
|
||||
"methods; must be omitted for browser-level methods."
|
||||
"(each entry's 'targetId'). Use for page-level methods "
|
||||
"at the top-level tab scope. Mutually exclusive with "
|
||||
"frame_id."
|
||||
),
|
||||
},
|
||||
"frame_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Optional. Out-of-process iframe (OOPIF) frame_id from "
|
||||
"browser_snapshot.frame_tree.children[] where "
|
||||
"is_oopif=true. When set, routes the call through the "
|
||||
"CDP supervisor's live session for that iframe. "
|
||||
"Essential for Runtime.evaluate inside cross-origin "
|
||||
"iframes, especially on Browserbase where fresh "
|
||||
"per-call CDP connections can't keep up with signed "
|
||||
"URL rotation. For same-origin iframes, use parent "
|
||||
"contentWindow/contentDocument from Runtime.evaluate "
|
||||
"at the top-level page instead."
|
||||
),
|
||||
},
|
||||
"timeout": {
|
||||
@@ -408,6 +554,7 @@ registry.register(
|
||||
method=args.get("method", ""),
|
||||
params=args.get("params"),
|
||||
target_id=args.get("target_id"),
|
||||
frame_id=args.get("frame_id"),
|
||||
timeout=args.get("timeout", 30.0),
|
||||
task_id=kw.get("task_id"),
|
||||
),
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
"""Agent-facing tool: respond to a native JS dialog captured by the CDP supervisor.
|
||||
|
||||
This tool is response-only — the agent first reads ``pending_dialogs`` from
|
||||
``browser_snapshot`` output, then calls ``browser_dialog(action=...)`` to
|
||||
accept or dismiss.
|
||||
|
||||
Gated on the same ``_browser_cdp_check`` as ``browser_cdp`` so it only
|
||||
appears when a CDP endpoint is reachable (Browserbase with a
|
||||
``connectUrl``, local Chrome via ``/browser connect``, or
|
||||
``browser.cdp_url`` set in config).
|
||||
|
||||
See ``website/docs/developer-guide/browser-supervisor.md`` for the full
|
||||
design.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY
|
||||
from tools.registry import registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
BROWSER_DIALOG_SCHEMA: Dict[str, Any] = {
|
||||
"name": "browser_dialog",
|
||||
"description": (
|
||||
"Respond to a native JavaScript dialog (alert / confirm / prompt / "
|
||||
"beforeunload) that is currently blocking the page.\n\n"
|
||||
"**Workflow:** call ``browser_snapshot`` first — if a dialog is open, "
|
||||
"it appears in the ``pending_dialogs`` field with ``id``, ``type``, "
|
||||
"and ``message``. Then call this tool with ``action='accept'`` or "
|
||||
"``action='dismiss'``.\n\n"
|
||||
"**Prompt dialogs:** pass ``prompt_text`` to supply the response "
|
||||
"string. Ignored for alert/confirm/beforeunload.\n\n"
|
||||
"**Multiple dialogs:** if more than one dialog is queued (rare — "
|
||||
"happens when a second dialog fires while the first is still open), "
|
||||
"pass ``dialog_id`` from the snapshot to disambiguate.\n\n"
|
||||
"**Availability:** only present when a CDP-capable backend is "
|
||||
"attached — Browserbase sessions, local Chrome via "
|
||||
"``/browser connect``, or ``browser.cdp_url`` in config.yaml. "
|
||||
"Not available on Camofox (REST-only) or the default Playwright "
|
||||
"local browser (CDP port is hidden)."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"action": {
|
||||
"type": "string",
|
||||
"enum": ["accept", "dismiss"],
|
||||
"description": (
|
||||
"'accept' clicks OK / returns the prompt text. "
|
||||
"'dismiss' clicks Cancel / returns null from prompt(). "
|
||||
"For ``beforeunload`` dialogs: 'accept' allows the "
|
||||
"navigation, 'dismiss' keeps the page."
|
||||
),
|
||||
},
|
||||
"prompt_text": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Response string for a ``prompt()`` dialog. Ignored for "
|
||||
"other dialog types. Defaults to empty string."
|
||||
),
|
||||
},
|
||||
"dialog_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Specific dialog to respond to, from "
|
||||
"``browser_snapshot.pending_dialogs[].id``. Required "
|
||||
"only when multiple dialogs are queued."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["action"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def browser_dialog(
|
||||
action: str,
|
||||
prompt_text: Optional[str] = None,
|
||||
dialog_id: Optional[str] = None,
|
||||
task_id: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Respond to a pending dialog on the active task's CDP supervisor."""
|
||||
effective_task_id = task_id or "default"
|
||||
supervisor = SUPERVISOR_REGISTRY.get(effective_task_id)
|
||||
if supervisor is None:
|
||||
return json.dumps(
|
||||
{
|
||||
"success": False,
|
||||
"error": (
|
||||
"No CDP supervisor is attached to this task. Either the "
|
||||
"browser backend doesn't expose CDP (Camofox, default "
|
||||
"Playwright) or no browser session has been started yet. "
|
||||
"Call browser_navigate or /browser connect first."
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
result = supervisor.respond_to_dialog(
|
||||
action=action,
|
||||
prompt_text=prompt_text,
|
||||
dialog_id=dialog_id,
|
||||
)
|
||||
if result.get("ok"):
|
||||
return json.dumps(
|
||||
{
|
||||
"success": True,
|
||||
"action": action,
|
||||
"dialog": result.get("dialog", {}),
|
||||
}
|
||||
)
|
||||
return json.dumps({"success": False, "error": result.get("error", "unknown error")})
|
||||
|
||||
|
||||
def _browser_dialog_check() -> bool:
|
||||
"""Gate: same as ``browser_cdp`` — only offered when CDP is reachable.
|
||||
|
||||
Kept identical so the two tools appear and disappear together. The
|
||||
supervisor itself is started lazily by ``browser_navigate`` /
|
||||
``/browser connect`` / Browserbase session creation, so a reachable
|
||||
CDP URL is enough to commit to showing the tool.
|
||||
"""
|
||||
try:
|
||||
from tools.browser_cdp_tool import _browser_cdp_check # type: ignore[import-not-found]
|
||||
except Exception as exc: # pragma: no cover — defensive
|
||||
logger.debug("browser_dialog check: browser_cdp_tool import failed: %s", exc)
|
||||
return False
|
||||
return _browser_cdp_check()
|
||||
|
||||
|
||||
registry.register(
|
||||
name="browser_dialog",
|
||||
toolset="browser-cdp",
|
||||
schema=BROWSER_DIALOG_SCHEMA,
|
||||
handler=lambda args, **kw: browser_dialog(
|
||||
action=args.get("action", ""),
|
||||
prompt_text=args.get("prompt_text"),
|
||||
dialog_id=args.get("dialog_id"),
|
||||
task_id=kw.get("task_id"),
|
||||
),
|
||||
check_fn=_browser_dialog_check,
|
||||
emoji="💬",
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
+128
-5
@@ -63,7 +63,7 @@ import tempfile
|
||||
import threading
|
||||
import time
|
||||
import requests
|
||||
from typing import Dict, Any, Optional, List
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from pathlib import Path
|
||||
from agent.auxiliary_client import call_llm
|
||||
from hermes_constants import get_hermes_home
|
||||
@@ -287,6 +287,100 @@ def _get_cdp_override() -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _get_dialog_policy_config() -> Tuple[str, float]:
|
||||
"""Read ``browser.dialog_policy`` + ``browser.dialog_timeout_s`` from config.
|
||||
|
||||
Returns a ``(policy, timeout_s)`` tuple, falling back to the supervisor's
|
||||
defaults when keys are absent or invalid.
|
||||
"""
|
||||
# Defer imports so browser_tool can be imported in minimal environments.
|
||||
from tools.browser_supervisor import (
|
||||
DEFAULT_DIALOG_POLICY,
|
||||
DEFAULT_DIALOG_TIMEOUT_S,
|
||||
_VALID_POLICIES,
|
||||
)
|
||||
|
||||
try:
|
||||
from hermes_cli.config import read_raw_config
|
||||
|
||||
cfg = read_raw_config()
|
||||
browser_cfg = cfg.get("browser", {}) if isinstance(cfg, dict) else {}
|
||||
if not isinstance(browser_cfg, dict):
|
||||
return DEFAULT_DIALOG_POLICY, DEFAULT_DIALOG_TIMEOUT_S
|
||||
policy = str(browser_cfg.get("dialog_policy") or DEFAULT_DIALOG_POLICY)
|
||||
if policy not in _VALID_POLICIES:
|
||||
logger.debug("Invalid browser.dialog_policy=%r; using default", policy)
|
||||
policy = DEFAULT_DIALOG_POLICY
|
||||
timeout_raw = browser_cfg.get("dialog_timeout_s")
|
||||
try:
|
||||
timeout_s = float(timeout_raw) if timeout_raw is not None else DEFAULT_DIALOG_TIMEOUT_S
|
||||
if timeout_s <= 0:
|
||||
timeout_s = DEFAULT_DIALOG_TIMEOUT_S
|
||||
except (TypeError, ValueError):
|
||||
timeout_s = DEFAULT_DIALOG_TIMEOUT_S
|
||||
return policy, timeout_s
|
||||
except Exception:
|
||||
return DEFAULT_DIALOG_POLICY, DEFAULT_DIALOG_TIMEOUT_S
|
||||
|
||||
|
||||
def _ensure_cdp_supervisor(task_id: str) -> None:
|
||||
"""Start a CDP supervisor for ``task_id`` if an endpoint is reachable.
|
||||
|
||||
Idempotent — delegates to ``SupervisorRegistry.get_or_start`` which skips
|
||||
when a supervisor for this ``(task_id, cdp_url)`` already exists and
|
||||
tears down + restarts on URL change. Safe to call on every
|
||||
``browser_navigate`` / ``/browser connect`` without worrying about
|
||||
double-attach.
|
||||
|
||||
Resolves the CDP URL in this order:
|
||||
1. ``BROWSER_CDP_URL`` / ``browser.cdp_url`` — covers ``/browser connect``
|
||||
and config-set overrides.
|
||||
2. ``_active_sessions[task_id]["cdp_url"]`` — covers Browserbase + any
|
||||
other cloud provider whose ``create_session`` returns a raw CDP URL.
|
||||
|
||||
Swallows all errors — failing to attach the supervisor must not break
|
||||
the browser session itself. The agent simply won't see
|
||||
``pending_dialogs`` / ``frame_tree`` fields in snapshots.
|
||||
"""
|
||||
cdp_url = _get_cdp_override()
|
||||
if not cdp_url:
|
||||
# Fallback: active session may carry a per-session CDP URL from a
|
||||
# cloud provider (Browserbase sets this).
|
||||
with _cleanup_lock:
|
||||
session_info = _active_sessions.get(task_id, {})
|
||||
maybe = str(session_info.get("cdp_url") or "")
|
||||
if maybe:
|
||||
cdp_url = _resolve_cdp_override(maybe)
|
||||
if not cdp_url:
|
||||
return
|
||||
try:
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY # type: ignore[import-not-found]
|
||||
|
||||
policy, timeout_s = _get_dialog_policy_config()
|
||||
SUPERVISOR_REGISTRY.get_or_start(
|
||||
task_id=task_id,
|
||||
cdp_url=cdp_url,
|
||||
dialog_policy=policy,
|
||||
dialog_timeout_s=timeout_s,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug(
|
||||
"CDP supervisor attach for task=%s failed (non-fatal): %s",
|
||||
task_id,
|
||||
exc,
|
||||
)
|
||||
|
||||
|
||||
def _stop_cdp_supervisor(task_id: str) -> None:
|
||||
"""Stop the CDP supervisor for ``task_id`` if one exists. No-op otherwise."""
|
||||
try:
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY # type: ignore[import-not-found]
|
||||
|
||||
SUPERVISOR_REGISTRY.stop(task_id)
|
||||
except Exception as exc:
|
||||
logger.debug("CDP supervisor stop for task=%s failed (non-fatal): %s", task_id, exc)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Cloud Provider Registry
|
||||
# ============================================================================
|
||||
@@ -995,7 +1089,12 @@ def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]:
|
||||
if task_id in _active_sessions:
|
||||
return _active_sessions[task_id]
|
||||
_active_sessions[task_id] = session_info
|
||||
|
||||
|
||||
# Lazy-start the CDP supervisor now that the session exists (if the
|
||||
# backend surfaces a CDP URL via override or session_info["cdp_url"]).
|
||||
# Idempotent; swallows errors. See _ensure_cdp_supervisor for details.
|
||||
_ensure_cdp_supervisor(task_id)
|
||||
|
||||
return session_info
|
||||
|
||||
|
||||
@@ -1455,7 +1554,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str:
|
||||
if is_first_nav:
|
||||
session_info["_first_nav"] = False
|
||||
_maybe_start_recording(effective_task_id)
|
||||
|
||||
|
||||
result = _run_browser_command(effective_task_id, "open", [url], timeout=max(_get_command_timeout(), 60))
|
||||
|
||||
if result.get("success"):
|
||||
@@ -1578,7 +1677,20 @@ def browser_snapshot(
|
||||
"snapshot": snapshot_text,
|
||||
"element_count": len(refs) if refs else 0
|
||||
}
|
||||
|
||||
|
||||
# Merge supervisor state (pending dialogs + frame tree) when a CDP
|
||||
# supervisor is attached to this task. No-op otherwise. See
|
||||
# website/docs/developer-guide/browser-supervisor.md.
|
||||
try:
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY # type: ignore[import-not-found]
|
||||
_supervisor = SUPERVISOR_REGISTRY.get(effective_task_id)
|
||||
if _supervisor is not None:
|
||||
_sv_snap = _supervisor.snapshot()
|
||||
if _sv_snap.active:
|
||||
response.update(_sv_snap.to_dict())
|
||||
except Exception as _sv_exc:
|
||||
logger.debug("supervisor snapshot merge failed: %s", _sv_exc)
|
||||
|
||||
return json.dumps(response, ensure_ascii=False)
|
||||
else:
|
||||
return json.dumps({
|
||||
@@ -2248,7 +2360,11 @@ def cleanup_browser(task_id: Optional[str] = None) -> None:
|
||||
"""
|
||||
if task_id is None:
|
||||
task_id = "default"
|
||||
|
||||
|
||||
# Stop the CDP supervisor for this task FIRST so we close our WebSocket
|
||||
# before the backend tears down the underlying CDP endpoint.
|
||||
_stop_cdp_supervisor(task_id)
|
||||
|
||||
# Also clean up Camofox session if running in Camofox mode.
|
||||
# Skip full close when managed persistence is enabled — the browser
|
||||
# profile (and its session cookies) must survive across agent tasks.
|
||||
@@ -2329,6 +2445,13 @@ def cleanup_all_browsers() -> None:
|
||||
for task_id in task_ids:
|
||||
cleanup_browser(task_id)
|
||||
|
||||
# Tear down CDP supervisors for all tasks so background threads exit.
|
||||
try:
|
||||
from tools.browser_supervisor import SUPERVISOR_REGISTRY # type: ignore[import-not-found]
|
||||
SUPERVISOR_REGISTRY.stop_all()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Reset cached lookups so they are re-evaluated on next use.
|
||||
global _cached_agent_browser, _agent_browser_resolved
|
||||
global _cached_command_timeout, _command_timeout_resolved
|
||||
|
||||
@@ -298,7 +298,7 @@ def _get_child_timeout() -> float:
|
||||
"""Read delegation.child_timeout_seconds from config.
|
||||
|
||||
Returns the number of seconds a single child agent is allowed to run
|
||||
before being considered stuck. Default: 300 s (5 minutes).
|
||||
before being considered stuck. Default: 600 s (10 minutes).
|
||||
"""
|
||||
cfg = _load_config()
|
||||
val = cfg.get("child_timeout_seconds")
|
||||
@@ -409,7 +409,7 @@ def _preserve_parent_mcp_toolsets(
|
||||
|
||||
|
||||
DEFAULT_MAX_ITERATIONS = 50
|
||||
DEFAULT_CHILD_TIMEOUT = 300 # seconds before a child agent is considered stuck
|
||||
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
|
||||
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
|
||||
_HEARTBEAT_STALE_CYCLES = (
|
||||
5 # mark child stale after this many heartbeats with no iteration progress
|
||||
|
||||
+111
-63
@@ -473,6 +473,12 @@ _ACTIONS = {
|
||||
"remove_role": _remove_role,
|
||||
}
|
||||
|
||||
_CORE_ACTION_NAMES = frozenset({"fetch_messages", "search_members", "create_thread"})
|
||||
_ADMIN_ACTION_NAMES = frozenset(_ACTIONS.keys()) - _CORE_ACTION_NAMES
|
||||
|
||||
_CORE_ACTIONS = {k: v for k, v in _ACTIONS.items() if k in _CORE_ACTION_NAMES}
|
||||
_ADMIN_ACTIONS = {k: v for k, v in _ACTIONS.items() if k in _ADMIN_ACTION_NAMES}
|
||||
|
||||
# Single-source-of-truth manifest: action → (signature, one-line description).
|
||||
# Consumed by :func:`_build_schema` so the schema's top-level description
|
||||
# always matches the registered action set.
|
||||
@@ -531,7 +537,7 @@ def _load_allowed_actions_config() -> Optional[List[str]]:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
except Exception as exc:
|
||||
logger.debug("discord_server: could not load config (%s); allowing all actions.", exc)
|
||||
logger.debug("discord: could not load config (%s); allowing all actions.", exc)
|
||||
return None
|
||||
|
||||
raw = (cfg.get("discord") or {}).get("server_actions")
|
||||
@@ -586,12 +592,16 @@ def _available_actions(
|
||||
def _build_schema(
|
||||
actions: List[str],
|
||||
caps: Optional[Dict[str, Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build the tool schema for the given filtered action list."""
|
||||
tool_name: str = "discord",
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Build the tool schema for the given filtered action list.
|
||||
|
||||
Returns ``None`` when *actions* is empty — callers should drop the
|
||||
tool from registration in that case.
|
||||
"""
|
||||
caps = caps or {}
|
||||
if not actions:
|
||||
# Tool shouldn't be registered when empty, but guard anyway.
|
||||
actions = list(_ACTIONS.keys())
|
||||
return None
|
||||
|
||||
# Action manifest lines (action-first, parameter-scoped).
|
||||
manifest_lines = [
|
||||
@@ -602,24 +612,36 @@ def _build_schema(
|
||||
manifest_block = "\n".join(manifest_lines)
|
||||
|
||||
content_note = ""
|
||||
if caps.get("detected") and caps.get("has_message_content") is False:
|
||||
affected_actions = {"fetch_messages", "list_pins"} & set(actions)
|
||||
if affected_actions and caps.get("detected") and caps.get("has_message_content") is False:
|
||||
names = " and ".join(sorted(affected_actions))
|
||||
content_note = (
|
||||
"\n\nNOTE: Bot does NOT have the MESSAGE_CONTENT privileged intent. "
|
||||
"fetch_messages and list_pins will return message metadata (author, "
|
||||
f"\n\nNOTE: Bot does NOT have the MESSAGE_CONTENT privileged intent. "
|
||||
f"{names} will return message metadata (author, "
|
||||
"timestamps, attachments, reactions, pin state) but `content` will be "
|
||||
"empty for messages not sent as a direct mention to the bot or in DMs. "
|
||||
"Enable the intent in the Discord Developer Portal to see all content."
|
||||
)
|
||||
|
||||
description = (
|
||||
"Query and manage a Discord server via the REST API.\n\n"
|
||||
"Available actions:\n"
|
||||
f"{manifest_block}\n\n"
|
||||
"Call list_guilds first to discover guild_ids, then list_channels for "
|
||||
"channel_ids. Runtime errors will tell you if the bot lacks a specific "
|
||||
"per-guild permission (e.g. MANAGE_ROLES for add_role)."
|
||||
f"{content_note}"
|
||||
)
|
||||
if tool_name == "discord_admin":
|
||||
description = (
|
||||
"Manage a Discord server via the REST API.\n\n"
|
||||
"Available actions:\n"
|
||||
f"{manifest_block}\n\n"
|
||||
"Call list_guilds first to discover guild_ids, then list_channels for "
|
||||
"channel_ids. Runtime errors will tell you if the bot lacks a specific "
|
||||
"per-guild permission (e.g. MANAGE_ROLES for add_role)."
|
||||
f"{content_note}"
|
||||
)
|
||||
else:
|
||||
description = (
|
||||
"Read and participate in a Discord server.\n\n"
|
||||
"Available actions:\n"
|
||||
f"{manifest_block}\n\n"
|
||||
"Use the channel_id from the current conversation context. "
|
||||
"Use search_members to look up user IDs by name prefix."
|
||||
f"{content_note}"
|
||||
)
|
||||
|
||||
properties: Dict[str, Any] = {
|
||||
"action": {
|
||||
@@ -676,7 +698,7 @@ def _build_schema(
|
||||
}
|
||||
|
||||
return {
|
||||
"name": "discord_server",
|
||||
"name": tool_name,
|
||||
"description": description,
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
@@ -686,28 +708,33 @@ def _build_schema(
|
||||
}
|
||||
|
||||
|
||||
def get_dynamic_schema() -> Optional[Dict[str, Any]]:
|
||||
"""Return a schema filtered by current intents + config allowlist.
|
||||
|
||||
Called by ``model_tools.get_tool_definitions`` as a post-processing
|
||||
step so the schema the model sees always reflects reality. Returns
|
||||
``None`` when no actions are available (tool should be removed from
|
||||
the schema list entirely).
|
||||
"""
|
||||
def _get_dynamic_schema(
|
||||
action_subset: Dict[str, Any],
|
||||
tool_name: str,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Build a dynamic schema for *action_subset* filtered by intents + config."""
|
||||
token = _get_bot_token()
|
||||
if not token:
|
||||
return None
|
||||
|
||||
caps = _detect_capabilities(token)
|
||||
allowlist = _load_allowed_actions_config()
|
||||
actions = _available_actions(caps, allowlist)
|
||||
actions = [a for a in _available_actions(caps, allowlist) if a in action_subset]
|
||||
if not actions:
|
||||
logger.warning(
|
||||
"discord_server: config allowlist/intents left zero available actions; "
|
||||
"hiding tool from this session."
|
||||
)
|
||||
return None
|
||||
return _build_schema(actions, caps)
|
||||
return _build_schema(actions, caps, tool_name=tool_name)
|
||||
|
||||
|
||||
def get_dynamic_schema_core() -> Optional[Dict[str, Any]]:
|
||||
return _get_dynamic_schema(_CORE_ACTIONS, "discord")
|
||||
|
||||
|
||||
def get_dynamic_schema_admin() -> Optional[Dict[str, Any]]:
|
||||
return _get_dynamic_schema(_ADMIN_ACTIONS, "discord_admin")
|
||||
|
||||
|
||||
def get_dynamic_schema() -> Optional[Dict[str, Any]]:
|
||||
"""Backward-compat wrapper — returns core schema."""
|
||||
return get_dynamic_schema_core()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -774,11 +801,13 @@ def check_discord_tool_requirements() -> bool:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main handler
|
||||
# Handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def discord_server(
|
||||
def _run_discord_action(
|
||||
action: str,
|
||||
valid_actions: Dict[str, Any],
|
||||
tool_label: str,
|
||||
guild_id: str = "",
|
||||
channel_id: str = "",
|
||||
user_id: str = "",
|
||||
@@ -790,18 +819,17 @@ def discord_server(
|
||||
before: str = "",
|
||||
after: str = "",
|
||||
auto_archive_duration: int = 1440,
|
||||
task_id: str = None,
|
||||
) -> str:
|
||||
"""Execute a Discord server action."""
|
||||
"""Shared handler logic for both discord tools."""
|
||||
token = _get_bot_token()
|
||||
if not token:
|
||||
return json.dumps({"error": "DISCORD_BOT_TOKEN not configured."})
|
||||
|
||||
action_fn = _ACTIONS.get(action)
|
||||
action_fn = valid_actions.get(action)
|
||||
if not action_fn:
|
||||
return json.dumps({
|
||||
"error": f"Unknown action: {action}",
|
||||
"available_actions": list(_ACTIONS.keys()),
|
||||
"available_actions": list(valid_actions.keys()),
|
||||
})
|
||||
|
||||
# Config-level allowlist gate (defense in depth — schema already filtered,
|
||||
@@ -848,44 +876,64 @@ def discord_server(
|
||||
auto_archive_duration=auto_archive_duration,
|
||||
)
|
||||
except DiscordAPIError as e:
|
||||
logger.warning("Discord API error in action '%s': %s", action, e)
|
||||
logger.warning("Discord API error in %s action '%s': %s", tool_label, action, e)
|
||||
if e.status == 403:
|
||||
return json.dumps({"error": _enrich_403(action, e.body)})
|
||||
return json.dumps({"error": str(e)})
|
||||
except Exception as e:
|
||||
logger.exception("Unexpected error in discord_server action '%s'", action)
|
||||
logger.exception("Unexpected error in %s action '%s'", tool_label, action)
|
||||
return json.dumps({"error": f"Unexpected error: {e}"})
|
||||
|
||||
|
||||
def discord_core(action: str, **kwargs) -> str:
|
||||
"""Execute a core Discord action (fetch_messages, search_members, create_thread)."""
|
||||
return _run_discord_action(action, _CORE_ACTIONS, "discord", **kwargs)
|
||||
|
||||
|
||||
def discord_admin_handler(action: str, **kwargs) -> str:
|
||||
"""Execute a Discord admin action (server management)."""
|
||||
return _run_discord_action(action, _ADMIN_ACTIONS, "discord_admin", **kwargs)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool registration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Register with the full unfiltered schema. ``model_tools.get_tool_definitions``
|
||||
# rebuilds this per-session via ``get_dynamic_schema`` so the model only ever
|
||||
# sees intent-available, config-allowed actions. The static registration is a
|
||||
# safe baseline for tools that inspect the registry directly.
|
||||
_STATIC_SCHEMA = _build_schema(list(_ACTIONS.keys()), caps={"detected": False})
|
||||
_HANDLER_DEFAULTS = {
|
||||
"action": "", "guild_id": "", "channel_id": "", "user_id": "",
|
||||
"role_id": "", "message_id": "", "query": "", "name": "",
|
||||
"limit": 50, "before": "", "after": "", "auto_archive_duration": 1440,
|
||||
}
|
||||
|
||||
|
||||
def _make_handler(handler_fn):
|
||||
"""Create a registry-compatible handler lambda for a discord handler."""
|
||||
return lambda args, **kw: handler_fn(
|
||||
**{k: args.get(k, v) for k, v in _HANDLER_DEFAULTS.items()},
|
||||
)
|
||||
|
||||
|
||||
_STATIC_CORE_SCHEMA = _build_schema(
|
||||
list(_CORE_ACTIONS.keys()), caps={"detected": False}, tool_name="discord",
|
||||
)
|
||||
_STATIC_ADMIN_SCHEMA = _build_schema(
|
||||
list(_ADMIN_ACTIONS.keys()), caps={"detected": False}, tool_name="discord_admin",
|
||||
)
|
||||
|
||||
registry.register(
|
||||
name="discord_server",
|
||||
name="discord",
|
||||
toolset="discord",
|
||||
schema=_STATIC_SCHEMA,
|
||||
handler=lambda args, **kw: discord_server(
|
||||
action=args.get("action", ""),
|
||||
guild_id=args.get("guild_id", ""),
|
||||
channel_id=args.get("channel_id", ""),
|
||||
user_id=args.get("user_id", ""),
|
||||
role_id=args.get("role_id", ""),
|
||||
message_id=args.get("message_id", ""),
|
||||
query=args.get("query", ""),
|
||||
name=args.get("name", ""),
|
||||
limit=args.get("limit", 50),
|
||||
before=args.get("before", ""),
|
||||
after=args.get("after", ""),
|
||||
auto_archive_duration=args.get("auto_archive_duration", 1440),
|
||||
task_id=kw.get("task_id"),
|
||||
),
|
||||
schema=_STATIC_CORE_SCHEMA,
|
||||
handler=_make_handler(discord_core),
|
||||
check_fn=check_discord_tool_requirements,
|
||||
requires_env=["DISCORD_BOT_TOKEN"],
|
||||
)
|
||||
|
||||
registry.register(
|
||||
name="discord_admin",
|
||||
toolset="discord_admin",
|
||||
schema=_STATIC_ADMIN_SCHEMA,
|
||||
handler=_make_handler(discord_admin_handler),
|
||||
check_fn=check_discord_tool_requirements,
|
||||
requires_env=["DISCORD_BOT_TOKEN"],
|
||||
)
|
||||
|
||||
@@ -292,10 +292,15 @@ def normalize_read_pagination(offset: Any = DEFAULT_READ_OFFSET,
|
||||
Tool schemas declare minimum/maximum values, but not every caller or
|
||||
provider enforces schemas before dispatch. Clamp here so invalid values
|
||||
cannot leak into sed ranges like ``0,-1p``.
|
||||
|
||||
The upper bound on ``limit`` comes from ``tool_output.max_lines`` in
|
||||
config.yaml (defaults to the module-level ``MAX_LINES`` constant).
|
||||
"""
|
||||
from tools.tool_output_limits import get_max_lines
|
||||
max_lines = get_max_lines()
|
||||
normalized_offset = max(1, _coerce_int(offset, DEFAULT_READ_OFFSET))
|
||||
normalized_limit = _coerce_int(limit, DEFAULT_READ_LIMIT)
|
||||
normalized_limit = max(1, min(normalized_limit, MAX_LINES))
|
||||
normalized_limit = max(1, min(normalized_limit, max_lines))
|
||||
return normalized_offset, normalized_limit
|
||||
|
||||
|
||||
@@ -414,12 +419,14 @@ class ShellFileOperations(FileOperations):
|
||||
|
||||
def _add_line_numbers(self, content: str, start_line: int = 1) -> str:
|
||||
"""Add line numbers to content in LINE_NUM|CONTENT format."""
|
||||
from tools.tool_output_limits import get_max_line_length
|
||||
max_line_length = get_max_line_length()
|
||||
lines = content.split('\n')
|
||||
numbered = []
|
||||
for i, line in enumerate(lines, start=start_line):
|
||||
# Truncate long lines
|
||||
if len(line) > MAX_LINE_LENGTH:
|
||||
line = line[:MAX_LINE_LENGTH] + "... [truncated]"
|
||||
if len(line) > max_line_length:
|
||||
line = line[:max_line_length] + "... [truncated]"
|
||||
numbered.append(f"{i:6d}|{line}")
|
||||
return '\n'.join(numbered)
|
||||
|
||||
|
||||
+178
-6
@@ -78,12 +78,86 @@ import math
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stdio subprocess stderr redirection
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# The MCP SDK's ``stdio_client(server, errlog=sys.stderr)`` defaults the
|
||||
# subprocess stderr stream to the parent process's real stderr, i.e. the
|
||||
# user's TTY. That means any MCP server we spawn at startup (FastMCP
|
||||
# banners, slack-mcp-server JSON startup logs, etc.) writes directly onto
|
||||
# the terminal while prompt_toolkit / Rich is rendering the TUI — which
|
||||
# corrupts the display and can hang the session.
|
||||
#
|
||||
# Instead we redirect every stdio MCP subprocess's stderr into a shared
|
||||
# per-profile log file (~/.hermes/logs/mcp-stderr.log), tagged with the
|
||||
# server name so individual servers remain debuggable.
|
||||
#
|
||||
# Fallback is os.devnull if opening the log file fails for any reason.
|
||||
|
||||
_mcp_stderr_log_fh: Optional[Any] = None
|
||||
_mcp_stderr_log_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_mcp_stderr_log() -> Any:
|
||||
"""Return a shared append-mode file handle for MCP subprocess stderr.
|
||||
|
||||
Opened once per process and reused for every stdio server. Must have a
|
||||
real OS-level file descriptor (``fileno()``) because asyncio's subprocess
|
||||
machinery wires the child's stderr directly to that fd. Falls back to
|
||||
``/dev/null`` if opening the log file fails.
|
||||
"""
|
||||
global _mcp_stderr_log_fh
|
||||
with _mcp_stderr_log_lock:
|
||||
if _mcp_stderr_log_fh is not None:
|
||||
return _mcp_stderr_log_fh
|
||||
try:
|
||||
from hermes_constants import get_hermes_home
|
||||
log_dir = get_hermes_home() / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
log_path = log_dir / "mcp-stderr.log"
|
||||
# Line-buffered so server output lands on disk promptly; errors=
|
||||
# "replace" tolerates garbled binary output from misbehaving
|
||||
# servers.
|
||||
fh = open(log_path, "a", encoding="utf-8", errors="replace", buffering=1)
|
||||
# Sanity-check: confirm a real fd is available before we commit.
|
||||
fh.fileno()
|
||||
_mcp_stderr_log_fh = fh
|
||||
except Exception as exc: # pragma: no cover — best-effort fallback
|
||||
logger.debug("Failed to open MCP stderr log, using devnull: %s", exc)
|
||||
try:
|
||||
_mcp_stderr_log_fh = open(os.devnull, "w", encoding="utf-8")
|
||||
except Exception:
|
||||
# Last resort: the real stderr. Not ideal for TUI users but
|
||||
# it matches pre-fix behavior.
|
||||
_mcp_stderr_log_fh = sys.stderr
|
||||
return _mcp_stderr_log_fh
|
||||
|
||||
|
||||
def _write_stderr_log_header(server_name: str) -> None:
|
||||
"""Write a human-readable session marker before launching a server.
|
||||
|
||||
Gives operators a way to find each server's output in the shared
|
||||
``mcp-stderr.log`` file without needing per-line prefixes (which would
|
||||
require a pipe + reader thread and complicate shutdown).
|
||||
"""
|
||||
fh = _get_mcp_stderr_log()
|
||||
try:
|
||||
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
fh.write(f"\n===== [{ts}] starting MCP server '{server_name}' =====\n")
|
||||
fh.flush()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Graceful import -- MCP SDK is an optional dependency
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -93,6 +167,10 @@ _MCP_HTTP_AVAILABLE = False
|
||||
_MCP_SAMPLING_TYPES = False
|
||||
_MCP_NOTIFICATION_TYPES = False
|
||||
_MCP_MESSAGE_HANDLER_SUPPORTED = False
|
||||
# Conservative fallback for SDK builds that don't export LATEST_PROTOCOL_VERSION.
|
||||
# Streamable HTTP was introduced by 2025-03-26, so this remains valid for the
|
||||
# HTTP transport path even on older-but-supported SDK versions.
|
||||
LATEST_PROTOCOL_VERSION = "2025-03-26"
|
||||
try:
|
||||
from mcp import ClientSession, StdioServerParameters
|
||||
from mcp.client.stdio import stdio_client
|
||||
@@ -109,6 +187,10 @@ try:
|
||||
_MCP_NEW_HTTP = True
|
||||
except ImportError:
|
||||
_MCP_NEW_HTTP = False
|
||||
try:
|
||||
from mcp.types import LATEST_PROTOCOL_VERSION
|
||||
except ImportError:
|
||||
logger.debug("mcp.types.LATEST_PROTOCOL_VERSION not available -- using fallback protocol version")
|
||||
# Sampling types -- separated so older SDK versions don't break MCP support
|
||||
try:
|
||||
from mcp.types import (
|
||||
@@ -962,7 +1044,13 @@ class MCPServerTask:
|
||||
|
||||
# Snapshot child PIDs before spawning so we can track the new one.
|
||||
pids_before = _snapshot_child_pids()
|
||||
async with stdio_client(server_params) as (read_stream, write_stream):
|
||||
# Redirect subprocess stderr into a shared log file so MCP servers
|
||||
# (FastMCP banners, slack-mcp startup JSON, etc.) don't dump onto
|
||||
# the user's TTY and corrupt the TUI. Preserves debuggability via
|
||||
# ~/.hermes/logs/mcp-stderr.log.
|
||||
_write_stderr_log_header(self.name)
|
||||
_errlog = _get_mcp_stderr_log()
|
||||
async with stdio_client(server_params, errlog=_errlog) as (read_stream, write_stream):
|
||||
# Capture the newly spawned subprocess PID for force-kill cleanup.
|
||||
new_pids = _snapshot_child_pids() - pids_before
|
||||
if new_pids:
|
||||
@@ -995,6 +1083,12 @@ class MCPServerTask:
|
||||
|
||||
url = config["url"]
|
||||
headers = dict(config.get("headers") or {})
|
||||
# Some MCP servers require MCP-Protocol-Version on the initial
|
||||
# initialize request and reject session-less POSTs otherwise.
|
||||
# Seed it as a client-level default, but treat user overrides as
|
||||
# case-insensitive so conventional casing is preserved.
|
||||
if not any(key.lower() == "mcp-protocol-version" for key in headers):
|
||||
headers["mcp-protocol-version"] = LATEST_PROTOCOL_VERSION
|
||||
connect_timeout = config.get("connect_timeout", _DEFAULT_CONNECT_TIMEOUT)
|
||||
ssl_verify = config.get("ssl_verify", True)
|
||||
|
||||
@@ -2019,14 +2113,92 @@ def _make_check_fn(server_name: str):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _normalize_mcp_input_schema(schema: dict | None) -> dict:
|
||||
"""Normalize MCP input schemas for LLM tool-calling compatibility."""
|
||||
"""Normalize MCP input schemas for LLM tool-calling compatibility.
|
||||
|
||||
MCP servers can emit plain JSON Schema with ``definitions`` /
|
||||
``#/definitions/...`` references. Kimi / Moonshot rejects that form and
|
||||
requires local refs to point into ``#/$defs/...`` instead. Normalize the
|
||||
common draft-07 shape here so MCP tool schemas remain portable across
|
||||
OpenAI-compatible providers.
|
||||
|
||||
Additional MCP-server robustness repairs applied recursively:
|
||||
|
||||
* Missing or ``null`` ``type`` on an object-shaped node is coerced to
|
||||
``"object"`` (some servers omit it). See PR #4897.
|
||||
* When an ``object`` node lacks ``properties``, an empty ``properties``
|
||||
dict is added so ``required`` entries don't dangle.
|
||||
* ``required`` arrays are pruned to only names that exist in
|
||||
``properties``; otherwise Google AI Studio / Gemini 400s with
|
||||
``property is not defined``. See PR #4651.
|
||||
|
||||
All repairs are provider-agnostic and ideally produce a schema valid on
|
||||
OpenAI, Anthropic, Gemini, and Moonshot in one pass.
|
||||
"""
|
||||
if not schema:
|
||||
return {"type": "object", "properties": {}}
|
||||
|
||||
if schema.get("type") == "object" and "properties" not in schema:
|
||||
return {**schema, "properties": {}}
|
||||
def _rewrite_local_refs(node):
|
||||
if isinstance(node, dict):
|
||||
normalized = {}
|
||||
for key, value in node.items():
|
||||
out_key = "$defs" if key == "definitions" else key
|
||||
normalized[out_key] = _rewrite_local_refs(value)
|
||||
ref = normalized.get("$ref")
|
||||
if isinstance(ref, str) and ref.startswith("#/definitions/"):
|
||||
normalized["$ref"] = "#/$defs/" + ref[len("#/definitions/"):]
|
||||
return normalized
|
||||
if isinstance(node, list):
|
||||
return [_rewrite_local_refs(item) for item in node]
|
||||
return node
|
||||
|
||||
return schema
|
||||
def _repair_object_shape(node):
|
||||
"""Recursively repair object-shaped nodes: fill type, prune required."""
|
||||
if isinstance(node, list):
|
||||
return [_repair_object_shape(item) for item in node]
|
||||
if not isinstance(node, dict):
|
||||
return node
|
||||
|
||||
repaired = {k: _repair_object_shape(v) for k, v in node.items()}
|
||||
|
||||
# Coerce missing / null type when the shape is clearly an object
|
||||
# (has properties or required but no type).
|
||||
if not repaired.get("type") and (
|
||||
"properties" in repaired or "required" in repaired
|
||||
):
|
||||
repaired["type"] = "object"
|
||||
|
||||
if repaired.get("type") == "object":
|
||||
# Ensure properties exists so required can reference it safely
|
||||
if "properties" not in repaired or not isinstance(
|
||||
repaired.get("properties"), dict
|
||||
):
|
||||
repaired["properties"] = {} if "properties" not in repaired else repaired["properties"]
|
||||
if not isinstance(repaired.get("properties"), dict):
|
||||
repaired["properties"] = {}
|
||||
|
||||
# Prune required to only include names that exist in properties
|
||||
required = repaired.get("required")
|
||||
if isinstance(required, list):
|
||||
props = repaired.get("properties") or {}
|
||||
valid = [r for r in required if isinstance(r, str) and r in props]
|
||||
if len(valid) != len(required):
|
||||
if valid:
|
||||
repaired["required"] = valid
|
||||
else:
|
||||
repaired.pop("required", None)
|
||||
|
||||
return repaired
|
||||
|
||||
normalized = _rewrite_local_refs(schema)
|
||||
normalized = _repair_object_shape(normalized)
|
||||
|
||||
# Ensure top-level is a well-formed object schema
|
||||
if not isinstance(normalized, dict):
|
||||
return {"type": "object", "properties": {}}
|
||||
if normalized.get("type") == "object" and "properties" not in normalized:
|
||||
normalized = {**normalized, "properties": {}}
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def sanitize_mcp_name_component(value: str) -> str:
|
||||
@@ -2057,7 +2229,7 @@ def _convert_mcp_schema(server_name: str, mcp_tool) -> dict:
|
||||
return {
|
||||
"name": prefixed_name,
|
||||
"description": mcp_tool.description or f"MCP tool {mcp_tool.name} from {server_name}",
|
||||
"parameters": _normalize_mcp_input_schema(mcp_tool.inputSchema),
|
||||
"parameters": _normalize_mcp_input_schema(getattr(mcp_tool, "inputSchema", None)),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1805,7 +1805,8 @@ def terminal_tool(
|
||||
pass
|
||||
|
||||
# Truncate output if too long, keeping both head and tail
|
||||
MAX_OUTPUT_CHARS = 50000
|
||||
from tools.tool_output_limits import get_max_bytes
|
||||
MAX_OUTPUT_CHARS = get_max_bytes()
|
||||
if len(output) > MAX_OUTPUT_CHARS:
|
||||
head_chars = int(MAX_OUTPUT_CHARS * 0.4) # 40% head (error messages often appear early)
|
||||
tail_chars = MAX_OUTPUT_CHARS - head_chars # 60% tail (most recent/relevant output)
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Configurable tool-output truncation limits.
|
||||
|
||||
Ported from anomalyco/opencode PR #23770 (``feat(truncate): allow
|
||||
configuring tool output truncation limits``).
|
||||
|
||||
OpenCode hardcoded ``MAX_LINES = 2000`` and ``MAX_BYTES = 50 * 1024``
|
||||
as tool-output truncation thresholds. Hermes-agent had the same
|
||||
hardcoded constants in two places:
|
||||
|
||||
* ``tools/terminal_tool.py`` — ``MAX_OUTPUT_CHARS = 50000`` (terminal
|
||||
stdout/stderr cap)
|
||||
* ``tools/file_operations.py`` — ``MAX_LINES = 2000`` /
|
||||
``MAX_LINE_LENGTH = 2000`` (read_file pagination cap + per-line cap)
|
||||
|
||||
This module centralises those values behind a single config section
|
||||
(``tool_output`` in ``config.yaml``) so power users can tune them
|
||||
without patching the source. The existing hardcoded numbers remain as
|
||||
defaults, so behaviour is unchanged when the config key is absent.
|
||||
|
||||
Example ``config.yaml``::
|
||||
|
||||
tool_output:
|
||||
max_bytes: 100000 # terminal output cap (chars)
|
||||
max_lines: 5000 # read_file pagination + truncation cap
|
||||
max_line_length: 2000 # per-line length cap before '... [truncated]'
|
||||
|
||||
The limits reader is defensive: any error (missing config file, invalid
|
||||
value type, etc.) falls back to the built-in defaults so tools never
|
||||
fail because of a malformed config.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
# Hardcoded defaults — these match the pre-existing values, so adding
|
||||
# this module is behaviour-preserving for users who don't set
|
||||
# ``tool_output`` in config.yaml.
|
||||
DEFAULT_MAX_BYTES = 50_000 # terminal_tool.MAX_OUTPUT_CHARS
|
||||
DEFAULT_MAX_LINES = 2000 # file_operations.MAX_LINES
|
||||
DEFAULT_MAX_LINE_LENGTH = 2000 # file_operations.MAX_LINE_LENGTH
|
||||
|
||||
|
||||
def _coerce_positive_int(value: Any, default: int) -> int:
|
||||
"""Return ``value`` as a positive int, or ``default`` on any issue."""
|
||||
try:
|
||||
iv = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
if iv <= 0:
|
||||
return default
|
||||
return iv
|
||||
|
||||
|
||||
def get_tool_output_limits() -> Dict[str, int]:
|
||||
"""Return resolved tool-output limits, reading ``tool_output`` from config.
|
||||
|
||||
Keys: ``max_bytes``, ``max_lines``, ``max_line_length``. Missing or
|
||||
invalid entries fall through to the ``DEFAULT_*`` constants. This
|
||||
function NEVER raises.
|
||||
"""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config() or {}
|
||||
section = cfg.get("tool_output") if isinstance(cfg, dict) else None
|
||||
if not isinstance(section, dict):
|
||||
section = {}
|
||||
except Exception:
|
||||
section = {}
|
||||
|
||||
return {
|
||||
"max_bytes": _coerce_positive_int(section.get("max_bytes"), DEFAULT_MAX_BYTES),
|
||||
"max_lines": _coerce_positive_int(section.get("max_lines"), DEFAULT_MAX_LINES),
|
||||
"max_line_length": _coerce_positive_int(
|
||||
section.get("max_line_length"), DEFAULT_MAX_LINE_LENGTH
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def get_max_bytes() -> int:
|
||||
"""Shortcut for terminal-tool callers that only need the byte cap."""
|
||||
return get_tool_output_limits()["max_bytes"]
|
||||
|
||||
|
||||
def get_max_lines() -> int:
|
||||
"""Shortcut for file-ops callers that only need the line cap."""
|
||||
return get_tool_output_limits()["max_lines"]
|
||||
|
||||
|
||||
def get_max_line_length() -> int:
|
||||
"""Shortcut for file-ops callers that only need the per-line cap."""
|
||||
return get_tool_output_limits()["max_line_length"]
|
||||
+38
-8
@@ -43,7 +43,7 @@ _HERMES_CORE_TOOLS = [
|
||||
"browser_navigate", "browser_snapshot", "browser_click",
|
||||
"browser_type", "browser_scroll", "browser_back",
|
||||
"browser_press", "browser_get_images",
|
||||
"browser_vision", "browser_console", "browser_cdp",
|
||||
"browser_vision", "browser_console", "browser_cdp", "browser_dialog",
|
||||
# Text-to-speech
|
||||
"text_to_speech",
|
||||
# Planning & memory
|
||||
@@ -115,7 +115,8 @@ TOOLSETS = {
|
||||
"browser_navigate", "browser_snapshot", "browser_click",
|
||||
"browser_type", "browser_scroll", "browser_back",
|
||||
"browser_press", "browser_get_images",
|
||||
"browser_vision", "browser_console", "browser_cdp", "web_search"
|
||||
"browser_vision", "browser_console", "browser_cdp",
|
||||
"browser_dialog", "web_search"
|
||||
],
|
||||
"includes": []
|
||||
},
|
||||
@@ -201,6 +202,18 @@ TOOLSETS = {
|
||||
"includes": []
|
||||
},
|
||||
|
||||
"discord": {
|
||||
"description": "Discord read and participate tools (fetch messages, search members, create threads)",
|
||||
"tools": ["discord"],
|
||||
"includes": [],
|
||||
},
|
||||
|
||||
"discord_admin": {
|
||||
"description": "Discord server management (list channels/roles, pin messages, assign roles)",
|
||||
"tools": ["discord_admin"],
|
||||
"includes": [],
|
||||
},
|
||||
|
||||
"feishu_doc": {
|
||||
"description": "Read Feishu/Lark document content",
|
||||
"tools": ["feishu_doc_read"],
|
||||
@@ -249,7 +262,7 @@ TOOLSETS = {
|
||||
"browser_navigate", "browser_snapshot", "browser_click",
|
||||
"browser_type", "browser_scroll", "browser_back",
|
||||
"browser_press", "browser_get_images",
|
||||
"browser_vision", "browser_console", "browser_cdp",
|
||||
"browser_vision", "browser_console", "browser_cdp", "browser_dialog",
|
||||
"todo", "memory",
|
||||
"session_search",
|
||||
"execute_code", "delegate_task",
|
||||
@@ -274,7 +287,7 @@ TOOLSETS = {
|
||||
"browser_navigate", "browser_snapshot", "browser_click",
|
||||
"browser_type", "browser_scroll", "browser_back",
|
||||
"browser_press", "browser_get_images",
|
||||
"browser_vision", "browser_console", "browser_cdp",
|
||||
"browser_vision", "browser_console", "browser_cdp", "browser_dialog",
|
||||
# Planning & memory
|
||||
"todo", "memory",
|
||||
# Session history search
|
||||
@@ -295,7 +308,18 @@ TOOLSETS = {
|
||||
"tools": _HERMES_CORE_TOOLS,
|
||||
"includes": []
|
||||
},
|
||||
|
||||
|
||||
"hermes-cron": {
|
||||
# Mirrors hermes-cli so cron's "default" toolset is the same set of
|
||||
# core tools users see interactively — then `hermes tools` filters
|
||||
# them down per the platform config. _DEFAULT_OFF_TOOLSETS (moa,
|
||||
# homeassistant, rl) are excluded by _get_platform_tools() unless
|
||||
# the user explicitly enables them.
|
||||
"description": "Default cron toolset - same core tools as hermes-cli; gated by `hermes tools`",
|
||||
"tools": _HERMES_CORE_TOOLS,
|
||||
"includes": []
|
||||
},
|
||||
|
||||
"hermes-telegram": {
|
||||
"description": "Telegram bot toolset - full access for personal use (terminal has safety checks)",
|
||||
"tools": _HERMES_CORE_TOOLS,
|
||||
@@ -305,8 +329,8 @@ TOOLSETS = {
|
||||
"hermes-discord": {
|
||||
"description": "Discord bot toolset - full access (terminal has safety checks via dangerous command approval)",
|
||||
"tools": _HERMES_CORE_TOOLS + [
|
||||
# Discord server introspection & management (gated on DISCORD_BOT_TOKEN via check_fn)
|
||||
"discord_server",
|
||||
"discord",
|
||||
"discord_admin",
|
||||
],
|
||||
"includes": []
|
||||
},
|
||||
@@ -367,7 +391,13 @@ TOOLSETS = {
|
||||
|
||||
"hermes-feishu": {
|
||||
"description": "Feishu/Lark bot toolset - enterprise messaging via Feishu/Lark (full access)",
|
||||
"tools": _HERMES_CORE_TOOLS,
|
||||
"tools": _HERMES_CORE_TOOLS + [
|
||||
"feishu_doc_read",
|
||||
"feishu_drive_list_comments",
|
||||
"feishu_drive_list_comment_replies",
|
||||
"feishu_drive_reply_comment",
|
||||
"feishu_drive_add_comment",
|
||||
],
|
||||
"includes": []
|
||||
},
|
||||
|
||||
|
||||
+81
-2
@@ -1,19 +1,93 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
from tui_gateway.server import dispatch, resolve_skin, write_json
|
||||
from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json
|
||||
|
||||
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
|
||||
|
||||
def _log_signal(signum: int, frame) -> None:
|
||||
"""Capture WHICH thread and WHERE a termination signal hit us.
|
||||
|
||||
SIG_DFL for SIGPIPE kills the process silently the instant any
|
||||
background thread (TTS playback, beep, voice status emitter, etc.)
|
||||
writes to a stdout the TUI has stopped reading. Without this
|
||||
handler the gateway-exited banner in the TUI has no trace — the
|
||||
crash log never sees a Python exception because the kernel reaps
|
||||
the process before the interpreter runs anything.
|
||||
"""
|
||||
name = {
|
||||
signal.SIGPIPE: "SIGPIPE",
|
||||
signal.SIGTERM: "SIGTERM",
|
||||
signal.SIGHUP: "SIGHUP",
|
||||
}.get(signum, f"signal {signum}")
|
||||
try:
|
||||
os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
|
||||
with open(_CRASH_LOG, "a", encoding="utf-8") as f:
|
||||
f.write(
|
||||
f"\n=== {name} received · {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n"
|
||||
)
|
||||
if frame is not None:
|
||||
f.write("main-thread stack at signal delivery:\n")
|
||||
traceback.print_stack(frame, file=f)
|
||||
# All live threads — signal may have been triggered by a
|
||||
# background thread (write to broken stdout from TTS, etc.).
|
||||
import threading as _threading
|
||||
for tid, th in _threading._active.items():
|
||||
f.write(f"\n--- thread {th.name} (id={tid}) ---\n")
|
||||
f.write("".join(traceback.format_stack(sys._current_frames().get(tid))))
|
||||
except Exception:
|
||||
pass
|
||||
print(f"[gateway-signal] {name}", file=sys.stderr, flush=True)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
# SIGPIPE: ignore, don't exit. The old SIG_DFL killed the process
|
||||
# silently whenever a *background* thread (TTS playback chain, voice
|
||||
# debug stderr emitter, beep thread) wrote to a pipe the TUI had gone
|
||||
# quiet on — even though the main thread was perfectly fine waiting on
|
||||
# stdin. Ignoring the signal lets Python raise BrokenPipeError on the
|
||||
# offending write (write_json already handles that with a clean
|
||||
# sys.exit(0) + _log_exit), which keeps the gateway alive as long as
|
||||
# the main command pipe is still readable. Terminal signals still
|
||||
# route through _log_signal so kills and hangups are diagnosable.
|
||||
signal.signal(signal.SIGPIPE, signal.SIG_IGN)
|
||||
signal.signal(signal.SIGTERM, _log_signal)
|
||||
signal.signal(signal.SIGHUP, _log_signal)
|
||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||
|
||||
|
||||
def _log_exit(reason: str) -> None:
|
||||
"""Record why the gateway subprocess is shutting down.
|
||||
|
||||
Three exit paths (startup write fail, parse-error-response write fail,
|
||||
dispatch-response write fail, stdin EOF) all collapse into a silent
|
||||
sys.exit(0) here. Without this trail the TUI shows "gateway exited"
|
||||
with no actionable clue about WHICH broken pipe or WHICH message
|
||||
triggered it — the main reason voice-mode turns look like phantom
|
||||
crashes when the real story is "TUI read pipe closed on this event".
|
||||
"""
|
||||
try:
|
||||
os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
|
||||
with open(_CRASH_LOG, "a", encoding="utf-8") as f:
|
||||
f.write(
|
||||
f"\n=== gateway exit · {time.strftime('%Y-%m-%d %H:%M:%S')} "
|
||||
f"· reason={reason} ===\n"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
print(f"[gateway-exit] {reason}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def main():
|
||||
if not write_json({
|
||||
"jsonrpc": "2.0",
|
||||
"method": "event",
|
||||
"params": {"type": "gateway.ready", "payload": {"skin": resolve_skin()}},
|
||||
}):
|
||||
_log_exit("startup write failed (broken stdout pipe before first event)")
|
||||
sys.exit(0)
|
||||
|
||||
for raw in sys.stdin:
|
||||
@@ -25,14 +99,19 @@ def main():
|
||||
req = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
if not write_json({"jsonrpc": "2.0", "error": {"code": -32700, "message": "parse error"}, "id": None}):
|
||||
_log_exit("parse-error-response write failed (broken stdout pipe)")
|
||||
sys.exit(0)
|
||||
continue
|
||||
|
||||
method = req.get("method") if isinstance(req, dict) else None
|
||||
resp = dispatch(req)
|
||||
if resp is not None:
|
||||
if not write_json(resp):
|
||||
_log_exit(f"response write failed for method={method!r} (broken stdout pipe)")
|
||||
sys.exit(0)
|
||||
|
||||
_log_exit("stdin EOF (TUI closed the command pipe)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
+428
-19
@@ -23,6 +23,75 @@ load_hermes_dotenv(
|
||||
hermes_home=_hermes_home, project_env=Path(__file__).parent.parent / ".env"
|
||||
)
|
||||
|
||||
|
||||
# ── Panic logger ─────────────────────────────────────────────────────
|
||||
# Gateway crashes in a TUI session leave no forensics: stdout is the
|
||||
# JSON-RPC pipe (TUI side parses it, doesn't log raw), the root logger
|
||||
# only catches handled warnings, and the subprocess exits before stderr
|
||||
# flushes through the stderr->gateway.stderr event pump. This hook
|
||||
# appends every unhandled exception to ~/.hermes/logs/tui_gateway_crash.log
|
||||
# AND re-emits a one-line summary to stderr so the TUI can surface it in
|
||||
# Activity — exactly what was missing when the voice-mode turns started
|
||||
# exiting the gateway mid-TTS.
|
||||
_CRASH_LOG = os.path.join(_hermes_home, "logs", "tui_gateway_crash.log")
|
||||
|
||||
|
||||
def _panic_hook(exc_type, exc_value, exc_tb):
|
||||
import traceback
|
||||
|
||||
trace = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
|
||||
try:
|
||||
os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
|
||||
with open(_CRASH_LOG, "a", encoding="utf-8") as f:
|
||||
f.write(
|
||||
f"\n=== unhandled exception · {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n"
|
||||
)
|
||||
f.write(trace)
|
||||
except Exception:
|
||||
pass
|
||||
# Stderr goes through to the TUI as a gateway.stderr Activity line —
|
||||
# the first line here is what the user will see without opening any
|
||||
# log files. Rest of the stack is still in the log for full context.
|
||||
first = str(exc_value).strip().splitlines()[0] if str(exc_value).strip() else exc_type.__name__
|
||||
print(f"[gateway-crash] {exc_type.__name__}: {first}", file=sys.stderr, flush=True)
|
||||
# Chain to the default hook so the process still terminates normally.
|
||||
sys.__excepthook__(exc_type, exc_value, exc_tb)
|
||||
|
||||
|
||||
sys.excepthook = _panic_hook
|
||||
|
||||
|
||||
def _thread_panic_hook(args):
|
||||
# threading.excepthook signature: SimpleNamespace(exc_type, exc_value, exc_traceback, thread)
|
||||
import traceback
|
||||
|
||||
trace = "".join(
|
||||
traceback.format_exception(args.exc_type, args.exc_value, args.exc_traceback)
|
||||
)
|
||||
try:
|
||||
os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
|
||||
with open(_CRASH_LOG, "a", encoding="utf-8") as f:
|
||||
f.write(
|
||||
f"\n=== thread exception · {time.strftime('%Y-%m-%d %H:%M:%S')} "
|
||||
f"· thread={args.thread.name} ===\n"
|
||||
)
|
||||
f.write(trace)
|
||||
except Exception:
|
||||
pass
|
||||
first_line = (
|
||||
str(args.exc_value).strip().splitlines()[0]
|
||||
if str(args.exc_value).strip()
|
||||
else args.exc_type.__name__
|
||||
)
|
||||
print(
|
||||
f"[gateway-crash] thread {args.thread.name} raised {args.exc_type.__name__}: {first_line}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
||||
threading.excepthook = _thread_panic_hook
|
||||
|
||||
try:
|
||||
from hermes_cli.banner import prefetch_update_check
|
||||
|
||||
@@ -2126,7 +2195,43 @@ def _(rid, params: dict) -> dict:
|
||||
if rendered:
|
||||
payload["rendered"] = rendered
|
||||
_emit("message.complete", sid, payload)
|
||||
|
||||
# CLI parity: when voice-mode TTS is on, speak the agent reply
|
||||
# (cli.py:_voice_speak_response). Only the final text — tool
|
||||
# calls / reasoning already stream separately and would be
|
||||
# noisy to read aloud.
|
||||
if (
|
||||
status == "complete"
|
||||
and isinstance(raw, str)
|
||||
and raw.strip()
|
||||
and _voice_tts_enabled()
|
||||
):
|
||||
try:
|
||||
from hermes_cli.voice import speak_text
|
||||
|
||||
spoken = raw
|
||||
threading.Thread(
|
||||
target=speak_text, args=(spoken,), daemon=True
|
||||
).start()
|
||||
except ImportError:
|
||||
logger.warning("voice TTS skipped: hermes_cli.voice unavailable")
|
||||
except Exception as e:
|
||||
logger.warning("voice TTS dispatch failed: %s", e)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
trace = traceback.format_exc()
|
||||
try:
|
||||
os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
|
||||
with open(_CRASH_LOG, "a", encoding="utf-8") as f:
|
||||
f.write(
|
||||
f"\n=== turn-dispatcher exception · "
|
||||
f"{time.strftime('%Y-%m-%d %H:%M:%S')} · sid={sid} ===\n"
|
||||
)
|
||||
f.write(trace)
|
||||
except Exception:
|
||||
pass
|
||||
print(f"[gateway-turn] {type(e).__name__}: {e}", file=sys.stderr, flush=True)
|
||||
_emit("error", sid, {"message": str(e)})
|
||||
finally:
|
||||
try:
|
||||
@@ -3151,6 +3256,162 @@ def _(rid, params: dict) -> dict:
|
||||
|
||||
# ── Methods: complete ─────────────────────────────────────────────────
|
||||
|
||||
_FUZZY_CACHE_TTL_S = 5.0
|
||||
_FUZZY_CACHE_MAX_FILES = 20000
|
||||
_FUZZY_FALLBACK_EXCLUDES = frozenset(
|
||||
{
|
||||
".git",
|
||||
".hg",
|
||||
".svn",
|
||||
".next",
|
||||
".cache",
|
||||
".venv",
|
||||
"venv",
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
"dist",
|
||||
"build",
|
||||
"target",
|
||||
".mypy_cache",
|
||||
".pytest_cache",
|
||||
".ruff_cache",
|
||||
}
|
||||
)
|
||||
_fuzzy_cache_lock = threading.Lock()
|
||||
_fuzzy_cache: dict[str, tuple[float, list[str]]] = {}
|
||||
|
||||
|
||||
def _list_repo_files(root: str) -> list[str]:
|
||||
"""Return file paths relative to ``root``.
|
||||
|
||||
Uses ``git ls-files`` from the repo top (resolved via
|
||||
``rev-parse --show-toplevel``) so the listing covers tracked + untracked
|
||||
files anywhere in the repo, then converts each path back to be relative
|
||||
to ``root``. Files outside ``root`` (parent directories of cwd, sibling
|
||||
subtrees) are excluded so the picker stays scoped to what's reachable
|
||||
from the gateway's cwd. Falls back to a bounded ``os.walk(root)`` when
|
||||
``root`` isn't inside a git repo. Result cached per-root for
|
||||
``_FUZZY_CACHE_TTL_S`` so rapid keystrokes don't respawn git processes.
|
||||
"""
|
||||
now = time.monotonic()
|
||||
with _fuzzy_cache_lock:
|
||||
cached = _fuzzy_cache.get(root)
|
||||
if cached and now - cached[0] < _FUZZY_CACHE_TTL_S:
|
||||
return cached[1]
|
||||
|
||||
files: list[str] = []
|
||||
try:
|
||||
top_result = subprocess.run(
|
||||
["git", "-C", root, "rev-parse", "--show-toplevel"],
|
||||
capture_output=True,
|
||||
timeout=2.0,
|
||||
check=False,
|
||||
)
|
||||
if top_result.returncode == 0:
|
||||
top = top_result.stdout.decode("utf-8", "replace").strip()
|
||||
list_result = subprocess.run(
|
||||
["git", "-C", top, "ls-files", "-z", "--cached", "--others", "--exclude-standard"],
|
||||
capture_output=True,
|
||||
timeout=2.0,
|
||||
check=False,
|
||||
)
|
||||
if list_result.returncode == 0:
|
||||
for p in list_result.stdout.decode("utf-8", "replace").split("\0"):
|
||||
if not p:
|
||||
continue
|
||||
rel = os.path.relpath(os.path.join(top, p), root).replace(os.sep, "/")
|
||||
# Skip parents/siblings of cwd — keep the picker scoped
|
||||
# to root-and-below, matching Cmd-P workspace semantics.
|
||||
if rel.startswith("../"):
|
||||
continue
|
||||
files.append(rel)
|
||||
if len(files) >= _FUZZY_CACHE_MAX_FILES:
|
||||
break
|
||||
except (OSError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
|
||||
if not files:
|
||||
# Fallback walk: skip vendor/build dirs + dot-dirs so the walk stays
|
||||
# tractable. Dotfiles themselves survive — the ranker decides based
|
||||
# on whether the query starts with `.`.
|
||||
try:
|
||||
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
|
||||
dirnames[:] = [
|
||||
d
|
||||
for d in dirnames
|
||||
if d not in _FUZZY_FALLBACK_EXCLUDES and not d.startswith(".")
|
||||
]
|
||||
rel_dir = os.path.relpath(dirpath, root)
|
||||
for f in filenames:
|
||||
rel = f if rel_dir == "." else f"{rel_dir}/{f}"
|
||||
files.append(rel.replace(os.sep, "/"))
|
||||
if len(files) >= _FUZZY_CACHE_MAX_FILES:
|
||||
break
|
||||
if len(files) >= _FUZZY_CACHE_MAX_FILES:
|
||||
break
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
with _fuzzy_cache_lock:
|
||||
_fuzzy_cache[root] = (now, files)
|
||||
|
||||
return files
|
||||
|
||||
|
||||
def _fuzzy_basename_rank(name: str, query: str) -> tuple[int, int] | None:
|
||||
"""Rank ``name`` against ``query``; lower is better. Returns None to reject.
|
||||
|
||||
Tiers (kind):
|
||||
0 — exact basename
|
||||
1 — basename prefix (e.g. `app` → `appChrome.tsx`)
|
||||
2 — word-boundary / camelCase hit (e.g. `chrome` → `appChrome.tsx`)
|
||||
3 — substring anywhere in basename
|
||||
4 — subsequence match (every query char appears in order)
|
||||
|
||||
Secondary key is `len(name)` so shorter names win ties.
|
||||
"""
|
||||
if not query:
|
||||
return (3, len(name))
|
||||
|
||||
nl = name.lower()
|
||||
ql = query.lower()
|
||||
|
||||
if nl == ql:
|
||||
return (0, len(name))
|
||||
|
||||
if nl.startswith(ql):
|
||||
return (1, len(name))
|
||||
|
||||
# Word-boundary split: `foo-bar_baz.qux` → ["foo","bar","baz","qux"].
|
||||
# camelCase split: `appChrome` → ["app","Chrome"]. Cheap approximation;
|
||||
# falls through to substring/subsequence if it misses.
|
||||
parts: list[str] = []
|
||||
buf = ""
|
||||
for ch in name:
|
||||
if ch in "-_." or (ch.isupper() and buf and not buf[-1].isupper()):
|
||||
if buf:
|
||||
parts.append(buf)
|
||||
buf = ch if ch not in "-_." else ""
|
||||
else:
|
||||
buf += ch
|
||||
if buf:
|
||||
parts.append(buf)
|
||||
for p in parts:
|
||||
if p.lower().startswith(ql):
|
||||
return (2, len(name))
|
||||
|
||||
if ql in nl:
|
||||
return (3, len(name))
|
||||
|
||||
i = 0
|
||||
for ch in nl:
|
||||
if ch == ql[i]:
|
||||
i += 1
|
||||
if i == len(ql):
|
||||
return (4, len(name))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@method("complete.path")
|
||||
def _(rid, params: dict) -> dict:
|
||||
@@ -3186,6 +3447,42 @@ def _(rid, params: dict) -> dict:
|
||||
prefix_tag = ""
|
||||
path_part = query if is_context else query
|
||||
|
||||
# Fuzzy basename search across the repo when the user types a bare
|
||||
# name with no path separator — `@appChrome` surfaces every file
|
||||
# whose basename matches, regardless of directory depth. Matches what
|
||||
# editors like Cursor / VS Code do for Cmd-P. Path-ish queries (with
|
||||
# `/`, `./`, `~/`, `/abs`) fall through to the directory-listing
|
||||
# path so explicit navigation intent is preserved.
|
||||
if (
|
||||
is_context
|
||||
and path_part
|
||||
and "/" not in path_part
|
||||
and prefix_tag != "folder"
|
||||
):
|
||||
root = os.getcwd()
|
||||
ranked: list[tuple[tuple[int, int], str, str]] = []
|
||||
for rel in _list_repo_files(root):
|
||||
basename = os.path.basename(rel)
|
||||
if basename.startswith(".") and not path_part.startswith("."):
|
||||
continue
|
||||
rank = _fuzzy_basename_rank(basename, path_part)
|
||||
if rank is None:
|
||||
continue
|
||||
ranked.append((rank, rel, basename))
|
||||
|
||||
ranked.sort(key=lambda r: (r[0], len(r[1]), r[1]))
|
||||
tag = prefix_tag or "file"
|
||||
for _, rel, basename in ranked[:30]:
|
||||
items.append(
|
||||
{
|
||||
"text": f"@{tag}:{rel}",
|
||||
"display": basename,
|
||||
"meta": os.path.dirname(rel),
|
||||
}
|
||||
)
|
||||
|
||||
return _ok(rid, {"items": items})
|
||||
|
||||
expanded = _normalize_completion_path(path_part) if path_part else "."
|
||||
if expanded == "." or not expanded:
|
||||
search_dir, match = ".", ""
|
||||
@@ -3455,43 +3752,155 @@ def _(rid, params: dict) -> dict:
|
||||
# ── Methods: voice ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
_voice_sid_lock = threading.Lock()
|
||||
_voice_event_sid: str = ""
|
||||
|
||||
|
||||
def _voice_emit(event: str, payload: dict | None = None) -> None:
|
||||
"""Emit a voice event toward the session that most recently turned the
|
||||
mode on. Voice is process-global (one microphone), so there's only ever
|
||||
one sid to target; the TUI handler treats an empty sid as "active
|
||||
session". Kept separate from _emit to make the lack of per-call sid
|
||||
argument explicit."""
|
||||
with _voice_sid_lock:
|
||||
sid = _voice_event_sid
|
||||
_emit(event, sid, payload)
|
||||
|
||||
|
||||
def _voice_mode_enabled() -> bool:
|
||||
"""Current voice-mode flag (runtime-only, CLI parity).
|
||||
|
||||
cli.py initialises ``_voice_mode = False`` at startup and only flips
|
||||
it via ``/voice on``; it never reads a persisted enable bit from
|
||||
config.yaml. We match that: no config lookup, env var only. This
|
||||
avoids the TUI auto-starting in REC the next time the user opens it
|
||||
just because they happened to enable voice in a prior session.
|
||||
"""
|
||||
return os.environ.get("HERMES_VOICE", "").strip() == "1"
|
||||
|
||||
|
||||
def _voice_tts_enabled() -> bool:
|
||||
"""Whether agent replies should be spoken back via TTS (runtime only)."""
|
||||
return os.environ.get("HERMES_VOICE_TTS", "").strip() == "1"
|
||||
|
||||
|
||||
@method("voice.toggle")
|
||||
def _(rid, params: dict) -> dict:
|
||||
"""CLI parity for the ``/voice`` slash command.
|
||||
|
||||
Subcommands:
|
||||
|
||||
* ``status`` — report mode + TTS flags (default when action is unknown).
|
||||
* ``on`` / ``off`` — flip voice *mode* (the umbrella bit). Turning it
|
||||
off also tears down any active continuous recording loop. Does NOT
|
||||
start recording on its own; recording is driven by ``voice.record``
|
||||
(Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
|
||||
* ``tts`` — toggle speech-output of agent replies. Requires mode on
|
||||
(mirrors CLI's _toggle_voice_tts guard).
|
||||
"""
|
||||
action = params.get("action", "status")
|
||||
|
||||
if action == "status":
|
||||
env = os.environ.get("HERMES_VOICE", "").strip()
|
||||
if env in {"0", "1"}:
|
||||
return _ok(rid, {"enabled": env == "1"})
|
||||
return _ok(
|
||||
rid,
|
||||
{
|
||||
"enabled": bool(
|
||||
_load_cfg().get("display", {}).get("voice_enabled", False)
|
||||
)
|
||||
},
|
||||
)
|
||||
# Mirror CLI's _show_voice_status: include STT/TTS provider
|
||||
# availability so the user can tell at a glance *why* voice mode
|
||||
# isn't working ("STT provider: MISSING ..." is the common case).
|
||||
payload: dict = {
|
||||
"enabled": _voice_mode_enabled(),
|
||||
"tts": _voice_tts_enabled(),
|
||||
}
|
||||
try:
|
||||
from tools.voice_mode import check_voice_requirements
|
||||
|
||||
reqs = check_voice_requirements()
|
||||
payload["available"] = bool(reqs.get("available"))
|
||||
payload["audio_available"] = bool(reqs.get("audio_available"))
|
||||
payload["stt_available"] = bool(reqs.get("stt_available"))
|
||||
payload["details"] = reqs.get("details") or ""
|
||||
except Exception as e:
|
||||
# check_voice_requirements pulls optional transcription deps —
|
||||
# swallow so /voice status always returns something useful.
|
||||
logger.warning("voice.toggle status: requirements probe failed: %s", e)
|
||||
|
||||
return _ok(rid, payload)
|
||||
|
||||
if action in ("on", "off"):
|
||||
enabled = action == "on"
|
||||
# Runtime-only flag (CLI parity) — no _write_config_key, so the
|
||||
# next TUI launch starts with voice OFF instead of auto-REC from a
|
||||
# persisted stale toggle.
|
||||
os.environ["HERMES_VOICE"] = "1" if enabled else "0"
|
||||
_write_config_key("display.voice_enabled", enabled)
|
||||
return _ok(rid, {"enabled": action == "on"})
|
||||
|
||||
if not enabled:
|
||||
# Disabling the mode must tear the continuous loop down; the
|
||||
# loop holds the microphone and would otherwise keep running.
|
||||
try:
|
||||
from hermes_cli.voice import stop_continuous
|
||||
|
||||
stop_continuous()
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.warning("voice: stop_continuous failed during toggle off: %s", e)
|
||||
|
||||
return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})
|
||||
|
||||
if action == "tts":
|
||||
if not _voice_mode_enabled():
|
||||
return _err(rid, 4014, "enable voice mode first: /voice on")
|
||||
new_value = not _voice_tts_enabled()
|
||||
# Runtime-only flag (CLI parity) — see voice.toggle on/off above.
|
||||
os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
|
||||
return _ok(rid, {"enabled": True, "tts": new_value})
|
||||
|
||||
return _err(rid, 4013, f"unknown voice action: {action}")
|
||||
|
||||
|
||||
@method("voice.record")
|
||||
def _(rid, params: dict) -> dict:
|
||||
"""VAD-driven continuous record loop, CLI-parity.
|
||||
|
||||
``start`` turns on a VAD loop that emits ``voice.transcript`` events
|
||||
for each detected utterance and auto-restarts for the next turn.
|
||||
``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
|
||||
recording branch clearing ``_voice_continuous``). Three consecutive
|
||||
silent cycles stop the loop automatically and emit a
|
||||
``voice.transcript`` with ``no_speech_limit=True``.
|
||||
"""
|
||||
action = params.get("action", "start")
|
||||
|
||||
if action not in {"start", "stop"}:
|
||||
return _err(rid, 4019, f"unknown voice action: {action}")
|
||||
|
||||
try:
|
||||
if action == "start":
|
||||
from hermes_cli.voice import start_recording
|
||||
if not _voice_mode_enabled():
|
||||
return _err(rid, 4015, "voice mode is off — enable with /voice on")
|
||||
|
||||
start_recording()
|
||||
with _voice_sid_lock:
|
||||
global _voice_event_sid
|
||||
_voice_event_sid = params.get("session_id") or _voice_event_sid
|
||||
|
||||
from hermes_cli.voice import start_continuous
|
||||
|
||||
voice_cfg = _load_cfg().get("voice", {})
|
||||
start_continuous(
|
||||
on_transcript=lambda t: _voice_emit(
|
||||
"voice.transcript", {"text": t}
|
||||
),
|
||||
on_status=lambda s: _voice_emit("voice.status", {"state": s}),
|
||||
on_silent_limit=lambda: _voice_emit(
|
||||
"voice.transcript", {"no_speech_limit": True}
|
||||
),
|
||||
silence_threshold=voice_cfg.get("silence_threshold", 200),
|
||||
silence_duration=voice_cfg.get("silence_duration", 3.0),
|
||||
)
|
||||
return _ok(rid, {"status": "recording"})
|
||||
if action == "stop":
|
||||
from hermes_cli.voice import stop_and_transcribe
|
||||
|
||||
return _ok(rid, {"text": stop_and_transcribe() or ""})
|
||||
return _err(rid, 4019, f"unknown voice action: {action}")
|
||||
# action == "stop"
|
||||
from hermes_cli.voice import stop_continuous
|
||||
|
||||
stop_continuous()
|
||||
return _ok(rid, {"status": "stopped"})
|
||||
except ImportError:
|
||||
return _err(
|
||||
rid, 5025, "voice module not available — install audio dependencies"
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { shouldUseAnsiDim } from './Text.js'
|
||||
|
||||
describe('shouldUseAnsiDim', () => {
|
||||
it('disables ANSI dim on VTE terminals by default', () => {
|
||||
expect(shouldUseAnsiDim({ VTE_VERSION: '7603' } as NodeJS.ProcessEnv)).toBe(false)
|
||||
})
|
||||
|
||||
it('keeps ANSI dim enabled elsewhere by default', () => {
|
||||
expect(shouldUseAnsiDim({ TERM: 'xterm-256color' } as NodeJS.ProcessEnv)).toBe(true)
|
||||
})
|
||||
|
||||
it('honors explicit env override', () => {
|
||||
expect(shouldUseAnsiDim({ HERMES_TUI_DIM: '1', VTE_VERSION: '7603' } as NodeJS.ProcessEnv)).toBe(true)
|
||||
expect(shouldUseAnsiDim({ HERMES_TUI_DIM: '0' } as NodeJS.ProcessEnv)).toBe(false)
|
||||
})
|
||||
})
|
||||
@@ -3,6 +3,9 @@ import React from 'react'
|
||||
import { c as _c } from 'react/compiler-runtime'
|
||||
|
||||
import type { Color, Styles } from '../styles.js'
|
||||
|
||||
const ENV_ON_RE = /^(?:1|true|yes|on)$/i
|
||||
const ENV_OFF_RE = /^(?:0|false|no|off)$/i
|
||||
type BaseProps = {
|
||||
/**
|
||||
* Change text color. Accepts a raw color value (rgb, hex, ansi).
|
||||
@@ -62,6 +65,20 @@ type WeightProps =
|
||||
}
|
||||
export type Props = BaseProps & WeightProps
|
||||
|
||||
export function shouldUseAnsiDim(env: NodeJS.ProcessEnv = process.env): boolean {
|
||||
const override = (env.HERMES_TUI_DIM ?? '').trim()
|
||||
|
||||
if (ENV_ON_RE.test(override)) {
|
||||
return true
|
||||
}
|
||||
|
||||
if (ENV_OFF_RE.test(override)) {
|
||||
return false
|
||||
}
|
||||
|
||||
return !env.VTE_VERSION
|
||||
}
|
||||
|
||||
const memoizedStylesForWrap: Record<NonNullable<Styles['textWrap']>, Styles> = {
|
||||
wrap: {
|
||||
flexGrow: 0,
|
||||
@@ -143,6 +160,7 @@ export default function Text(t0: Props) {
|
||||
const strikethrough = t3 === undefined ? false : t3
|
||||
const inverse = t4 === undefined ? false : t4
|
||||
const wrap = t5 === undefined ? 'wrap' : t5
|
||||
const effectiveDim = dim && shouldUseAnsiDim()
|
||||
|
||||
if (children === undefined || children === null) {
|
||||
return null
|
||||
@@ -174,11 +192,11 @@ export default function Text(t0: Props) {
|
||||
|
||||
let t8
|
||||
|
||||
if ($[4] !== dim) {
|
||||
t8 = dim && {
|
||||
dim
|
||||
if ($[4] !== effectiveDim) {
|
||||
t8 = effectiveDim && {
|
||||
dim: effectiveDim
|
||||
}
|
||||
$[4] = dim
|
||||
$[4] = effectiveDim
|
||||
$[5] = t8
|
||||
} else {
|
||||
t8 = $[5]
|
||||
|
||||
@@ -83,6 +83,10 @@ export type DOMElement = {
|
||||
// Only set on ink-root. The document owns focus — any node can
|
||||
// reach it by walking parentNode, like browser getRootNode().
|
||||
focusManager?: FocusManager
|
||||
// Measurement cache for ink-text nodes: avoids re-squashing and re-wrapping
|
||||
// text when yoga calls measureFunc multiple times per frame with different
|
||||
// widths during flex re-pass. Keyed by `${width}|${widthMode}`.
|
||||
_textMeasureCache?: { gen: number; entries: Map<string, { _gen: number; result: { width: number; height: number } }> }
|
||||
} & InkNode
|
||||
|
||||
export type TextNode = {
|
||||
@@ -311,10 +315,42 @@ export const createTextNode = (text: string): TextNode => {
|
||||
return node
|
||||
}
|
||||
|
||||
const MEASURE_CACHE_CAP = 16
|
||||
|
||||
const measureTextNode = function (
|
||||
node: DOMNode,
|
||||
width: number,
|
||||
widthMode: LayoutMeasureMode
|
||||
): { width: number; height: number } {
|
||||
const elem = node.nodeName !== '#text' ? (node as DOMElement) : node.parentNode
|
||||
if (elem && elem.nodeName === 'ink-text') {
|
||||
let cache = elem._textMeasureCache
|
||||
if (!cache) {
|
||||
cache = { gen: 0, entries: new Map() }
|
||||
elem._textMeasureCache = cache
|
||||
}
|
||||
const key = `${width}|${widthMode}`
|
||||
const hit = cache.entries.get(key)
|
||||
if (hit && hit._gen === cache.gen) {
|
||||
return hit.result
|
||||
}
|
||||
const result = computeTextMeasure(node, width, widthMode)
|
||||
// Enforce cap with FIFO eviction to avoid unbounded growth during
|
||||
// pathological frames where yoga probes many widths.
|
||||
if (cache.entries.size >= MEASURE_CACHE_CAP) {
|
||||
const firstKey = cache.entries.keys().next().value
|
||||
cache.entries.delete(firstKey)
|
||||
}
|
||||
cache.entries.set(key, { _gen: cache.gen, result })
|
||||
return result
|
||||
}
|
||||
return computeTextMeasure(node, width, widthMode)
|
||||
}
|
||||
|
||||
const computeTextMeasure = function (
|
||||
node: DOMNode,
|
||||
width: number,
|
||||
widthMode: LayoutMeasureMode
|
||||
): { width: number; height: number } {
|
||||
const rawText = node.nodeName === '#text' ? node.nodeValue : squashTextNodes(node)
|
||||
|
||||
@@ -378,13 +414,19 @@ export const markDirty = (node?: DOMNode): void => {
|
||||
|
||||
while (current) {
|
||||
if (current.nodeName !== '#text') {
|
||||
;(current as DOMElement).dirty = true
|
||||
const elem = current as DOMElement
|
||||
elem.dirty = true
|
||||
|
||||
// Only mark yoga dirty on leaf nodes that have measure functions
|
||||
if (!markedYoga && (current.nodeName === 'ink-text' || current.nodeName === 'ink-raw-ansi') && current.yogaNode) {
|
||||
current.yogaNode.markDirty()
|
||||
if (!markedYoga && (elem.nodeName === 'ink-text' || elem.nodeName === 'ink-raw-ansi') && elem.yogaNode) {
|
||||
elem.yogaNode.markDirty()
|
||||
markedYoga = true
|
||||
}
|
||||
|
||||
// Invalidate text measurement cache — child text or style changed.
|
||||
if (elem._textMeasureCache) {
|
||||
elem._textMeasureCache.gen++
|
||||
}
|
||||
}
|
||||
|
||||
current = current.parentNode
|
||||
@@ -433,6 +475,7 @@ export const clearYogaNodeReferences = (node: DOMElement | TextNode): void => {
|
||||
for (const child of node.childNodes) {
|
||||
clearYogaNodeReferences(child)
|
||||
}
|
||||
node._textMeasureCache = undefined
|
||||
}
|
||||
|
||||
node.yogaNode = undefined
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { shouldEmitClipboardSequence } from './osc.js'
|
||||
|
||||
describe('shouldEmitClipboardSequence', () => {
|
||||
it('suppresses local multiplexer clipboard OSC by default', () => {
|
||||
expect(shouldEmitClipboardSequence({ TMUX: '/tmp/tmux-1/default,1,0' } as NodeJS.ProcessEnv)).toBe(false)
|
||||
expect(shouldEmitClipboardSequence({ STY: '1234.pts-0.host' } as NodeJS.ProcessEnv)).toBe(false)
|
||||
})
|
||||
|
||||
it('keeps OSC enabled for remote or plain local terminals', () => {
|
||||
expect(shouldEmitClipboardSequence({ SSH_CONNECTION: '1', TMUX: '/tmp/tmux-1/default,1,0' } as NodeJS.ProcessEnv)).toBe(
|
||||
true
|
||||
)
|
||||
expect(shouldEmitClipboardSequence({ TERM: 'xterm-256color' } as NodeJS.ProcessEnv)).toBe(true)
|
||||
})
|
||||
|
||||
it('honors explicit env override', () => {
|
||||
expect(shouldEmitClipboardSequence({ HERMES_TUI_CLIPBOARD_OSC52: '1', TMUX: '/tmp/tmux-1/default,1,0' } as NodeJS.ProcessEnv)).toBe(
|
||||
true
|
||||
)
|
||||
expect(shouldEmitClipboardSequence({ HERMES_TUI_COPY_OSC52: '0', TERM: 'xterm-256color' } as NodeJS.ProcessEnv)).toBe(
|
||||
false
|
||||
)
|
||||
})
|
||||
})
|
||||
@@ -11,6 +11,8 @@ import { BEL, ESC, ESC_TYPE, SEP } from './ansi.js'
|
||||
import type { Action, Color, TabStatusAction } from './types.js'
|
||||
|
||||
export const OSC_PREFIX = ESC + String.fromCharCode(ESC_TYPE.OSC)
|
||||
const ENV_ON_RE = /^(?:1|true|yes|on)$/i
|
||||
const ENV_OFF_RE = /^(?:0|false|no|off)$/i
|
||||
|
||||
/** String Terminator (ESC \) - alternative to BEL for terminating OSC */
|
||||
export const ST = ESC + '\\'
|
||||
@@ -81,6 +83,20 @@ export function getClipboardPath(): ClipboardPath {
|
||||
return 'osc52'
|
||||
}
|
||||
|
||||
export function shouldEmitClipboardSequence(env: NodeJS.ProcessEnv = process.env): boolean {
|
||||
const override = (env.HERMES_TUI_CLIPBOARD_OSC52 ?? env.HERMES_TUI_COPY_OSC52 ?? '').trim()
|
||||
|
||||
if (ENV_ON_RE.test(override)) {
|
||||
return true
|
||||
}
|
||||
|
||||
if (ENV_OFF_RE.test(override)) {
|
||||
return false
|
||||
}
|
||||
|
||||
return !!env['SSH_CONNECTION'] || (!env['TMUX'] && !env['STY'])
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrap a payload in tmux's DCS passthrough: ESC P tmux ; <payload> ESC \
|
||||
* tmux forwards the payload to the outer terminal, bypassing its own parser.
|
||||
@@ -152,6 +168,7 @@ export async function tmuxLoadBuffer(text: string): Promise<boolean> {
|
||||
export async function setClipboard(text: string): Promise<string> {
|
||||
const b64 = Buffer.from(text, 'utf8').toString('base64')
|
||||
const raw = osc(OSC.CLIPBOARD, 'c', b64)
|
||||
const emitSequence = shouldEmitClipboardSequence(process.env)
|
||||
|
||||
// Native safety net — fire FIRST, before the tmux await, so a quick
|
||||
// focus-switch after selecting doesn't race pbcopy. Previously this ran
|
||||
@@ -170,10 +187,10 @@ export async function setClipboard(text: string): Promise<string> {
|
||||
// Inner OSC uses BEL directly (not osc()) — ST's ESC would need doubling
|
||||
// too, and BEL works everywhere for OSC 52.
|
||||
if (tmuxBufferLoaded) {
|
||||
return tmuxPassthrough(`${ESC}]52;c;${b64}${BEL}`)
|
||||
return emitSequence ? tmuxPassthrough(`${ESC}]52;c;${b64}${BEL}`) : ''
|
||||
}
|
||||
|
||||
return raw
|
||||
return emitSequence ? raw : ''
|
||||
}
|
||||
|
||||
// Linux clipboard tool: undefined = not yet probed, null = none available.
|
||||
|
||||
@@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
|
||||
composer: {
|
||||
dequeue: () => undefined,
|
||||
queueEditRef: ref<null | number>(null),
|
||||
sendQueued: vi.fn()
|
||||
sendQueued: vi.fn(),
|
||||
setInput: vi.fn()
|
||||
},
|
||||
gateway: {
|
||||
gw: { request: vi.fn() },
|
||||
@@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
|
||||
resumeById: vi.fn(),
|
||||
setCatalog: vi.fn()
|
||||
},
|
||||
submission: {
|
||||
submitRef: { current: vi.fn() }
|
||||
},
|
||||
system: {
|
||||
bellOnComplete: false,
|
||||
sys: vi.fn()
|
||||
@@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
|
||||
panel: (title: string, sections: any[]) =>
|
||||
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
|
||||
setHistoryItems: vi.fn()
|
||||
},
|
||||
voice: {
|
||||
setProcessing: vi.fn(),
|
||||
setRecording: vi.fn(),
|
||||
setVoiceEnabled: vi.fn()
|
||||
}
|
||||
}) as any
|
||||
|
||||
@@ -143,91 +152,79 @@ describe('createGatewayEventHandler', () => {
|
||||
expect(appended[0]?.thinkingTokens).toBe(estimateTokensRough(fromServer))
|
||||
})
|
||||
|
||||
it('attaches inline_diff to the assistant completion body', () => {
|
||||
it('anchors inline_diff as its own segment where the edit happened', () => {
|
||||
const appended: Msg[] = []
|
||||
const onEvent = createGatewayEventHandler(buildCtx(appended))
|
||||
const diff = '\u001b[31m--- a/foo.ts\u001b[0m\n\u001b[32m+++ b/foo.ts\u001b[0m\n@@\n-old\n+new'
|
||||
const cleaned = '--- a/foo.ts\n+++ b/foo.ts\n@@\n-old\n+new'
|
||||
const block = `\`\`\`diff\n${cleaned}\n\`\`\``
|
||||
|
||||
onEvent({
|
||||
payload: { context: 'foo.ts', name: 'patch', tool_id: 'tool-1' },
|
||||
type: 'tool.start'
|
||||
} as any)
|
||||
onEvent({
|
||||
payload: { inline_diff: diff, summary: 'patched', tool_id: 'tool-1' },
|
||||
type: 'tool.complete'
|
||||
} as any)
|
||||
// Narration → tool → tool-complete → more narration → message-complete.
|
||||
// The diff MUST land between the two narration segments, not tacked
|
||||
// onto the final one.
|
||||
onEvent({ payload: { text: 'Editing the file' }, type: 'message.delta' } as any)
|
||||
onEvent({ payload: { context: 'foo.ts', name: 'patch', tool_id: 'tool-1' }, type: 'tool.start' } as any)
|
||||
onEvent({ payload: { inline_diff: diff, summary: 'patched', tool_id: 'tool-1' }, type: 'tool.complete' } as any)
|
||||
|
||||
// Diff is buffered for message.complete and sanitized (ANSI stripped).
|
||||
// Diff is already committed to segmentMessages as its own segment.
|
||||
expect(appended).toHaveLength(0)
|
||||
expect(turnController.pendingInlineDiffs).toEqual([cleaned])
|
||||
expect(turnController.segmentMessages).toEqual([
|
||||
{ role: 'assistant', text: 'Editing the file' },
|
||||
{ kind: 'diff', role: 'assistant', text: block }
|
||||
])
|
||||
|
||||
onEvent({
|
||||
payload: { text: 'patch applied' },
|
||||
type: 'message.complete'
|
||||
} as any)
|
||||
onEvent({ payload: { text: 'patch applied' }, type: 'message.complete' } as any)
|
||||
|
||||
// Diff is rendered in the same assistant message body as the completion.
|
||||
expect(appended).toHaveLength(1)
|
||||
expect(appended[0]).toMatchObject({ role: 'assistant' })
|
||||
expect(appended[0]?.text).toContain('patch applied')
|
||||
expect(appended[0]?.text).toContain('```diff')
|
||||
expect(appended[0]?.text).toContain(cleaned)
|
||||
// Three transcript messages: pre-tool narration → diff (kind='diff',
|
||||
// so MessageLine gives it blank-line breathing room) → post-tool
|
||||
// narration. The final message does NOT contain a diff.
|
||||
expect(appended).toHaveLength(3)
|
||||
expect(appended[0]?.text).toBe('Editing the file')
|
||||
expect(appended[1]).toMatchObject({ kind: 'diff', text: block })
|
||||
expect(appended[2]?.text).toBe('patch applied')
|
||||
expect(appended[2]?.text).not.toContain('```diff')
|
||||
})
|
||||
|
||||
it('does not append inline_diff twice when assistant text already contains it', () => {
|
||||
it('drops the diff segment when the final assistant text narrates the same diff', () => {
|
||||
const appended: Msg[] = []
|
||||
const onEvent = createGatewayEventHandler(buildCtx(appended))
|
||||
const cleaned = '--- a/foo.ts\n+++ b/foo.ts\n@@\n-old\n+new'
|
||||
const assistantText = `Done. Here's the inline diff:\n\n\`\`\`diff\n${cleaned}\n\`\`\``
|
||||
|
||||
onEvent({
|
||||
payload: { inline_diff: cleaned, summary: 'patched', tool_id: 'tool-1' },
|
||||
type: 'tool.complete'
|
||||
} as any)
|
||||
onEvent({
|
||||
payload: { text: assistantText },
|
||||
type: 'message.complete'
|
||||
} as any)
|
||||
onEvent({ payload: { inline_diff: cleaned, summary: 'patched', tool_id: 'tool-1' }, type: 'tool.complete' } as any)
|
||||
onEvent({ payload: { text: assistantText }, type: 'message.complete' } as any)
|
||||
|
||||
// Only the final message — diff-only segment dropped so we don't
|
||||
// render two stacked copies of the same patch.
|
||||
expect(appended).toHaveLength(1)
|
||||
expect(appended[0]?.text).toBe(assistantText)
|
||||
expect((appended[0]?.text.match(/```diff/g) ?? []).length).toBe(1)
|
||||
})
|
||||
|
||||
it('strips the CLI "┊ review diff" header from queued inline diffs', () => {
|
||||
it('strips the CLI "┊ review diff" header from inline diff segments', () => {
|
||||
const appended: Msg[] = []
|
||||
const onEvent = createGatewayEventHandler(buildCtx(appended))
|
||||
const raw = ' \u001b[33m┊ review diff\u001b[0m\n--- a/foo.ts\n+++ b/foo.ts\n@@\n-old\n+new'
|
||||
|
||||
onEvent({
|
||||
payload: { inline_diff: raw, summary: 'patched', tool_id: 'tool-1' },
|
||||
type: 'tool.complete'
|
||||
} as any)
|
||||
onEvent({
|
||||
payload: { text: 'done' },
|
||||
type: 'message.complete'
|
||||
} as any)
|
||||
onEvent({ payload: { inline_diff: raw, summary: 'patched', tool_id: 'tool-1' }, type: 'tool.complete' } as any)
|
||||
onEvent({ payload: { text: 'done' }, type: 'message.complete' } as any)
|
||||
|
||||
expect(appended).toHaveLength(1)
|
||||
// diff segment first (kind='diff'), final narration second
|
||||
expect(appended).toHaveLength(2)
|
||||
expect(appended[0]?.kind).toBe('diff')
|
||||
expect(appended[0]?.text).not.toContain('┊ review diff')
|
||||
expect(appended[0]?.text).toContain('--- a/foo.ts')
|
||||
expect(appended[1]?.text).toBe('done')
|
||||
})
|
||||
|
||||
it('suppresses inline_diff when assistant already wrote a diff fence', () => {
|
||||
it('drops the diff segment when assistant writes its own ```diff fence', () => {
|
||||
const appended: Msg[] = []
|
||||
const onEvent = createGatewayEventHandler(buildCtx(appended))
|
||||
const inlineDiff = '--- a/foo.ts\n+++ b/foo.ts\n@@\n-old\n+new'
|
||||
const assistantText = 'Done. Clean swap:\n\n```diff\n-old\n+new\n```'
|
||||
|
||||
onEvent({
|
||||
payload: { inline_diff: inlineDiff, summary: 'patched', tool_id: 'tool-1' },
|
||||
type: 'tool.complete'
|
||||
} as any)
|
||||
onEvent({
|
||||
payload: { text: assistantText },
|
||||
type: 'message.complete'
|
||||
} as any)
|
||||
onEvent({ payload: { inline_diff: inlineDiff, summary: 'patched', tool_id: 'tool-1' }, type: 'tool.complete' } as any)
|
||||
onEvent({ payload: { text: assistantText }, type: 'message.complete' } as any)
|
||||
|
||||
expect(appended).toHaveLength(1)
|
||||
expect(appended[0]?.text).toBe(assistantText)
|
||||
@@ -243,15 +240,18 @@ describe('createGatewayEventHandler', () => {
|
||||
payload: { inline_diff: diff, name: 'review_diff', summary: diff, tool_id: 'tool-1' },
|
||||
type: 'tool.complete'
|
||||
} as any)
|
||||
onEvent({
|
||||
payload: { text: 'done' },
|
||||
type: 'message.complete'
|
||||
} as any)
|
||||
onEvent({ payload: { text: 'done' }, type: 'message.complete' } as any)
|
||||
|
||||
expect(appended).toHaveLength(1)
|
||||
expect(appended[0]?.tools?.[0]).toContain('Review Diff')
|
||||
expect(appended[0]?.tools?.[0]).not.toContain('--- a/foo.ts')
|
||||
// Two segments: the diff block (kind='diff', no tool row) and the final
|
||||
// narration (tool row belongs here since pendingSegmentTools carries
|
||||
// across the flushStreamingSegment call).
|
||||
expect(appended).toHaveLength(2)
|
||||
expect(appended[0]?.kind).toBe('diff')
|
||||
expect(appended[0]?.text).toContain('```diff')
|
||||
expect(appended[0]?.tools ?? []).toEqual([])
|
||||
expect(appended[1]?.text).toBe('done')
|
||||
expect(appended[1]?.tools?.[0]).toContain('Review Diff')
|
||||
expect(appended[1]?.tools?.[0]).not.toContain('--- a/foo.ts')
|
||||
})
|
||||
|
||||
it('shows setup panel for missing provider startup error', () => {
|
||||
|
||||
@@ -31,6 +31,36 @@ describe('platform action modifier', () => {
|
||||
})
|
||||
})
|
||||
|
||||
describe('isVoiceToggleKey', () => {
|
||||
it('matches raw Ctrl+B on macOS (doc-default across platforms)', async () => {
|
||||
const { isVoiceToggleKey } = await importPlatform('darwin')
|
||||
|
||||
expect(isVoiceToggleKey({ ctrl: true, meta: false, super: false }, 'b')).toBe(true)
|
||||
expect(isVoiceToggleKey({ ctrl: true, meta: false, super: false }, 'B')).toBe(true)
|
||||
})
|
||||
|
||||
it('matches Cmd+B on macOS (preserve platform muscle memory)', async () => {
|
||||
const { isVoiceToggleKey } = await importPlatform('darwin')
|
||||
|
||||
expect(isVoiceToggleKey({ ctrl: false, meta: true, super: false }, 'b')).toBe(true)
|
||||
expect(isVoiceToggleKey({ ctrl: false, meta: false, super: true }, 'b')).toBe(true)
|
||||
})
|
||||
|
||||
it('matches Ctrl+B on non-macOS platforms', async () => {
|
||||
const { isVoiceToggleKey } = await importPlatform('linux')
|
||||
|
||||
expect(isVoiceToggleKey({ ctrl: true, meta: false, super: false }, 'b')).toBe(true)
|
||||
})
|
||||
|
||||
it('does not match unmodified b or other Ctrl combos', async () => {
|
||||
const { isVoiceToggleKey } = await importPlatform('darwin')
|
||||
|
||||
expect(isVoiceToggleKey({ ctrl: false, meta: false, super: false }, 'b')).toBe(false)
|
||||
expect(isVoiceToggleKey({ ctrl: true, meta: false, super: false }, 'a')).toBe(false)
|
||||
expect(isVoiceToggleKey({ ctrl: true, meta: false, super: false }, 'c')).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
describe('isMacActionFallback', () => {
|
||||
it('routes raw Ctrl+K and Ctrl+W to readline kill-to-end / delete-word on macOS', async () => {
|
||||
const { isMacActionFallback } = await importPlatform('darwin')
|
||||
|
||||
@@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
||||
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
|
||||
const { bellOnComplete, stdout, sys } = ctx.system
|
||||
const { appendMessage, panel, setHistoryItems } = ctx.transcript
|
||||
const { setInput } = ctx.composer
|
||||
const { submitRef } = ctx.submission
|
||||
const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
|
||||
|
||||
let pendingThinkingStatus = ''
|
||||
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
|
||||
@@ -261,6 +264,57 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
||||
return
|
||||
}
|
||||
|
||||
case 'voice.status': {
|
||||
// Continuous VAD loop reports its internal state so the status bar
|
||||
// can show listening / transcribing / idle without polling.
|
||||
const state = String(ev.payload?.state ?? '')
|
||||
|
||||
if (state === 'listening') {
|
||||
setVoiceRecording(true)
|
||||
setVoiceProcessing(false)
|
||||
} else if (state === 'transcribing') {
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(true)
|
||||
} else {
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(false)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
case 'voice.transcript': {
|
||||
// CLI parity: the 3-strikes silence detector flipped off automatically.
|
||||
// Mirror that on the UI side and tell the user why the mode is off.
|
||||
if (ev.payload?.no_speech_limit) {
|
||||
setVoiceEnabled(false)
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(false)
|
||||
sys('voice: no speech detected 3 times, continuous mode stopped')
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
const text = String(ev.payload?.text ?? '').trim()
|
||||
|
||||
if (!text) {
|
||||
return
|
||||
}
|
||||
|
||||
// CLI parity: _pending_input.put(transcript) unconditionally feeds
|
||||
// the transcript to the agent as its next turn — draft handling
|
||||
// doesn't apply because voice-mode users are speaking, not typing.
|
||||
//
|
||||
// We can't branch on composer input from inside a setInput updater
|
||||
// (React strict mode double-invokes it, duplicating the submit).
|
||||
// Just clear + defer submit so the cleared input is committed before
|
||||
// submit reads it.
|
||||
setInput('')
|
||||
setTimeout(() => submitRef.current(text), 0)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
case 'gateway.start_timeout': {
|
||||
const { cwd, python } = ev.payload ?? {}
|
||||
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
|
||||
@@ -331,10 +385,12 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
||||
return
|
||||
}
|
||||
|
||||
// Keep inline diffs attached to the assistant completion body so
|
||||
// they render in the same message flow, not as a standalone system
|
||||
// artifact that can look out-of-place around tool rows.
|
||||
turnController.queueInlineDiff(inlineDiffText)
|
||||
// Anchor the diff to where the edit happened in the turn — between
|
||||
// the narration that preceded the tool call and whatever the agent
|
||||
// streams afterwards. The previous end-merge put the diff at the
|
||||
// bottom of the final message even when the edit fired mid-turn,
|
||||
// which read as "the agent wrote this after saying that".
|
||||
turnController.pushInlineDiffSegment(inlineDiffText)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -189,9 +189,11 @@ export interface InputHandlerContext {
|
||||
stdout?: NodeJS.WriteStream
|
||||
}
|
||||
voice: {
|
||||
enabled: boolean
|
||||
recording: boolean
|
||||
setProcessing: StateSetter<boolean>
|
||||
setRecording: StateSetter<boolean>
|
||||
setVoiceEnabled: StateSetter<boolean>
|
||||
}
|
||||
wheelStep: number
|
||||
}
|
||||
@@ -201,6 +203,9 @@ export interface InputHandlerResult {
|
||||
}
|
||||
|
||||
export interface GatewayEventHandlerContext {
|
||||
composer: {
|
||||
setInput: StateSetter<string>
|
||||
}
|
||||
gateway: GatewayServices
|
||||
session: {
|
||||
STARTUP_RESUME_ID: string
|
||||
@@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
|
||||
resumeById: (id: string) => void
|
||||
setCatalog: StateSetter<null | SlashCatalog>
|
||||
}
|
||||
submission: {
|
||||
submitRef: MutableRefObject<(value: string) => void>
|
||||
}
|
||||
system: {
|
||||
bellOnComplete: boolean
|
||||
stdout?: NodeJS.WriteStream
|
||||
@@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
|
||||
panel: (title: string, sections: PanelSection[]) => void
|
||||
setHistoryItems: StateSetter<Msg[]>
|
||||
}
|
||||
voice: {
|
||||
setProcessing: StateSetter<boolean>
|
||||
setRecording: StateSetter<boolean>
|
||||
setVoiceEnabled: StateSetter<boolean>
|
||||
}
|
||||
}
|
||||
|
||||
export interface SlashHandlerContext {
|
||||
|
||||
@@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
|
||||
},
|
||||
|
||||
{
|
||||
help: 'toggle voice input',
|
||||
help: 'voice mode: [on|off|tts|status]',
|
||||
name: 'voice',
|
||||
run: (arg, ctx) => {
|
||||
const action = arg === 'on' || arg === 'off' ? arg : 'status'
|
||||
const normalized = (arg ?? '').trim().toLowerCase()
|
||||
|
||||
const action =
|
||||
normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
|
||||
? normalized
|
||||
: 'status'
|
||||
|
||||
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
|
||||
ctx.guarded<VoiceToggleResponse>(r => {
|
||||
ctx.voice.setVoiceEnabled(!!r.enabled)
|
||||
ctx.transcript.sys(`voice: ${r.enabled ? 'on' : 'off'}`)
|
||||
|
||||
// Match CLI's _show_voice_status / _enable_voice_mode /
|
||||
// _toggle_voice_tts output shape so users don't have to learn
|
||||
// two vocabularies.
|
||||
if (action === 'status') {
|
||||
const mode = r.enabled ? 'ON' : 'OFF'
|
||||
const tts = r.tts ? 'ON' : 'OFF'
|
||||
ctx.transcript.sys('Voice Mode Status')
|
||||
ctx.transcript.sys(` Mode: ${mode}`)
|
||||
ctx.transcript.sys(` TTS: ${tts}`)
|
||||
ctx.transcript.sys(' Record key: Ctrl+B')
|
||||
|
||||
// CLI's "Requirements:" block — surfaces STT/audio setup issues
|
||||
// so the user sees "STT provider: MISSING ..." instead of
|
||||
// silently failing on every Ctrl+B press.
|
||||
if (r.details) {
|
||||
ctx.transcript.sys('')
|
||||
ctx.transcript.sys(' Requirements:')
|
||||
|
||||
for (const line of r.details.split('\n')) {
|
||||
if (line.trim()) {
|
||||
ctx.transcript.sys(` ${line}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if (action === 'tts') {
|
||||
ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// on/off — mirror cli.py:_enable_voice_mode's 3-line output
|
||||
if (r.enabled) {
|
||||
const tts = r.tts ? ' (TTS enabled)' : ''
|
||||
ctx.transcript.sys(`Voice mode enabled${tts}`)
|
||||
ctx.transcript.sys(' Ctrl+B to start/stop recording')
|
||||
ctx.transcript.sys(' /voice tts to toggle speech output')
|
||||
ctx.transcript.sys(' /voice off to disable voice mode')
|
||||
} else {
|
||||
ctx.transcript.sys('Voice mode disabled.')
|
||||
}
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
@@ -19,6 +19,20 @@ const INTERRUPT_COOLDOWN_MS = 1500
|
||||
const ACTIVITY_LIMIT = 8
|
||||
const TRAIL_LIMIT = 8
|
||||
|
||||
// Extracts the raw patch from a diff-only segment produced by
|
||||
// pushInlineDiffSegment. Used at message.complete to dedupe against final
|
||||
// assistant text that narrates the same patch. Returns null for anything
|
||||
// else so real assistant narration never gets touched.
|
||||
const diffSegmentBody = (msg: Msg): null | string => {
|
||||
if (msg.kind !== 'diff') {
|
||||
return null
|
||||
}
|
||||
|
||||
const m = msg.text.match(/^```diff\n([\s\S]*?)\n```$/)
|
||||
|
||||
return m ? m[1]! : null
|
||||
}
|
||||
|
||||
export interface InterruptDeps {
|
||||
appendMessage: (msg: Msg) => void
|
||||
gw: { request: <T = unknown>(method: string, params?: Record<string, unknown>) => Promise<T> }
|
||||
@@ -40,7 +54,6 @@ class TurnController {
|
||||
bufRef = ''
|
||||
interrupted = false
|
||||
lastStatusNote = ''
|
||||
pendingInlineDiffs: string[] = []
|
||||
persistedToolLabels = new Set<string>()
|
||||
persistSpawnTree?: (subagents: SubagentProgress[], sessionId: null | string) => Promise<void>
|
||||
protocolWarned = false
|
||||
@@ -79,7 +92,6 @@ class TurnController {
|
||||
this.activeTools = []
|
||||
this.streamTimer = clear(this.streamTimer)
|
||||
this.bufRef = ''
|
||||
this.pendingInlineDiffs = []
|
||||
this.pendingSegmentTools = []
|
||||
this.segmentMessages = []
|
||||
|
||||
@@ -186,18 +198,35 @@ class TurnController {
|
||||
}, REASONING_PULSE_MS)
|
||||
}
|
||||
|
||||
queueInlineDiff(diffText: string) {
|
||||
pushInlineDiffSegment(diffText: string) {
|
||||
// Strip CLI chrome the gateway emits before the unified diff (e.g. a
|
||||
// leading "┊ review diff" header written by `_emit_inline_diff` for the
|
||||
// terminal printer). That header only makes sense as stdout dressing,
|
||||
// not inside a markdown ```diff block.
|
||||
const text = diffText.replace(/^\s*┊[^\n]*\n?/, '').trim()
|
||||
const stripped = diffText.replace(/^\s*┊[^\n]*\n?/, '').trim()
|
||||
|
||||
if (!text || this.pendingInlineDiffs.includes(text)) {
|
||||
if (!stripped) {
|
||||
return
|
||||
}
|
||||
|
||||
this.pendingInlineDiffs = [...this.pendingInlineDiffs, text]
|
||||
// Flush any in-progress streaming text as its own segment first, so the
|
||||
// diff lands BETWEEN the assistant narration that preceded the edit and
|
||||
// whatever the agent streams afterwards — not glued onto the final
|
||||
// message. This is the whole point of segment-anchored diffs: the diff
|
||||
// renders where the edit actually happened.
|
||||
this.flushStreamingSegment()
|
||||
|
||||
const block = `\`\`\`diff\n${stripped}\n\`\`\``
|
||||
|
||||
// Skip consecutive duplicates (same tool firing tool.complete twice, or
|
||||
// two edits producing the same patch). Keeping this cheap — deeper
|
||||
// dedupe against the final assistant text happens at message.complete.
|
||||
if (this.segmentMessages.at(-1)?.text === block) {
|
||||
return
|
||||
}
|
||||
|
||||
this.segmentMessages = [...this.segmentMessages, { kind: 'diff', role: 'assistant', text: block }]
|
||||
patchTurnState({ streamSegments: this.segmentMessages })
|
||||
}
|
||||
|
||||
pushActivity(text: string, tone: ActivityItem['tone'] = 'info', replaceLabel?: string) {
|
||||
@@ -234,7 +263,6 @@ class TurnController {
|
||||
this.idle()
|
||||
this.clearReasoning()
|
||||
this.clearStatusTimer()
|
||||
this.pendingInlineDiffs = []
|
||||
this.pendingSegmentTools = []
|
||||
this.segmentMessages = []
|
||||
this.turnTools = []
|
||||
@@ -245,31 +273,31 @@ class TurnController {
|
||||
const rawText = (payload.rendered ?? payload.text ?? this.bufRef).trimStart()
|
||||
const split = splitReasoning(rawText)
|
||||
const finalText = split.text
|
||||
// Skip appending if the assistant already narrated the diff inside a
|
||||
// markdown fence of its own — otherwise we render two stacked diff
|
||||
// blocks for the same edit.
|
||||
const assistantAlreadyHasDiff = /```(?:diff|patch)\b/i.test(finalText)
|
||||
|
||||
const remainingInlineDiffs = assistantAlreadyHasDiff
|
||||
? []
|
||||
: this.pendingInlineDiffs.filter(diff => !finalText.includes(diff))
|
||||
|
||||
const inlineDiffBlock = remainingInlineDiffs.length
|
||||
? `\`\`\`diff\n${remainingInlineDiffs.join('\n\n')}\n\`\`\``
|
||||
: ''
|
||||
|
||||
const mergedText = [finalText, inlineDiffBlock].filter(Boolean).join('\n\n')
|
||||
const existingReasoning = this.reasoningText.trim() || String(payload.reasoning ?? '').trim()
|
||||
const savedReasoning = [existingReasoning, existingReasoning ? '' : split.reasoning].filter(Boolean).join('\n\n')
|
||||
const savedReasoningTokens = savedReasoning ? estimateTokensRough(savedReasoning) : 0
|
||||
const savedToolTokens = this.toolTokenAcc
|
||||
const tools = this.pendingSegmentTools
|
||||
const finalMessages = [...this.segmentMessages]
|
||||
|
||||
if (mergedText) {
|
||||
// Drop diff-only segments the agent is about to narrate in the final
|
||||
// reply. Without this, a closing "here's the diff …" message would
|
||||
// render two stacked copies of the same patch. Only touches segments
|
||||
// with `kind: 'diff'` emitted by pushInlineDiffSegment — real
|
||||
// assistant narration stays put.
|
||||
const finalHasOwnDiffFence = /```(?:diff|patch)\b/i.test(finalText)
|
||||
|
||||
const segments = this.segmentMessages.filter(msg => {
|
||||
const body = diffSegmentBody(msg)
|
||||
|
||||
return body === null || (!finalHasOwnDiffFence && !finalText.includes(body))
|
||||
})
|
||||
|
||||
const finalMessages = [...segments]
|
||||
|
||||
if (finalText) {
|
||||
finalMessages.push({
|
||||
role: 'assistant',
|
||||
text: mergedText,
|
||||
text: finalText,
|
||||
thinking: savedReasoning || undefined,
|
||||
thinkingTokens: savedReasoning ? savedReasoningTokens : undefined,
|
||||
toolTokens: savedToolTokens || undefined,
|
||||
@@ -300,7 +328,7 @@ class TurnController {
|
||||
this.bufRef = ''
|
||||
patchTurnState({ activity: [], outcome: '' })
|
||||
|
||||
return { finalMessages, finalText: mergedText, wasInterrupted }
|
||||
return { finalMessages, finalText, wasInterrupted }
|
||||
}
|
||||
|
||||
recordMessageDelta({ rendered, text }: { rendered?: string; text?: string }) {
|
||||
@@ -406,7 +434,6 @@ class TurnController {
|
||||
this.bufRef = ''
|
||||
this.interrupted = false
|
||||
this.lastStatusNote = ''
|
||||
this.pendingInlineDiffs = []
|
||||
this.pendingSegmentTools = []
|
||||
this.protocolWarned = false
|
||||
this.segmentMessages = []
|
||||
@@ -452,7 +479,6 @@ class TurnController {
|
||||
this.endReasoningPhase()
|
||||
this.clearReasoning()
|
||||
this.activeTools = []
|
||||
this.pendingInlineDiffs = []
|
||||
this.turnTools = []
|
||||
this.toolTokenAcc = 0
|
||||
this.persistedToolLabels.clear()
|
||||
|
||||
@@ -8,7 +8,7 @@ import type {
|
||||
SudoRespondResponse,
|
||||
VoiceRecordResponse
|
||||
} from '../gatewayTypes.js'
|
||||
import { isAction, isMac } from '../lib/platform.js'
|
||||
import { isAction, isMac, isVoiceToggleKey } from '../lib/platform.js'
|
||||
|
||||
import { getInputSelection } from './inputSelectionStore.js'
|
||||
import type { InputHandlerContext, InputHandlerResult } from './interfaces.js'
|
||||
@@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
||||
}
|
||||
}
|
||||
|
||||
const voiceStop = () => {
|
||||
voice.setRecording(false)
|
||||
voice.setProcessing(true)
|
||||
// CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
|
||||
// (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
|
||||
// Ctrl+B while the mode is off sys-nudges the user. While the mode is
|
||||
// on, the first press starts a continuous loop (gateway → start_continuous,
|
||||
// VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
|
||||
// The gateway publishes voice.status + voice.transcript events that
|
||||
// createGatewayEventHandler turns into UI badges and composer injection.
|
||||
const voiceRecordToggle = () => {
|
||||
if (!voice.enabled) {
|
||||
return actions.sys('voice: mode is off — enable with /voice on')
|
||||
}
|
||||
|
||||
const starting = !voice.recording
|
||||
const action = starting ? 'start' : 'stop'
|
||||
|
||||
// Optimistic UI — flip the REC badge immediately so the user gets
|
||||
// feedback while the RPC round-trips; the voice.status event is the
|
||||
// authoritative source and may correct us.
|
||||
if (starting) {
|
||||
voice.setRecording(true)
|
||||
} else {
|
||||
voice.setRecording(false)
|
||||
voice.setProcessing(false)
|
||||
}
|
||||
|
||||
gateway
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
|
||||
.then(r => {
|
||||
if (!r) {
|
||||
return
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action })
|
||||
.catch((e: Error) => {
|
||||
// Revert optimistic UI on failure.
|
||||
if (starting) {
|
||||
voice.setRecording(false)
|
||||
}
|
||||
|
||||
const transcript = String(r.text || '').trim()
|
||||
|
||||
if (!transcript) {
|
||||
return actions.sys('voice: no speech detected')
|
||||
}
|
||||
|
||||
cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
|
||||
})
|
||||
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
|
||||
.finally(() => {
|
||||
voice.setProcessing(false)
|
||||
patchUiState({ status: 'ready' })
|
||||
actions.sys(`voice error: ${e.message}`)
|
||||
})
|
||||
}
|
||||
|
||||
const voiceStart = () =>
|
||||
gateway
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
|
||||
.then(r => {
|
||||
if (!r) {
|
||||
return
|
||||
}
|
||||
|
||||
voice.setRecording(true)
|
||||
patchUiState({ status: 'recording…' })
|
||||
})
|
||||
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
|
||||
|
||||
useInput((ch, key) => {
|
||||
const live = getUiState()
|
||||
|
||||
@@ -370,8 +368,8 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
||||
return actions.newSession()
|
||||
}
|
||||
|
||||
if (isAction(key, ch, 'b')) {
|
||||
return voice.recording ? voiceStop() : voiceStart()
|
||||
if (isVoiceToggleKey(key, ch)) {
|
||||
return voiceRecordToggle()
|
||||
}
|
||||
|
||||
if (isAction(key, ch, 'g')) {
|
||||
|
||||
@@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
|
||||
composer: { actions: composerActions, refs: composerRefs, state: composerState },
|
||||
gateway,
|
||||
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
|
||||
voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
|
||||
voice: {
|
||||
enabled: voiceEnabled,
|
||||
recording: voiceRecording,
|
||||
setProcessing: setVoiceProcessing,
|
||||
setRecording: setVoiceRecording,
|
||||
setVoiceEnabled
|
||||
},
|
||||
wheelStep: WHEEL_SCROLL_STEP
|
||||
})
|
||||
|
||||
const onEvent = useMemo(
|
||||
() =>
|
||||
createGatewayEventHandler({
|
||||
composer: { setInput: composerActions.setInput },
|
||||
gateway,
|
||||
session: {
|
||||
STARTUP_RESUME_ID,
|
||||
@@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
|
||||
resumeById: session.resumeById,
|
||||
setCatalog
|
||||
},
|
||||
submission: { submitRef },
|
||||
system: { bellOnComplete, stdout, sys },
|
||||
transcript: { appendMessage, panel, setHistoryItems }
|
||||
transcript: { appendMessage, panel, setHistoryItems },
|
||||
voice: {
|
||||
setProcessing: setVoiceProcessing,
|
||||
setRecording: setVoiceRecording,
|
||||
setVoiceEnabled
|
||||
}
|
||||
}),
|
||||
[
|
||||
appendMessage,
|
||||
bellOnComplete,
|
||||
composerActions.setInput,
|
||||
gateway,
|
||||
panel,
|
||||
session.newSession,
|
||||
session.resetSession,
|
||||
session.resumeById,
|
||||
setVoiceEnabled,
|
||||
setVoiceProcessing,
|
||||
setVoiceRecording,
|
||||
stdout,
|
||||
submitRef,
|
||||
sys
|
||||
]
|
||||
)
|
||||
@@ -698,7 +716,9 @@ export function useMainApp(gw: GatewayClient) {
|
||||
statusColor: statusColorOf(ui.status, ui.theme.color),
|
||||
stickyPrompt,
|
||||
turnStartedAt: ui.sid ? turnStartedAt : null,
|
||||
voiceLabel: voiceRecording ? 'REC' : voiceProcessing ? 'STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
|
||||
// CLI parity: the classic prompt_toolkit status bar shows a red dot
|
||||
// on REC (cli.py:_get_voice_status_fragments line 2344).
|
||||
voiceLabel: voiceRecording ? '● REC' : voiceProcessing ? '◉ STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
|
||||
}),
|
||||
[
|
||||
cwd,
|
||||
|
||||
@@ -215,7 +215,20 @@ export function StatusRule({
|
||||
</Text>
|
||||
) : null}
|
||||
<SpawnHud t={t} />
|
||||
{voiceLabel ? <Text color={t.color.dim}> │ {voiceLabel}</Text> : null}
|
||||
{voiceLabel ? (
|
||||
<Text
|
||||
color={
|
||||
voiceLabel.startsWith('●')
|
||||
? t.color.error
|
||||
: voiceLabel.startsWith('◉')
|
||||
? t.color.warn
|
||||
: t.color.dim
|
||||
}
|
||||
>
|
||||
{' │ '}
|
||||
{voiceLabel}
|
||||
</Text>
|
||||
) : null}
|
||||
{bgCount > 0 ? <Text color={t.color.dim}> │ {bgCount} bg</Text> : null}
|
||||
{showCost && typeof usage.cost_usd === 'number' ? (
|
||||
<Text color={t.color.dim}> │ ${usage.cost_usd.toFixed(4)}</Text>
|
||||
|
||||
@@ -185,56 +185,58 @@ const ComposerPane = memo(function ComposerPane({
|
||||
|
||||
<StatusRulePane at="top" composer={composer} status={status} />
|
||||
|
||||
{!isBlocked && (
|
||||
<Box flexDirection="column" marginTop={ui.statusBar === 'top' ? 0 : 1} position="relative">
|
||||
<FloatingOverlays
|
||||
cols={composer.cols}
|
||||
compIdx={composer.compIdx}
|
||||
completions={composer.completions}
|
||||
onModelSelect={actions.onModelSelect}
|
||||
onPickerSelect={actions.resumeById}
|
||||
pagerPageSize={composer.pagerPageSize}
|
||||
/>
|
||||
<Box flexDirection="column" marginTop={ui.statusBar === 'top' ? 0 : 1} position="relative">
|
||||
<FloatingOverlays
|
||||
cols={composer.cols}
|
||||
compIdx={composer.compIdx}
|
||||
completions={composer.completions}
|
||||
onModelSelect={actions.onModelSelect}
|
||||
onPickerSelect={actions.resumeById}
|
||||
pagerPageSize={composer.pagerPageSize}
|
||||
/>
|
||||
|
||||
{composer.inputBuf.map((line, i) => (
|
||||
<Box key={i}>
|
||||
<Box width={3}>
|
||||
<Text color={ui.theme.color.dim}>{i === 0 ? `${ui.theme.brand.prompt} ` : ' '}</Text>
|
||||
{!isBlocked && (
|
||||
<>
|
||||
{composer.inputBuf.map((line, i) => (
|
||||
<Box key={i}>
|
||||
<Box width={3}>
|
||||
<Text color={ui.theme.color.dim}>{i === 0 ? `${ui.theme.brand.prompt} ` : ' '}</Text>
|
||||
</Box>
|
||||
|
||||
<Text color={ui.theme.color.cornsilk}>{line || ' '}</Text>
|
||||
</Box>
|
||||
))}
|
||||
|
||||
<Box position="relative">
|
||||
<Box width={pw}>
|
||||
{sh ? (
|
||||
<Text color={ui.theme.color.shellDollar}>$ </Text>
|
||||
) : (
|
||||
<Text bold color={ui.theme.color.prompt}>
|
||||
{composer.inputBuf.length ? ' ' : `${ui.theme.brand.prompt} `}
|
||||
</Text>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
<Text color={ui.theme.color.cornsilk}>{line || ' '}</Text>
|
||||
</Box>
|
||||
))}
|
||||
<Box flexGrow={1} position="relative">
|
||||
{/* subtract NoSelect paddingX={1} (2 cols) + pw so wrap-ansi and cursorLayout agree */}
|
||||
<TextInput
|
||||
columns={Math.max(20, composer.cols - pw - 2)}
|
||||
onChange={composer.updateInput}
|
||||
onPaste={composer.handleTextPaste}
|
||||
onSubmit={composer.submit}
|
||||
placeholder={composer.empty ? PLACEHOLDER : ui.busy ? 'Ctrl+C to interrupt…' : ''}
|
||||
value={composer.input}
|
||||
/>
|
||||
|
||||
<Box position="relative">
|
||||
<Box width={pw}>
|
||||
{sh ? (
|
||||
<Text color={ui.theme.color.shellDollar}>$ </Text>
|
||||
) : (
|
||||
<Text bold color={ui.theme.color.prompt}>
|
||||
{composer.inputBuf.length ? ' ' : `${ui.theme.brand.prompt} `}
|
||||
</Text>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
<Box flexGrow={1} position="relative">
|
||||
{/* subtract NoSelect paddingX={1} (2 cols) + pw so wrap-ansi and cursorLayout agree */}
|
||||
<TextInput
|
||||
columns={Math.max(20, composer.cols - pw - 2)}
|
||||
onChange={composer.updateInput}
|
||||
onPaste={composer.handleTextPaste}
|
||||
onSubmit={composer.submit}
|
||||
placeholder={composer.empty ? PLACEHOLDER : ui.busy ? 'Ctrl+C to interrupt…' : ''}
|
||||
value={composer.input}
|
||||
/>
|
||||
|
||||
<Box position="absolute" right={0}>
|
||||
<GoodVibesHeart t={ui.theme} tick={status.goodVibesTick} />
|
||||
<Box position="absolute" right={0}>
|
||||
<GoodVibesHeart t={ui.theme} tick={status.goodVibesTick} />
|
||||
</Box>
|
||||
</Box>
|
||||
</Box>
|
||||
</Box>
|
||||
</Box>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
{!composer.empty && !ui.sid && <Text color={ui.theme.color.dim}>⚕ {ui.status}</Text>}
|
||||
|
||||
|
||||
@@ -81,11 +81,16 @@ export const MessageLine = memo(function MessageLine({
|
||||
return <Text {...(body ? { color: body } : {})}>{msg.text}</Text>
|
||||
})()
|
||||
|
||||
// Diff segments (emitted by pushInlineDiffSegment between narration
|
||||
// segments) need a blank line on both sides so the patch doesn't butt up
|
||||
// against the prose around it.
|
||||
const isDiffSegment = msg.kind === 'diff'
|
||||
|
||||
return (
|
||||
<Box
|
||||
flexDirection="column"
|
||||
marginBottom={msg.role === 'user' ? 1 : 0}
|
||||
marginTop={msg.role === 'user' || msg.kind === 'slash' ? 1 : 0}
|
||||
marginBottom={msg.role === 'user' || isDiffSegment ? 1 : 0}
|
||||
marginTop={msg.role === 'user' || msg.kind === 'slash' || isDiffSegment ? 1 : 0}
|
||||
>
|
||||
{showDetails && (
|
||||
<Box flexDirection="column" marginBottom={1}>
|
||||
|
||||
@@ -623,7 +623,19 @@ export function TextInput({
|
||||
return
|
||||
}
|
||||
|
||||
if ((k.ctrl && inp === 'c') || k.tab || (k.shift && k.tab) || k.pageUp || k.pageDown || k.escape) {
|
||||
// Ctrl+B is the documented voice-recording toggle (see platform.ts →
|
||||
// isVoiceToggleKey). Pass it through so the app-level handler in
|
||||
// useInputHandlers receives it instead of being swallowed here as
|
||||
// either backward-word nav (line below) or a literal 'b' insertion.
|
||||
if (
|
||||
(k.ctrl && inp === 'c') ||
|
||||
(k.ctrl && inp === 'b') ||
|
||||
k.tab ||
|
||||
(k.shift && k.tab) ||
|
||||
k.pageUp ||
|
||||
k.pageDown ||
|
||||
k.escape
|
||||
) {
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -236,10 +236,16 @@ export interface ImageAttachResponse {
|
||||
// ── Voice ────────────────────────────────────────────────────────────
|
||||
|
||||
export interface VoiceToggleResponse {
|
||||
audio_available?: boolean
|
||||
available?: boolean
|
||||
details?: string
|
||||
enabled?: boolean
|
||||
stt_available?: boolean
|
||||
tts?: boolean
|
||||
}
|
||||
|
||||
export interface VoiceRecordResponse {
|
||||
status?: string
|
||||
text?: string
|
||||
}
|
||||
|
||||
@@ -368,6 +374,8 @@ export type GatewayEvent =
|
||||
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
|
||||
| { payload?: undefined; session_id?: string; type: 'message.start' }
|
||||
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
|
||||
| { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
|
||||
| { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
|
||||
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
|
||||
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
|
||||
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }
|
||||
|
||||
@@ -33,3 +33,17 @@ export const isMacActionFallback = (
|
||||
/** Match action-modifier + a single character (case-insensitive). */
|
||||
export const isAction = (key: { ctrl: boolean; meta: boolean; super?: boolean }, ch: string, target: string): boolean =>
|
||||
isActionMod(key) && ch.toLowerCase() === target
|
||||
|
||||
/**
|
||||
* Voice recording toggle key (Ctrl+B).
|
||||
*
|
||||
* Documented as "Ctrl+B" everywhere: tips.py, config.yaml's voice.record_key
|
||||
* default, and the Python CLI prompt_toolkit handler. We accept raw Ctrl+B on
|
||||
* every platform so the TUI matches those docs. On macOS we additionally
|
||||
* accept Cmd+B (the platform action modifier) so existing macOS muscle memory
|
||||
* keeps working.
|
||||
*/
|
||||
export const isVoiceToggleKey = (
|
||||
key: { ctrl: boolean; meta: boolean; super?: boolean },
|
||||
ch: string
|
||||
): boolean => (key.ctrl || isActionMod(key)) && ch.toLowerCase() === 'b'
|
||||
|
||||
+1
-1
@@ -102,7 +102,7 @@ export interface ClarifyReq {
|
||||
|
||||
export interface Msg {
|
||||
info?: SessionInfo
|
||||
kind?: 'intro' | 'panel' | 'slash' | 'trail'
|
||||
kind?: 'diff' | 'intro' | 'panel' | 'slash' | 'trail'
|
||||
panelData?: PanelData
|
||||
role: Role
|
||||
text: string
|
||||
|
||||
@@ -0,0 +1,223 @@
|
||||
# Browser CDP Supervisor — Design
|
||||
|
||||
**Status:** Shipped (PR 14540)
|
||||
**Last updated:** 2026-04-23
|
||||
**Author:** @teknium1
|
||||
|
||||
## Problem
|
||||
|
||||
Native JS dialogs (`alert`/`confirm`/`prompt`/`beforeunload`) and iframes are
|
||||
the two biggest gaps in our browser tooling:
|
||||
|
||||
1. **Dialogs block the JS thread.** Any operation on the page stalls until the
|
||||
dialog is handled. Before this work, the agent had no way to know a dialog
|
||||
was open — subsequent tool calls would hang or throw opaque errors.
|
||||
2. **Iframes are invisible.** The agent could see iframe nodes in the DOM
|
||||
snapshot but could not click, type, or eval inside them — especially
|
||||
cross-origin (OOPIF) iframes that live in separate Chromium processes.
|
||||
|
||||
[PR #12550](https://github.com/NousResearch/hermes-agent/pull/12550) proposed a
|
||||
stateless `browser_dialog` wrapper. That doesn't solve detection — it's a
|
||||
cleaner CDP call for when the agent already knows (via symptoms) that a dialog
|
||||
is open. Closed as superseded.
|
||||
|
||||
## Backend capability matrix (verified live 2026-04-23)
|
||||
|
||||
Using throwaway probe scripts against a data-URL page that fires alerts in the
|
||||
main frame and in a same-origin srcdoc iframe, plus a cross-origin
|
||||
`https://example.com` iframe:
|
||||
|
||||
| Backend | Dialog detect | Dialog respond | Frame tree | OOPIF `Runtime.evaluate` via `browser_cdp(frame_id=...)` |
|
||||
|---|---|---|---|---|
|
||||
| Local Chrome (`--remote-debugging-port`) / `/browser connect` | ✓ | ✓ full workflow | ✓ | ✓ |
|
||||
| Browserbase | ✓ (via bridge) | ✓ full workflow (via bridge) | ✓ | ✓ (`document.title = "Example Domain"` verified on real cross-origin iframe) |
|
||||
| Camofox | ✗ no CDP (REST-only) | ✗ | partial via DOM snapshot | ✗ |
|
||||
|
||||
**How Browserbase respond works.** Browserbase's CDP proxy uses Playwright
|
||||
internally and auto-dismisses native dialogs within ~10ms, so
|
||||
`Page.handleJavaScriptDialog` can't keep up. To work around this, the
|
||||
supervisor injects a bridge script via
|
||||
`Page.addScriptToEvaluateOnNewDocument` that overrides
|
||||
`window.alert`/`confirm`/`prompt` with a synchronous XHR to a magic host
|
||||
(`hermes-dialog-bridge.invalid`). `Fetch.enable` intercepts those XHRs
|
||||
before they touch the network — the dialog becomes a `Fetch.requestPaused`
|
||||
event the supervisor captures, and `respond_to_dialog` fulfills via
|
||||
`Fetch.fulfillRequest` with a JSON body the injected script decodes.
|
||||
|
||||
Net result: from the page's perspective, `prompt()` still returns the
|
||||
agent-supplied string. From the agent's perspective, it's the same
|
||||
`browser_dialog(action=...)` API either way. Tested end-to-end against
|
||||
real Browserbase sessions — 4/4 (alert/prompt/confirm-accept/confirm-dismiss)
|
||||
pass including value round-tripping back into page JS.
|
||||
|
||||
Camofox stays unsupported for this PR; follow-up upstream issue planned at
|
||||
`jo-inc/camofox-browser` requesting a dialog polling endpoint.
|
||||
|
||||
## Architecture
|
||||
|
||||
### CDPSupervisor
|
||||
|
||||
One `asyncio.Task` running in a background daemon thread per Hermes `task_id`.
|
||||
Holds a persistent WebSocket to the backend's CDP endpoint. Maintains:
|
||||
|
||||
- **Dialog queue** — `List[PendingDialog]` with `{id, type, message, default_prompt, session_id, opened_at}`
|
||||
- **Frame tree** — `Dict[frame_id, FrameInfo]` with parent relationships, URL, origin, whether cross-origin child session
|
||||
- **Session map** — `Dict[session_id, SessionInfo]` so interaction tools can route to the right attached session for OOPIF operations
|
||||
- **Recent console errors** — ring buffer of the last 50 (for PR 2 diagnostics)
|
||||
|
||||
Subscribes on attach:
|
||||
- `Page.enable` — `javascriptDialogOpening`, `frameAttached`, `frameNavigated`, `frameDetached`
|
||||
- `Runtime.enable` — `executionContextCreated`, `consoleAPICalled`, `exceptionThrown`
|
||||
- `Target.setAutoAttach {autoAttach: true, flatten: true}` — surfaces child OOPIF targets; supervisor enables `Page`+`Runtime` on each
|
||||
|
||||
Thread-safe state access via a snapshot lock; tool handlers (sync) read the
|
||||
frozen snapshot without awaiting.
|
||||
|
||||
### Lifecycle
|
||||
|
||||
- **Start:** `SupervisorRegistry.get_or_start(task_id, cdp_url)` — called by
|
||||
`browser_navigate`, Browserbase session create, `/browser connect`. Idempotent.
|
||||
- **Stop:** session teardown or `/browser disconnect`. Cancels the asyncio
|
||||
task, closes the WebSocket, discards state.
|
||||
- **Rebind:** if the CDP URL changes (user reconnects to a new Chrome), stop
|
||||
the old supervisor and start fresh — never reuse state across endpoints.
|
||||
|
||||
### Dialog policy
|
||||
|
||||
Configurable via `config.yaml` under `browser.dialog_policy`:
|
||||
|
||||
- **`must_respond`** (default) — capture, surface in `browser_snapshot`, wait
|
||||
for explicit `browser_dialog(action=...)` call. After a 300s safety timeout
|
||||
with no response, auto-dismiss and log. Prevents a buggy agent from stalling
|
||||
forever.
|
||||
- `auto_dismiss` — record and dismiss immediately; agent sees it after the
|
||||
fact via `browser_state` inside `browser_snapshot`.
|
||||
- `auto_accept` — record and accept (useful for `beforeunload` where the user
|
||||
wants to navigate away cleanly).
|
||||
|
||||
Policy is per-task; no per-dialog overrides in v1.
|
||||
|
||||
## Agent surface (PR 1)
|
||||
|
||||
### One new tool
|
||||
|
||||
```
|
||||
browser_dialog(action, prompt_text=None, dialog_id=None)
|
||||
```
|
||||
|
||||
- `action="accept"` / `"dismiss"` → responds to the specified or sole pending dialog (required)
|
||||
- `prompt_text=...` → text to supply to a `prompt()` dialog
|
||||
- `dialog_id=...` → disambiguate when multiple dialogs queued (rare)
|
||||
|
||||
Tool is response-only. Agent reads pending dialogs from `browser_snapshot`
|
||||
output before calling.
|
||||
|
||||
### `browser_snapshot` extension
|
||||
|
||||
Adds three optional fields to the existing snapshot output when a supervisor
|
||||
is attached:
|
||||
|
||||
```json
|
||||
{
|
||||
"pending_dialogs": [
|
||||
{"id": "d-1", "type": "alert", "message": "Hello", "opened_at": 1650000000.0}
|
||||
],
|
||||
"recent_dialogs": [
|
||||
{"id": "d-1", "type": "alert", "message": "...", "opened_at": 1650000000.0,
|
||||
"closed_at": 1650000000.1, "closed_by": "remote"}
|
||||
],
|
||||
"frame_tree": {
|
||||
"top": {"frame_id": "FRAME_A", "url": "https://example.com/", "origin": "https://example.com"},
|
||||
"children": [
|
||||
{"frame_id": "FRAME_B", "url": "about:srcdoc", "is_oopif": false},
|
||||
{"frame_id": "FRAME_C", "url": "https://ads.example.net/", "is_oopif": true, "session_id": "SID_C"}
|
||||
],
|
||||
"truncated": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- **`pending_dialogs`**: dialogs currently blocking the page's JS thread.
|
||||
The agent must call `browser_dialog(action=...)` to respond. Empty on
|
||||
Browserbase because their CDP proxy auto-dismisses within ~10ms.
|
||||
|
||||
- **`recent_dialogs`**: ring buffer of up to 20 recently-closed dialogs with
|
||||
a `closed_by` tag — `"agent"` (we responded), `"auto_policy"` (local
|
||||
auto_dismiss/auto_accept), `"watchdog"` (must_respond timeout hit), or
|
||||
`"remote"` (browser/backend closed it on us, e.g. Browserbase). This is
|
||||
how agents on Browserbase still get visibility into what happened.
|
||||
|
||||
- **`frame_tree`**: frame structure including cross-origin (OOPIF) children.
|
||||
Capped at 30 entries + OOPIF depth 2 to bound snapshot size on ad-heavy
|
||||
pages. `truncated: true` surfaces when limits were hit; agents needing
|
||||
the full tree can use `browser_cdp` with `Page.getFrameTree`.
|
||||
|
||||
No new tool schema surface for any of these — the agent reads the snapshot
|
||||
it already requests.
|
||||
|
||||
### Availability gating
|
||||
|
||||
Both surfaces gate on `_browser_cdp_check` (supervisor can only run when a CDP
|
||||
endpoint is reachable). On Camofox / no-backend sessions, the dialog tool is
|
||||
hidden and snapshot omits the new fields — no schema bloat.
|
||||
|
||||
## Cross-origin iframe interaction
|
||||
|
||||
Extending the dialog-detect work, `browser_cdp(frame_id=...)` routes CDP
|
||||
calls (notably `Runtime.evaluate`) through the supervisor's already-connected
|
||||
WebSocket using the OOPIF's child `sessionId`. Agents pick frame_ids out of
|
||||
`browser_snapshot.frame_tree.children[]` where `is_oopif=true` and pass them
|
||||
to `browser_cdp`. For same-origin iframes (no dedicated CDP session), the
|
||||
agent uses `contentWindow`/`contentDocument` from a top-level
|
||||
`Runtime.evaluate` instead — supervisor surfaces an error pointing at that
|
||||
fallback when `frame_id` belongs to a non-OOPIF.
|
||||
|
||||
On Browserbase, this is the ONLY reliable path for iframe interaction —
|
||||
stateless CDP connections (opened per `browser_cdp` call) hit signed-URL
|
||||
expiry, while the supervisor's long-lived connection keeps a valid session.
|
||||
|
||||
## Camofox (follow-up)
|
||||
|
||||
Issue planned against `jo-inc/camofox-browser` adding:
|
||||
- Playwright `page.on('dialog', handler)` per session
|
||||
- `GET /tabs/:tabId/dialogs` polling endpoint
|
||||
- `POST /tabs/:tabId/dialogs/:id` to accept/dismiss
|
||||
- Frame-tree introspection endpoint
|
||||
|
||||
## Files touched (PR 1)
|
||||
|
||||
### New
|
||||
|
||||
- `tools/browser_supervisor.py` — `CDPSupervisor`, `SupervisorRegistry`, `PendingDialog`, `FrameInfo`
|
||||
- `tools/browser_dialog_tool.py` — `browser_dialog` tool handler
|
||||
- `tests/tools/test_browser_supervisor.py` — mock CDP WebSocket server + lifecycle/state tests
|
||||
- `website/docs/developer-guide/browser-supervisor.md` — this file
|
||||
|
||||
### Modified
|
||||
|
||||
- `toolsets.py` — register `browser_dialog` in `browser`, `hermes-acp`, `hermes-api-server`, core toolsets (gated on CDP reachability)
|
||||
- `tools/browser_tool.py`
|
||||
- `browser_navigate` start-hook: if CDP URL resolvable, `SupervisorRegistry.get_or_start(task_id, cdp_url)`
|
||||
- `browser_snapshot` (at ~line 1536): merge supervisor state into return payload
|
||||
- `/browser connect` handler: restart supervisor with new endpoint
|
||||
- Session teardown hooks in `_cleanup_browser_session`
|
||||
- `hermes_cli/config.py` — add `browser.dialog_policy` and `browser.dialog_timeout_s` to `DEFAULT_CONFIG`
|
||||
- Docs: `website/docs/user-guide/features/browser.md`, `website/docs/reference/tools-reference.md`, `website/docs/reference/toolsets-reference.md`
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Detection/interaction for Camofox (upstream gap; tracked separately)
|
||||
- Streaming dialog/frame events live to the user (would require gateway hooks)
|
||||
- Persisting dialog history across sessions (in-memory only)
|
||||
- Per-iframe dialog policies (agent can express this via `dialog_id`)
|
||||
- Replacing `browser_cdp` — it stays as the escape hatch for the long tail (cookies, viewport, network throttling)
|
||||
|
||||
## Testing
|
||||
|
||||
Unit tests use an asyncio mock CDP server that speaks enough of the protocol
|
||||
to exercise all state transitions: attach, enable, navigate, dialog fire,
|
||||
dialog dismiss, frame attach/detach, child target attach, session teardown.
|
||||
Real-backend E2E (Browserbase + local Chrome) is manual; probe scripts from
|
||||
the 2026-04-23 investigation kept in-repo under
|
||||
`scripts/browser_supervisor_e2e.py` so anyone can re-verify on new backend
|
||||
versions.
|
||||
@@ -6,7 +6,7 @@ description: "Official optional skills shipped with hermes-agent — install via
|
||||
|
||||
# Optional Skills Catalog
|
||||
|
||||
Official optional skills ship with the hermes-agent repository under `optional-skills/` but are **not active by default**. Install them explicitly:
|
||||
Optional skills ship with hermes-agent under `optional-skills/` but are **not active by default**. Install them explicitly:
|
||||
|
||||
```bash
|
||||
hermes skills install official/<category>/<skill>
|
||||
@@ -19,7 +19,7 @@ hermes skills install official/blockchain/solana
|
||||
hermes skills install official/mlops/flash-attention
|
||||
```
|
||||
|
||||
Once installed, the skill appears in the agent's skill list and can be loaded automatically when relevant tasks are detected.
|
||||
Each skill below links to a dedicated page with its full definition, setup, and usage.
|
||||
|
||||
To uninstall:
|
||||
|
||||
@@ -27,136 +27,139 @@ To uninstall:
|
||||
hermes skills uninstall <skill-name>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Autonomous AI Agents
|
||||
## autonomous-ai-agents
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **blackbox** | Delegate coding tasks to Blackbox AI CLI agent. Multi-model agent with built-in judge that runs tasks through multiple LLMs and picks the best result. |
|
||||
| **honcho** | Configure and use Honcho memory with Hermes — cross-session user modeling, multi-profile peer isolation, observation config, and dialectic reasoning. |
|
||||
| [**blackbox**](/docs/user-guide/skills/optional/autonomous-ai-agents/autonomous-ai-agents-blackbox) | Delegate coding tasks to Blackbox AI CLI agent. Multi-model agent with built-in judge that runs tasks through multiple LLMs and picks the best result. Requires the blackbox CLI and a Blackbox AI API key. |
|
||||
| [**honcho**](/docs/user-guide/skills/optional/autonomous-ai-agents/autonomous-ai-agents-honcho) | Configure and use Honcho memory with Hermes -- cross-session user modeling, multi-profile peer isolation, observation config, dialectic reasoning, session summaries, and context budget enforcement. Use when setting up Honcho, troubleshoo... |
|
||||
|
||||
## Blockchain
|
||||
## blockchain
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **base** | Query Base (Ethereum L2) blockchain data with USD pricing — wallet balances, token info, transaction details, gas analysis, contract inspection, whale detection, and live network stats. No API key required. |
|
||||
| **solana** | Query Solana blockchain data with USD pricing — wallet balances, token portfolios, transaction details, NFTs, whale detection, and live network stats. No API key required. |
|
||||
| [**base**](/docs/user-guide/skills/optional/blockchain/blockchain-base) | Query Base (Ethereum L2) blockchain data with USD pricing — wallet balances, token info, transaction details, gas analysis, contract inspection, whale detection, and live network stats. Uses Base RPC + CoinGecko. No API key required. |
|
||||
| [**solana**](/docs/user-guide/skills/optional/blockchain/blockchain-solana) | Query Solana blockchain data with USD pricing — wallet balances, token portfolios with values, transaction details, NFTs, whale detection, and live network stats. Uses Solana RPC + CoinGecko. No API key required. |
|
||||
|
||||
## Communication
|
||||
## communication
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **one-three-one-rule** | Structured communication framework for proposals and decision-making. |
|
||||
| [**one-three-one-rule**](/docs/user-guide/skills/optional/communication/communication-one-three-one-rule) | Structured decision-making framework for technical proposals and trade-off analysis. When the user faces a choice between multiple approaches (architecture decisions, tool selection, refactoring strategies, migration paths), this skill p... |
|
||||
|
||||
## Creative
|
||||
## creative
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **blender-mcp** | Control Blender directly from Hermes via socket connection to the blender-mcp addon. Create 3D objects, materials, animations, and run arbitrary Blender Python (bpy) code. |
|
||||
| **concept-diagrams** | Generate flat, minimal light/dark-aware SVG diagrams as standalone HTML files, using a unified educational visual language (9 semantic color ramps, automatic dark mode). Best for physics setups, chemistry mechanisms, math curves, physical objects (aircraft, turbines, smartphones), floor plans, cross-sections, lifecycle/process narratives, and hub-spoke system diagrams. Ships with 15 example diagrams. |
|
||||
| **meme-generation** | Generate real meme images by picking a template and overlaying text with Pillow. Produces actual `.png` meme files. |
|
||||
| **touchdesigner-mcp** | Control a running TouchDesigner instance via the twozero MCP plugin — create operators, set parameters, wire connections, execute Python, build real-time audio-reactive visuals and GLSL networks. 36 native tools. |
|
||||
| [**blender-mcp**](/docs/user-guide/skills/optional/creative/creative-blender-mcp) | Control Blender directly from Hermes via socket connection to the blender-mcp addon. Create 3D objects, materials, animations, and run arbitrary Blender Python (bpy) code. Use when user wants to create or modify anything in Blender. |
|
||||
| [**concept-diagrams**](/docs/user-guide/skills/optional/creative/creative-concept-diagrams) | Generate flat, minimal light/dark-aware SVG diagrams as standalone HTML files, using a unified educational visual language with 9 semantic color ramps, sentence-case typography, and automatic dark mode. Best suited for educational and no... |
|
||||
| [**meme-generation**](/docs/user-guide/skills/optional/creative/creative-meme-generation) | Generate real meme images by picking a template and overlaying text with Pillow. Produces actual .png meme files. |
|
||||
| [**touchdesigner-mcp**](/docs/user-guide/skills/optional/creative/creative-touchdesigner-mcp) | Control a running TouchDesigner instance via twozero MCP — create operators, set parameters, wire connections, execute Python, build real-time visuals. 36 native tools. |
|
||||
|
||||
## Dogfood
|
||||
## devops
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **adversarial-ux-test** | Roleplay the most difficult, tech-resistant user for a product — browse in-persona, rant, then filter through a RED/YELLOW/WHITE/GREEN pragmatism layer so only real UX friction becomes tickets. |
|
||||
| [**inference-sh-cli**](/docs/user-guide/skills/optional/devops/devops-cli) | Run 150+ AI apps via inference.sh CLI (infsh) — image generation, video creation, LLMs, search, 3D, social automation. Uses the terminal tool. Triggers: inference.sh, infsh, ai apps, flux, veo, image generation, video generation, seedrea... |
|
||||
| [**docker-management**](/docs/user-guide/skills/optional/devops/devops-docker-management) | Manage Docker containers, images, volumes, networks, and Compose stacks — lifecycle ops, debugging, cleanup, and Dockerfile optimization. |
|
||||
|
||||
## DevOps
|
||||
## dogfood
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **cli** | Run 150+ AI apps via inference.sh CLI (infsh) — image generation, video creation, LLMs, search, 3D, and social automation. |
|
||||
| **docker-management** | Manage Docker containers, images, volumes, networks, and Compose stacks — lifecycle ops, debugging, cleanup, and Dockerfile optimization. |
|
||||
| [**adversarial-ux-test**](/docs/user-guide/skills/optional/dogfood/dogfood-adversarial-ux-test) | Roleplay the most difficult, tech-resistant user for your product. Browse the app as that persona, find every UX pain point, then filter complaints through a pragmatism layer to separate real problems from noise. Creates actionable ticke... |
|
||||
|
||||
## Email
|
||||
## email
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **agentmail** | Give the agent its own dedicated email inbox via AgentMail. Send, receive, and manage email autonomously using agent-owned email addresses. |
|
||||
| [**agentmail**](/docs/user-guide/skills/optional/email/email-agentmail) | Give the agent its own dedicated email inbox via AgentMail. Send, receive, and manage email autonomously using agent-owned email addresses (e.g. hermes-agent@agentmail.to). |
|
||||
|
||||
## Health
|
||||
## health
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **fitness-nutrition** | Gym workout planner and nutrition tracker. Search 690+ exercises by muscle, equipment, or category via wger. Look up macros and calories for 380,000+ foods via USDA FoodData Central. Computes BMI, TDEE, one-rep max, macro splits, and body fat — pure Python, no pip installs. |
|
||||
| **neuroskill-bci** | Brain-Computer Interface (BCI) integration for neuroscience research workflows. |
|
||||
| [**fitness-nutrition**](/docs/user-guide/skills/optional/health/health-fitness-nutrition) | Gym workout planner and nutrition tracker. Search 690+ exercises by muscle, equipment, or category via wger. Look up macros and calories for 380,000+ foods via USDA FoodData Central. Compute BMI, TDEE, one-rep max, macro splits, and body... |
|
||||
| [**neuroskill-bci**](/docs/user-guide/skills/optional/health/health-neuroskill-bci) | Connect to a running NeuroSkill instance and incorporate the user's real-time cognitive and emotional state (focus, relaxation, mood, cognitive load, drowsiness, heart rate, HRV, sleep staging, and 40+ derived EXG scores) into responses.... |
|
||||
|
||||
## MCP
|
||||
## mcp
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **fastmcp** | Build, test, inspect, install, and deploy MCP servers with FastMCP in Python. Covers wrapping APIs or databases as MCP tools, exposing resources or prompts, and deployment. |
|
||||
| **mcporter** | The `mcporter` CLI — list, configure, auth, and call MCP servers/tools directly (HTTP or stdio) from the terminal. Useful for ad-hoc MCP interactions; for always-on tool discovery use the built-in `native-mcp` client instead. |
|
||||
| [**fastmcp**](/docs/user-guide/skills/optional/mcp/mcp-fastmcp) | Build, test, inspect, install, and deploy MCP servers with FastMCP in Python. Use when creating a new MCP server, wrapping an API or database as MCP tools, exposing resources or prompts, or preparing a FastMCP server for Claude Code, Cur... |
|
||||
| [**mcporter**](/docs/user-guide/skills/optional/mcp/mcp-mcporter) | Use the mcporter CLI to list, configure, auth, and call MCP servers/tools directly (HTTP or stdio), including ad-hoc servers, config edits, and CLI/type generation. |
|
||||
|
||||
## Migration
|
||||
## migration
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **openclaw-migration** | Migrate a user's OpenClaw customization footprint into Hermes Agent. Imports memories, SOUL.md, command allowlists, user skills, and selected workspace assets. |
|
||||
| [**openclaw-migration**](/docs/user-guide/skills/optional/migration/migration-openclaw-migration) | Migrate a user's OpenClaw customization footprint into Hermes Agent. Imports Hermes-compatible memories, SOUL.md, command allowlists, user skills, and selected workspace assets from ~/.openclaw, then reports exactly what could not be mig... |
|
||||
|
||||
## MLOps
|
||||
|
||||
The largest optional category — covers the full ML pipeline from data curation to production inference.
|
||||
## mlops
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **accelerate** | Simplest distributed training API. 4 lines to add distributed support to any PyTorch script. Unified API for DeepSpeed/FSDP/Megatron/DDP. |
|
||||
| **chroma** | Open-source embedding database. Store embeddings and metadata, perform vector and full-text search. Simple 4-function API for RAG and semantic search. |
|
||||
| **clip** | OpenAI's vision-language model connecting images and text. Zero-shot image classification, image-text matching, and cross-modal retrieval. Trained on 400M image-text pairs. Use for image search, content moderation, or vision-language tasks without fine-tuning. |
|
||||
| **faiss** | Facebook's library for efficient similarity search and clustering of dense vectors. Supports billions of vectors, GPU acceleration, and various index types (Flat, IVF, HNSW). |
|
||||
| **flash-attention** | Optimize transformer attention with Flash Attention for 2-4x speedup and 10-20x memory reduction. Supports PyTorch SDPA, flash-attn library, H100 FP8, and sliding window. |
|
||||
| **guidance** | Control LLM output with regex and grammars, guarantee valid JSON/XML/code generation, enforce structured formats, and build multi-step workflows with Guidance — Microsoft Research's constrained generation framework. |
|
||||
| **hermes-atropos-environments** | Build, test, and debug Hermes Agent RL environments for Atropos training. Covers the HermesAgentBaseEnv interface, reward functions, agent loop integration, and evaluation. |
|
||||
| **huggingface-tokenizers** | Fast Rust-based tokenizers for research and production. Tokenizes 1GB in under 20 seconds. Supports BPE, WordPiece, and Unigram algorithms. |
|
||||
| **instructor** | Extract structured data from LLM responses with Pydantic validation, retry failed extractions automatically, and stream partial results. |
|
||||
| **lambda-labs** | Reserved and on-demand GPU cloud instances for ML training and inference. SSH access, persistent filesystems, and multi-node clusters. |
|
||||
| **llava** | Large Language and Vision Assistant — visual instruction tuning and image-based conversations combining CLIP vision with LLaMA language models. |
|
||||
| **modal** | Serverless GPU cloud platform for running ML workloads. On-demand GPU access without infrastructure management, ML model deployment as APIs, or batch jobs with automatic scaling. |
|
||||
| **nemo-curator** | GPU-accelerated data curation for LLM training. Fuzzy deduplication (16x faster), quality filtering (30+ heuristics), semantic dedup, PII redaction. Scales with RAPIDS. |
|
||||
| **peft-fine-tuning** | Parameter-efficient fine-tuning for LLMs using LoRA, QLoRA, and 25+ methods. Train `<1%` of parameters with minimal accuracy loss for 7B–70B models on limited GPU memory. HuggingFace's official PEFT library. |
|
||||
| **pinecone** | Managed vector database for production AI. Auto-scaling, hybrid search (dense + sparse), metadata filtering, and low latency (under 100ms p95). |
|
||||
| **pytorch-fsdp** | Expert guidance for Fully Sharded Data Parallel training with PyTorch FSDP — parameter sharding, mixed precision, CPU offloading, FSDP2. |
|
||||
| **pytorch-lightning** | High-level PyTorch framework with Trainer class, automatic distributed training (DDP/FSDP/DeepSpeed), callbacks, and minimal boilerplate. |
|
||||
| **qdrant** | High-performance vector similarity search engine. Rust-powered with fast nearest neighbor search, hybrid search with filtering, and scalable vector storage. |
|
||||
| **saelens** | Train and analyze Sparse Autoencoders (SAEs) using SAELens to decompose neural network activations into interpretable features. |
|
||||
| **simpo** | Simple Preference Optimization — reference-free alternative to DPO with better performance (+6.4 pts on AlpacaEval 2.0). No reference model needed. |
|
||||
| **slime** | LLM post-training with RL using Megatron+SGLang framework. Custom data generation workflows and tight Megatron-LM integration for RL scaling. |
|
||||
| **stable-diffusion-image-generation** | State-of-the-art text-to-image generation with Stable Diffusion via HuggingFace Diffusers. Text-to-image, image-to-image translation, inpainting, and custom diffusion pipelines. |
|
||||
| **tensorrt-llm** | Optimize LLM inference with NVIDIA TensorRT for maximum throughput. 10-100x faster than PyTorch on A100/H100 with quantization (FP8/INT4) and in-flight batching. |
|
||||
| **torchtitan** | PyTorch-native distributed LLM pretraining with 4D parallelism (FSDP2, TP, PP, CP). Scale from 8 to 512+ GPUs with Float8 and torch.compile. |
|
||||
| **whisper** | OpenAI's general-purpose speech recognition. 99 languages, transcription, translation to English, and language ID. Six model sizes from tiny (39M) to large (1550M). Best for robust multilingual ASR. |
|
||||
| [**huggingface-accelerate**](/docs/user-guide/skills/optional/mlops/mlops-accelerate) | Simplest distributed training API. 4 lines to add distributed support to any PyTorch script. Unified API for DeepSpeed/FSDP/Megatron/DDP. Automatic device placement, mixed precision (FP16/BF16/FP8). Interactive config, single launch comm... |
|
||||
| [**chroma**](/docs/user-guide/skills/optional/mlops/mlops-chroma) | Open-source embedding database for AI applications. Store embeddings and metadata, perform vector and full-text search, filter by metadata. Simple 4-function API. Scales from notebooks to production clusters. Use for semantic search, RAG... |
|
||||
| [**clip**](/docs/user-guide/skills/optional/mlops/mlops-clip) | OpenAI's model connecting vision and language. Enables zero-shot image classification, image-text matching, and cross-modal retrieval. Trained on 400M image-text pairs. Use for image search, content moderation, or vision-language tasks w... |
|
||||
| [**faiss**](/docs/user-guide/skills/optional/mlops/mlops-faiss) | Facebook's library for efficient similarity search and clustering of dense vectors. Supports billions of vectors, GPU acceleration, and various index types (Flat, IVF, HNSW). Use for fast k-NN search, large-scale vector retrieval, or whe... |
|
||||
| [**optimizing-attention-flash**](/docs/user-guide/skills/optional/mlops/mlops-flash-attention) | Optimizes transformer attention with Flash Attention for 2-4x speedup and 10-20x memory reduction. Use when training/running transformers with long sequences (>512 tokens), encountering GPU memory issues with attention, or need faster in... |
|
||||
| [**guidance**](/docs/user-guide/skills/optional/mlops/mlops-guidance) | Control LLM output with regex and grammars, guarantee valid JSON/XML/code generation, enforce structured formats, and build multi-step workflows with Guidance - Microsoft Research's constrained generation framework |
|
||||
| [**hermes-atropos-environments**](/docs/user-guide/skills/optional/mlops/mlops-hermes-atropos-environments) | Build, test, and debug Hermes Agent RL environments for Atropos training. Covers the HermesAgentBaseEnv interface, reward functions, agent loop integration, evaluation with tools, wandb logging, and the three CLI modes (serve/process/eva... |
|
||||
| [**huggingface-tokenizers**](/docs/user-guide/skills/optional/mlops/mlops-huggingface-tokenizers) | Fast tokenizers optimized for research and production. Rust-based implementation tokenizes 1GB in <20 seconds. Supports BPE, WordPiece, and Unigram algorithms. Train custom vocabularies, track alignments, handle padding/truncation. Integ... |
|
||||
| [**instructor**](/docs/user-guide/skills/optional/mlops/mlops-instructor) | Extract structured data from LLM responses with Pydantic validation, retry failed extractions automatically, parse complex JSON with type safety, and stream partial results with Instructor - battle-tested structured output library |
|
||||
| [**lambda-labs-gpu-cloud**](/docs/user-guide/skills/optional/mlops/mlops-lambda-labs) | Reserved and on-demand GPU cloud instances for ML training and inference. Use when you need dedicated GPU instances with simple SSH access, persistent filesystems, or high-performance multi-node clusters for large-scale training. |
|
||||
| [**llava**](/docs/user-guide/skills/optional/mlops/mlops-llava) | Large Language and Vision Assistant. Enables visual instruction tuning and image-based conversations. Combines CLIP vision encoder with Vicuna/LLaMA language models. Supports multi-turn image chat, visual question answering, and instruct... |
|
||||
| [**modal-serverless-gpu**](/docs/user-guide/skills/optional/mlops/mlops-modal) | Serverless GPU cloud platform for running ML workloads. Use when you need on-demand GPU access without infrastructure management, deploying ML models as APIs, or running batch jobs with automatic scaling. |
|
||||
| [**nemo-curator**](/docs/user-guide/skills/optional/mlops/mlops-nemo-curator) | GPU-accelerated data curation for LLM training. Supports text/image/video/audio. Features fuzzy deduplication (16× faster), quality filtering (30+ heuristics), semantic deduplication, PII redaction, NSFW detection. Scales across GPUs wit... |
|
||||
| [**peft-fine-tuning**](/docs/user-guide/skills/optional/mlops/mlops-peft) | Parameter-efficient fine-tuning for LLMs using LoRA, QLoRA, and 25+ methods. Use when fine-tuning large models (7B-70B) with limited GPU memory, when you need to train <1% of parameters with minimal accuracy loss, or for multi-adapter se... |
|
||||
| [**pinecone**](/docs/user-guide/skills/optional/mlops/mlops-pinecone) | Managed vector database for production AI applications. Fully managed, auto-scaling, with hybrid search (dense + sparse), metadata filtering, and namespaces. Low latency (<100ms p95). Use for production RAG, recommendation systems, or se... |
|
||||
| [**pytorch-fsdp**](/docs/user-guide/skills/optional/mlops/mlops-pytorch-fsdp) | Expert guidance for Fully Sharded Data Parallel training with PyTorch FSDP - parameter sharding, mixed precision, CPU offloading, FSDP2 |
|
||||
| [**pytorch-lightning**](/docs/user-guide/skills/optional/mlops/mlops-pytorch-lightning) | High-level PyTorch framework with Trainer class, automatic distributed training (DDP/FSDP/DeepSpeed), callbacks system, and minimal boilerplate. Scales from laptop to supercomputer with same code. Use when you want clean training loops w... |
|
||||
| [**qdrant-vector-search**](/docs/user-guide/skills/optional/mlops/mlops-qdrant) | High-performance vector similarity search engine for RAG and semantic search. Use when building production RAG systems requiring fast nearest neighbor search, hybrid search with filtering, or scalable vector storage with Rust-powered per... |
|
||||
| [**sparse-autoencoder-training**](/docs/user-guide/skills/optional/mlops/mlops-saelens) | Provides guidance for training and analyzing Sparse Autoencoders (SAEs) using SAELens to decompose neural network activations into interpretable features. Use when discovering interpretable features, analyzing superposition, or studying... |
|
||||
| [**simpo-training**](/docs/user-guide/skills/optional/mlops/mlops-simpo) | Simple Preference Optimization for LLM alignment. Reference-free alternative to DPO with better performance (+6.4 points on AlpacaEval 2.0). No reference model needed, more efficient than DPO. Use for preference alignment when want simpl... |
|
||||
| [**slime-rl-training**](/docs/user-guide/skills/optional/mlops/mlops-slime) | Provides guidance for LLM post-training with RL using slime, a Megatron+SGLang framework. Use when training GLM models, implementing custom data generation workflows, or needing tight Megatron-LM integration for RL scaling. |
|
||||
| [**stable-diffusion-image-generation**](/docs/user-guide/skills/optional/mlops/mlops-stable-diffusion) | State-of-the-art text-to-image generation with Stable Diffusion models via HuggingFace Diffusers. Use when generating images from text prompts, performing image-to-image translation, inpainting, or building custom diffusion pipelines. |
|
||||
| [**tensorrt-llm**](/docs/user-guide/skills/optional/mlops/mlops-tensorrt-llm) | Optimizes LLM inference with NVIDIA TensorRT for maximum throughput and lowest latency. Use for production deployment on NVIDIA GPUs (A100/H100), when you need 10-100x faster inference than PyTorch, or for serving models with quantizatio... |
|
||||
| [**distributed-llm-pretraining-torchtitan**](/docs/user-guide/skills/optional/mlops/mlops-torchtitan) | Provides PyTorch-native distributed LLM pretraining using torchtitan with 4D parallelism (FSDP2, TP, PP, CP). Use when pretraining Llama 3.1, DeepSeek V3, or custom models at scale from 8 to 512+ GPUs with Float8, torch.compile, and dist... |
|
||||
| [**whisper**](/docs/user-guide/skills/optional/mlops/mlops-whisper) | OpenAI's general-purpose speech recognition model. Supports 99 languages, transcription, translation to English, and language identification. Six model sizes from tiny (39M params) to large (1550M params). Use for speech-to-text, podcast... |
|
||||
|
||||
## Productivity
|
||||
## productivity
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **canvas** | Canvas LMS integration — fetch enrolled courses and assignments using API token authentication. |
|
||||
| **memento-flashcards** | Spaced repetition flashcard system for learning and knowledge retention. |
|
||||
| **siyuan** | SiYuan Note API for searching, reading, creating, and managing blocks and documents in a self-hosted knowledge base. |
|
||||
| **telephony** | Give Hermes phone capabilities — provision a Twilio number, send/receive SMS/MMS, make calls, and place AI-driven outbound calls through Bland.ai or Vapi. |
|
||||
| [**canvas**](/docs/user-guide/skills/optional/productivity/productivity-canvas) | Canvas LMS integration — fetch enrolled courses and assignments using API token authentication. |
|
||||
| [**memento-flashcards**](/docs/user-guide/skills/optional/productivity/productivity-memento-flashcards) | Spaced-repetition flashcard system. Create cards from facts or text, chat with flashcards using free-text answers graded by the agent, generate quizzes from YouTube transcripts, review due cards with adaptive scheduling, and export/impor... |
|
||||
| [**siyuan**](/docs/user-guide/skills/optional/productivity/productivity-siyuan) | SiYuan Note API for searching, reading, creating, and managing blocks and documents in a self-hosted knowledge base via curl. |
|
||||
| [**telephony**](/docs/user-guide/skills/optional/productivity/productivity-telephony) | Give Hermes phone capabilities without core tool changes. Provision and persist a Twilio number, send and receive SMS/MMS, make direct calls, and place AI-driven outbound calls through Bland.ai or Vapi. |
|
||||
|
||||
## Research
|
||||
## research
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **bioinformatics** | Gateway to 400+ bioinformatics skills from bioSkills and ClawBio. Covers genomics, transcriptomics, single-cell, variant calling, pharmacogenomics, metagenomics, and structural biology. |
|
||||
| **domain-intel** | Passive domain reconnaissance using Python stdlib. Subdomain discovery, SSL certificate inspection, WHOIS lookups, DNS records, and bulk multi-domain analysis. No API keys required. |
|
||||
| **duckduckgo-search** | Free web search via DuckDuckGo — text, news, images, videos. No API key needed. |
|
||||
| **gitnexus-explorer** | Index a codebase with GitNexus and serve an interactive knowledge graph via web UI and Cloudflare tunnel. |
|
||||
| **parallel-cli** | Vendor skill for Parallel CLI — agent-native web search, extraction, deep research, enrichment, and monitoring. |
|
||||
| **qmd** | Search personal knowledge bases, notes, docs, and meeting transcripts locally using qmd — a hybrid retrieval engine with BM25, vector search, and LLM reranking. |
|
||||
| **scrapling** | Web scraping with Scrapling — HTTP fetching, stealth browser automation, Cloudflare bypass, and spider crawling via CLI and Python. |
|
||||
| [**bioinformatics**](/docs/user-guide/skills/optional/research/research-bioinformatics) | Gateway to 400+ bioinformatics skills from bioSkills and ClawBio. Covers genomics, transcriptomics, single-cell, variant calling, pharmacogenomics, metagenomics, structural biology, and more. Fetches domain-specific reference material on... |
|
||||
| [**domain-intel**](/docs/user-guide/skills/optional/research/research-domain-intel) | Passive domain reconnaissance using Python stdlib. Subdomain discovery, SSL certificate inspection, WHOIS lookups, DNS records, domain availability checks, and bulk multi-domain analysis. No API keys required. |
|
||||
| [**drug-discovery**](/docs/user-guide/skills/optional/research/research-drug-discovery) | Pharmaceutical research assistant for drug discovery workflows. Search bioactive compounds on ChEMBL, calculate drug-likeness (Lipinski Ro5, QED, TPSA, synthetic accessibility), look up drug-drug interactions via OpenFDA, interpret ADMET... |
|
||||
| [**duckduckgo-search**](/docs/user-guide/skills/optional/research/research-duckduckgo-search) | Free web search via DuckDuckGo — text, news, images, videos. No API key needed. Prefer the `ddgs` CLI when installed; use the Python DDGS library only after verifying that `ddgs` is available in the current runtime. |
|
||||
| [**gitnexus-explorer**](/docs/user-guide/skills/optional/research/research-gitnexus-explorer) | Index a codebase with GitNexus and serve an interactive knowledge graph via web UI + Cloudflare tunnel. |
|
||||
| [**parallel-cli**](/docs/user-guide/skills/optional/research/research-parallel-cli) | Optional vendor skill for Parallel CLI — agent-native web search, extraction, deep research, enrichment, FindAll, and monitoring. Prefer JSON output and non-interactive flows. |
|
||||
| [**qmd**](/docs/user-guide/skills/optional/research/research-qmd) | Search personal knowledge bases, notes, docs, and meeting transcripts locally using qmd — a hybrid retrieval engine with BM25, vector search, and LLM reranking. Supports CLI and MCP integration. |
|
||||
| [**scrapling**](/docs/user-guide/skills/optional/research/research-scrapling) | Web scraping with Scrapling - HTTP fetching, stealth browser automation, Cloudflare bypass, and spider crawling via CLI and Python. |
|
||||
|
||||
## Security
|
||||
## security
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| **1password** | Set up and use 1Password CLI (op). Install the CLI, enable desktop app integration, sign in, and read/inject secrets for commands. |
|
||||
| **oss-forensics** | Open-source software forensics — analyze packages, dependencies, and supply chain risks. |
|
||||
| **sherlock** | OSINT username search across 400+ social networks. Hunt down social media accounts by username. |
|
||||
| [**1password**](/docs/user-guide/skills/optional/security/security-1password) | Set up and use 1Password CLI (op). Use when installing the CLI, enabling desktop app integration, signing in, and reading/injecting secrets for commands. |
|
||||
| [**oss-forensics**](/docs/user-guide/skills/optional/security/security-oss-forensics) | Supply chain investigation, evidence recovery, and forensic analysis for GitHub repositories. Covers deleted commit recovery, force-push detection, IOC extraction, multi-source evidence collection, hypothesis formation/validation, and st... |
|
||||
| [**sherlock**](/docs/user-guide/skills/optional/security/security-sherlock) | OSINT username search across 400+ social networks. Hunt down social media accounts by username. |
|
||||
|
||||
## web-development
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| [**page-agent**](/docs/user-guide/skills/optional/web-development/web-development-page-agent) | Embed alibaba/page-agent into your own web application — a pure-JavaScript in-page GUI agent that ships as a single <script> tag or npm package and lets end-users of your site drive the UI with natural language ("click login, fill userna... |
|
||||
|
||||
---
|
||||
|
||||
@@ -167,4 +170,4 @@ To add a new optional skill to the repository:
|
||||
1. Create a directory under `optional-skills/<category>/<skill-name>/`
|
||||
2. Add a `SKILL.md` with standard frontmatter (name, description, version, author)
|
||||
3. Include any supporting files in `references/`, `templates/`, or `scripts/` subdirectories
|
||||
4. Submit a pull request — the skill will appear in this catalog once merged
|
||||
4. Submit a pull request — the skill will appear in this catalog and get its own docs page once merged
|
||||
|
||||
@@ -6,325 +6,174 @@ description: "Catalog of bundled skills that ship with Hermes Agent"
|
||||
|
||||
# Bundled Skills Catalog
|
||||
|
||||
Hermes ships with a large built-in skill library copied into `~/.hermes/skills/` on install. This page catalogs the bundled skills that live in the repository under `skills/`.
|
||||
Hermes ships with a large built-in skill library copied into `~/.hermes/skills/` on install. Each skill below links to a dedicated page with its full definition, setup, and usage.
|
||||
|
||||
If a skill is missing from this list but present in the repo, the catalog is regenerated by `website/scripts/generate-skill-docs.py`.
|
||||
|
||||
## apple
|
||||
|
||||
Apple/macOS-specific skills — iMessage, Reminders, Notes, FindMy, and macOS automation. These skills only load on macOS systems.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `apple-notes` | Manage Apple Notes via the memo CLI on macOS (create, view, search, edit). | `apple/apple-notes` |
|
||||
| `apple-reminders` | Manage Apple Reminders via remindctl CLI (list, add, complete, delete). | `apple/apple-reminders` |
|
||||
| `findmy` | Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture. | `apple/findmy` |
|
||||
| `imessage` | Send and receive iMessages/SMS via the imsg CLI on macOS. | `apple/imessage` |
|
||||
| [`apple-notes`](/docs/user-guide/skills/bundled/apple/apple-apple-notes) | Manage Apple Notes via the memo CLI on macOS (create, view, search, edit). | `apple/apple-notes` |
|
||||
| [`apple-reminders`](/docs/user-guide/skills/bundled/apple/apple-apple-reminders) | Manage Apple Reminders via remindctl CLI (list, add, complete, delete). | `apple/apple-reminders` |
|
||||
| [`findmy`](/docs/user-guide/skills/bundled/apple/apple-findmy) | Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture. | `apple/findmy` |
|
||||
| [`imessage`](/docs/user-guide/skills/bundled/apple/apple-imessage) | Send and receive iMessages/SMS via the imsg CLI on macOS. | `apple/imessage` |
|
||||
|
||||
## autonomous-ai-agents
|
||||
|
||||
Skills for spawning and orchestrating autonomous AI coding agents and multi-agent workflows — running independent agent processes, delegating tasks, and coordinating parallel workstreams.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `claude-code` | Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed. | `autonomous-ai-agents/claude-code` |
|
||||
| `codex` | Delegate coding tasks to OpenAI Codex CLI agent. Use for building features, refactoring, PR reviews, and batch issue fixing. Requires the codex CLI and a git repository. | `autonomous-ai-agents/codex` |
|
||||
| `hermes-agent` | Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, s… | `autonomous-ai-agents/hermes-agent` |
|
||||
| `opencode` | Delegate coding tasks to OpenCode CLI agent for feature implementation, refactoring, PR review, and long-running autonomous sessions. Requires the opencode CLI installed and authenticated. | `autonomous-ai-agents/opencode` |
|
||||
| [`claude-code`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-claude-code) | Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed. | `autonomous-ai-agents/claude-code` |
|
||||
| [`codex`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-codex) | Delegate coding tasks to OpenAI Codex CLI agent. Use for building features, refactoring, PR reviews, and batch issue fixing. Requires the codex CLI and a git repository. | `autonomous-ai-agents/codex` |
|
||||
| [`hermes-agent`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-hermes-agent) | Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users... | `autonomous-ai-agents/hermes-agent` |
|
||||
| [`opencode`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-opencode) | Delegate coding tasks to OpenCode CLI agent for feature implementation, refactoring, PR review, and long-running autonomous sessions. Requires the opencode CLI installed and authenticated. | `autonomous-ai-agents/opencode` |
|
||||
|
||||
## creative
|
||||
|
||||
Creative content generation — ASCII art, hand-drawn diagrams, animations, music, and visual design tools.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `architecture-diagram` | Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security, orange=message bus), JetBrains Mono fon… | `creative/architecture-diagram` |
|
||||
| `ascii-art` | Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required. | `creative/ascii-art` |
|
||||
| `ascii-video` | Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid… | `creative/ascii-video` |
|
||||
| `excalidraw` | Create hand-drawn style diagrams using Excalidraw JSON format. Generate .excalidraw files for architecture diagrams, flowcharts, sequence diagrams, concept maps, and more. Files can be opened at excalidraw.com or uploaded for shareable links. | `creative/excalidraw` |
|
||||
| `ideation` | Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools,… | `creative/creative-ideation` |
|
||||
| `manim-video` | Production pipeline for mathematical and technical animations using Manim Community Edition. Creates 3Blue1Brown-style explainer videos, algorithm visualizations, equation derivations, architecture diagrams, and data stories. Use when users request: animated explanations, math… | `creative/manim-video` |
|
||||
| `p5js` | Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D… | `creative/p5js` |
|
||||
| `popular-web-designs` | 54 production-quality design systems extracted from real websites. Load a template to generate HTML/CSS that matches the visual identity of sites like Stripe, Linear, Vercel, Notion, Airbnb, and more. Each template includes colors, typography, components, layout rules, and rea… | `creative/popular-web-designs` |
|
||||
| `songwriting-and-ai-music` | Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation techniques, phonetic tricks, and lessons learned. These are tools and ideas, not rules. Break any of them when the art calls for it. | `creative/songwriting-and-ai-music` |
|
||||
| [`architecture-diagram`](/docs/user-guide/skills/bundled/creative/creative-architecture-diagram) | Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security,... | `creative/architecture-diagram` |
|
||||
| [`ascii-art`](/docs/user-guide/skills/bundled/creative/creative-ascii-art) | Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required. | `creative/ascii-art` |
|
||||
| [`ascii-video`](/docs/user-guide/skills/bundled/creative/creative-ascii-video) | Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers,... | `creative/ascii-video` |
|
||||
| [`baoyu-comic`](/docs/user-guide/skills/bundled/creative/creative-baoyu-comic) | Knowledge comic creator supporting multiple art styles and tones. Creates original educational comics with detailed panel layouts and sequential image generation. Use when user asks to create "知识漫画", "教育漫画", "biography comic", "tutorial... | `creative/baoyu-comic` |
|
||||
| [`baoyu-infographic`](/docs/user-guide/skills/bundled/creative/creative-baoyu-infographic) | Generate professional infographics with 21 layout types and 21 visual styles. Analyzes content, recommends layout×style combinations, and generates publication-ready infographics. Use when user asks to create "infographic", "visual summa... | `creative/baoyu-infographic` |
|
||||
| [`ideation`](/docs/user-guide/skills/bundled/creative/creative-creative-ideation) | Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works... | `creative/creative-ideation` |
|
||||
| [`design-md`](/docs/user-guide/skills/bundled/creative/creative-design-md) | Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system,... | `creative/design-md` |
|
||||
| [`excalidraw`](/docs/user-guide/skills/bundled/creative/creative-excalidraw) | Create hand-drawn style diagrams using Excalidraw JSON format. Generate .excalidraw files for architecture diagrams, flowcharts, sequence diagrams, concept maps, and more. Files can be opened at excalidraw.com or uploaded for shareable l... | `creative/excalidraw` |
|
||||
| [`manim-video`](/docs/user-guide/skills/bundled/creative/creative-manim-video) | Production pipeline for mathematical and technical animations using Manim Community Edition. Creates 3Blue1Brown-style explainer videos, algorithm visualizations, equation derivations, architecture diagrams, and data stories. Use when us... | `creative/manim-video` |
|
||||
| [`p5js`](/docs/user-guide/skills/bundled/creative/creative-p5js) | Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as... | `creative/p5js` |
|
||||
| [`pixel-art`](/docs/user-guide/skills/bundled/creative/creative-pixel-art) | Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc.), and animate them into short videos. Presets cover arcade, SNES, and 10+ era-correct looks. Use `clarify` to let the user pick a style... | `creative/pixel-art` |
|
||||
| [`popular-web-designs`](/docs/user-guide/skills/bundled/creative/creative-popular-web-designs) | 54 production-quality design systems extracted from real websites. Load a template to generate HTML/CSS that matches the visual identity of sites like Stripe, Linear, Vercel, Notion, Airbnb, and more. Each template includes colors, typog... | `creative/popular-web-designs` |
|
||||
| [`songwriting-and-ai-music`](/docs/user-guide/skills/bundled/creative/creative-songwriting-and-ai-music) | Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation techniques, phonetic tricks, and lessons learned. These are tools and ideas, not rules. Break any of them when the art calls for it. | `creative/songwriting-and-ai-music` |
|
||||
|
||||
## data-science
|
||||
|
||||
Skills for data science workflows — interactive exploration, Jupyter notebooks, data analysis, and visualization.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `jupyter-live-kernel` | Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb. Load this skill when the task involves exploration, iteration, or inspecting intermediate results — data science, ML experimentation, API exploration, or building up complex code step-by-step. Uses… | `data-science/jupyter-live-kernel` |
|
||||
| [`jupyter-live-kernel`](/docs/user-guide/skills/bundled/data-science/data-science-jupyter-live-kernel) | Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb. Load this skill when the task involves exploration, iteration, or inspecting intermediate results — data science, ML experimentation, API exploration, or bui... | `data-science/jupyter-live-kernel` |
|
||||
|
||||
## devops
|
||||
|
||||
DevOps and infrastructure automation skills.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `webhook-subscriptions` | Create and manage webhook subscriptions for event-driven agent activation. Use when the user wants external services to trigger agent runs automatically. | `devops/webhook-subscriptions` |
|
||||
| [`webhook-subscriptions`](/docs/user-guide/skills/bundled/devops/devops-webhook-subscriptions) | Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost). Use when the user wants external services to trigger agent runs OR push notifications to chats. | `devops/webhook-subscriptions` |
|
||||
|
||||
## dogfood
|
||||
|
||||
Internal dogfooding and QA skills used to test Hermes Agent itself.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `dogfood` | Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports | `dogfood` |
|
||||
| `adversarial-ux-test` | Roleplay the most difficult, tech-resistant user for a product — browse in-persona, rant, then filter through a RED/YELLOW/WHITE/GREEN pragmatism layer so only real UX friction becomes tickets. | `dogfood/adversarial-ux-test` |
|
||||
| [`dogfood`](/docs/user-guide/skills/bundled/dogfood/dogfood-dogfood) | Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports | `dogfood` |
|
||||
|
||||
## email
|
||||
|
||||
Skills for sending, receiving, searching, and managing email from the terminal.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `himalaya` | CLI to manage emails via IMAP/SMTP. Use himalaya to list, read, write, reply, forward, search, and organize emails from the terminal. Supports multiple accounts and message composition with MML (MIME Meta Language). | `email/himalaya` |
|
||||
| [`himalaya`](/docs/user-guide/skills/bundled/email/email-himalaya) | CLI to manage emails via IMAP/SMTP. Use himalaya to list, read, write, reply, forward, search, and organize emails from the terminal. Supports multiple accounts and message composition with MML (MIME Meta Language). | `email/himalaya` |
|
||||
|
||||
## gaming
|
||||
|
||||
Skills for setting up, configuring, and managing game servers, modpacks, and gaming-related infrastructure.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `minecraft-modpack-server` | Set up a modded Minecraft server from a CurseForge/Modrinth server pack zip. Covers NeoForge/Forge install, Java version, JVM tuning, firewall, LAN config, backups, and launch scripts. | `gaming/minecraft-modpack-server` |
|
||||
| `pokemon-player` | Play Pokemon games autonomously via headless emulation. Starts a game server, reads structured game state from RAM, makes strategic decisions, and sends button inputs — all from the terminal. | `gaming/pokemon-player` |
|
||||
| [`minecraft-modpack-server`](/docs/user-guide/skills/bundled/gaming/gaming-minecraft-modpack-server) | Set up a modded Minecraft server from a CurseForge/Modrinth server pack zip. Covers NeoForge/Forge install, Java version, JVM tuning, firewall, LAN config, backups, and launch scripts. | `gaming/minecraft-modpack-server` |
|
||||
| [`pokemon-player`](/docs/user-guide/skills/bundled/gaming/gaming-pokemon-player) | Play Pokemon games autonomously via headless emulation. Starts a game server, reads structured game state from RAM, makes strategic decisions, and sends button inputs — all from the terminal. | `gaming/pokemon-player` |
|
||||
|
||||
## github
|
||||
|
||||
GitHub workflow skills for managing repositories, pull requests, code reviews, issues, and CI/CD pipelines.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `codebase-inspection` | Inspect and analyze codebases using pygount for LOC counting, language breakdown, and code-vs-comment ratios. Use when asked to check lines of code, repo size, language composition, or codebase stats. | `github/codebase-inspection` |
|
||||
| `github-auth` | Set up GitHub authentication for the agent using git (universally available) or the gh CLI. Covers HTTPS tokens, SSH keys, credential helpers, and gh auth — with a detection flow to pick the right method automatically. | `github/github-auth` |
|
||||
| `github-code-review` | Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-code-review` |
|
||||
| `github-issues` | Create, manage, triage, and close GitHub issues. Search existing issues, add labels, assign people, and link to PRs. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-issues` |
|
||||
| `github-pr-workflow` | Full pull request lifecycle — create branches, commit changes, open PRs, monitor CI status, auto-fix failures, and merge. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-pr-workflow` |
|
||||
| `github-repo-management` | Clone, create, fork, configure, and manage GitHub repositories. Manage remotes, secrets, releases, and workflows. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-repo-management` |
|
||||
| [`codebase-inspection`](/docs/user-guide/skills/bundled/github/github-codebase-inspection) | Inspect and analyze codebases using pygount for LOC counting, language breakdown, and code-vs-comment ratios. Use when asked to check lines of code, repo size, language composition, or codebase stats. | `github/codebase-inspection` |
|
||||
| [`github-auth`](/docs/user-guide/skills/bundled/github/github-github-auth) | Set up GitHub authentication for the agent using git (universally available) or the gh CLI. Covers HTTPS tokens, SSH keys, credential helpers, and gh auth — with a detection flow to pick the right method automatically. | `github/github-auth` |
|
||||
| [`github-code-review`](/docs/user-guide/skills/bundled/github/github-github-code-review) | Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-code-review` |
|
||||
| [`github-issues`](/docs/user-guide/skills/bundled/github/github-github-issues) | Create, manage, triage, and close GitHub issues. Search existing issues, add labels, assign people, and link to PRs. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-issues` |
|
||||
| [`github-pr-workflow`](/docs/user-guide/skills/bundled/github/github-github-pr-workflow) | Full pull request lifecycle — create branches, commit changes, open PRs, monitor CI status, auto-fix failures, and merge. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-pr-workflow` |
|
||||
| [`github-repo-management`](/docs/user-guide/skills/bundled/github/github-github-repo-management) | Clone, create, fork, configure, and manage GitHub repositories. Manage remotes, secrets, releases, and workflows. Works with gh CLI or falls back to git + GitHub REST API via curl. | `github/github-repo-management` |
|
||||
|
||||
## mcp
|
||||
|
||||
Skills for working with MCP (Model Context Protocol) servers, tools, and integrations.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `native-mcp` | Built-in MCP (Model Context Protocol) client that connects to external MCP servers, discovers their tools, and registers them as native Hermes Agent tools. Supports stdio and HTTP transports with automatic reconnection, security filtering, and zero-config tool injection. | `mcp/native-mcp` |
|
||||
| [`native-mcp`](/docs/user-guide/skills/bundled/mcp/mcp-native-mcp) | Built-in MCP (Model Context Protocol) client that connects to external MCP servers, discovers their tools, and registers them as native Hermes Agent tools. Supports stdio and HTTP transports with automatic reconnection, security filterin... | `mcp/native-mcp` |
|
||||
|
||||
## media
|
||||
|
||||
Skills for working with media content — YouTube transcripts, GIF search, music generation, and audio visualization.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `gif-search` | Search and download GIFs from Tenor using curl. No dependencies beyond curl and jq. Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat. | `media/gif-search` |
|
||||
| `heartmula` | Set up and run HeartMuLa, the open-source music generation model family (Suno-like). Generates full songs from lyrics + tags with multilingual support. | `media/heartmula` |
|
||||
| `songsee` | Generate spectrograms and audio feature visualizations (mel, chroma, MFCC, tempogram, etc.) from audio files via CLI. Useful for audio analysis, music production debugging, and visual documentation. | `media/songsee` |
|
||||
| `youtube-content` | Fetch YouTube video transcripts and transform them into structured content (chapters, summaries, threads, blog posts). Use when the user shares a YouTube URL or video link, asks to summarize a video, requests a transcript, or wants to extract and reformat content from any YouT… | `media/youtube-content` |
|
||||
| [`gif-search`](/docs/user-guide/skills/bundled/media/media-gif-search) | Search and download GIFs from Tenor using curl. No dependencies beyond curl and jq. Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat. | `media/gif-search` |
|
||||
| [`heartmula`](/docs/user-guide/skills/bundled/media/media-heartmula) | Set up and run HeartMuLa, the open-source music generation model family (Suno-like). Generates full songs from lyrics + tags with multilingual support. | `media/heartmula` |
|
||||
| [`songsee`](/docs/user-guide/skills/bundled/media/media-songsee) | Generate spectrograms and audio feature visualizations (mel, chroma, MFCC, tempogram, etc.) from audio files via CLI. Useful for audio analysis, music production debugging, and visual documentation. | `media/songsee` |
|
||||
| [`youtube-content`](/docs/user-guide/skills/bundled/media/media-youtube-content) | Fetch YouTube video transcripts and transform them into structured content (chapters, summaries, threads, blog posts). Use when the user shares a YouTube URL or video link, asks to summarize a video, requests a transcript, or wants to ex... | `media/youtube-content` |
|
||||
|
||||
## mlops
|
||||
|
||||
General-purpose ML operations tools — model hub management, dataset operations, and workflow orchestration.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `huggingface-hub` | Hugging Face Hub CLI (hf) — search, download, and upload models and datasets, manage repos, query datasets with SQL, deploy inference endpoints, manage Spaces and buckets. | `mlops/huggingface-hub` |
|
||||
|
||||
## mlops/evaluation
|
||||
|
||||
Model evaluation benchmarks, experiment tracking, and interpretability tools.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `evaluating-llms-harness` | Evaluates LLMs across 60+ academic benchmarks (MMLU, HumanEval, GSM8K, TruthfulQA, HellaSwag). Use when benchmarking model quality, comparing models, reporting academic results, or tracking training progress. Industry standard used by EleutherAI, HuggingFace, and major labs. S… | `mlops/evaluation/lm-evaluation-harness` |
|
||||
| `weights-and-biases` | Track ML experiments with automatic logging, visualize training in real-time, optimize hyperparameters with sweeps, and manage model registry with W&B - collaborative MLOps platform | `mlops/evaluation/weights-and-biases` |
|
||||
|
||||
## mlops/inference
|
||||
|
||||
Model serving, quantization (GGUF/GPTQ), structured output, inference optimization, and model surgery tools for deploying and running LLMs.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `llama-cpp` | Run LLM inference with llama.cpp on CPU, Apple Silicon, AMD/Intel GPUs, or NVIDIA — plus GGUF model conversion and quantization (2–8 bit with K-quants and imatrix). Covers CLI, Python bindings, OpenAI-compatible server, and Ollama/LM Studio integration. Use for edge deployment… | `mlops/inference/llama-cpp` |
|
||||
| `obliteratus` | Remove refusal behaviors from open-weight LLMs using OBLITERATUS — mechanistic interpretability techniques (diff-in-means, SVD, whitened SVD, LEACE, SAE decomposition, etc.) to excise guardrails while preserving reasoning. 9 CLI methods, 28 analysis modules, 116 model presets … | `mlops/inference/obliteratus` |
|
||||
| `outlines` | Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library | `mlops/inference/outlines` |
|
||||
| `serving-llms-vllm` | Serves LLMs with high throughput using vLLM's PagedAttention and continuous batching. Use when deploying production LLM APIs, optimizing inference latency/throughput, or serving models with limited GPU memory. Supports OpenAI-compatible endpoints, quantization (GPTQ/AWQ/FP8), … | `mlops/inference/vllm` |
|
||||
|
||||
## mlops/models
|
||||
|
||||
Specific model architectures — image segmentation (SAM) and audio generation (AudioCraft / MusicGen). Additional model skills (CLIP, Stable Diffusion, Whisper, LLaVA) are available as optional skills.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `audiocraft-audio-generation` | PyTorch library for audio generation including text-to-music (MusicGen) and text-to-sound (AudioGen). Use when you need to generate music from text descriptions, create sound effects, or perform melody-conditioned music generation. | `mlops/models/audiocraft` |
|
||||
| `segment-anything-model` | Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image. | `mlops/models/segment-anything` |
|
||||
|
||||
## mlops/research
|
||||
|
||||
ML research frameworks for building and optimizing AI systems with declarative programming.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `dspy` | Build complex AI systems with declarative programming, optimize prompts automatically, create modular RAG systems and agents with DSPy - Stanford NLP's framework for systematic LM programming | `mlops/research/dspy` |
|
||||
|
||||
## mlops/training
|
||||
|
||||
Fine-tuning, RLHF/DPO/GRPO training, distributed training frameworks, and optimization tools.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `axolotl` | Expert guidance for fine-tuning LLMs with Axolotl - YAML configs, 100+ models, LoRA/QLoRA, DPO/KTO/ORPO/GRPO, multimodal support | `mlops/training/axolotl` |
|
||||
| `fine-tuning-with-trl` | Fine-tune LLMs using reinforcement learning with TRL - SFT for instruction tuning, DPO for preference alignment, PPO/GRPO for reward optimization, and reward model training. Use when need RLHF, align model with preferences, or train from human feedback. Works with HuggingFace … | `mlops/training/trl-fine-tuning` |
|
||||
| `unsloth` | Expert guidance for fast fine-tuning with Unsloth - 2-5x faster training, 50-80% less memory, LoRA/QLoRA optimization | `mlops/training/unsloth` |
|
||||
| [`audiocraft-audio-generation`](/docs/user-guide/skills/bundled/mlops/mlops-models-audiocraft) | PyTorch library for audio generation including text-to-music (MusicGen) and text-to-sound (AudioGen). Use when you need to generate music from text descriptions, create sound effects, or perform melody-conditioned music generation. | `mlops/models/audiocraft` |
|
||||
| [`axolotl`](/docs/user-guide/skills/bundled/mlops/mlops-training-axolotl) | Expert guidance for fine-tuning LLMs with Axolotl - YAML configs, 100+ models, LoRA/QLoRA, DPO/KTO/ORPO/GRPO, multimodal support | `mlops/training/axolotl` |
|
||||
| [`dspy`](/docs/user-guide/skills/bundled/mlops/mlops-research-dspy) | Build complex AI systems with declarative programming, optimize prompts automatically, create modular RAG systems and agents with DSPy - Stanford NLP's framework for systematic LM programming | `mlops/research/dspy` |
|
||||
| [`huggingface-hub`](/docs/user-guide/skills/bundled/mlops/mlops-huggingface-hub) | Hugging Face Hub CLI (hf) — search, download, and upload models and datasets, manage repos, query datasets with SQL, deploy inference endpoints, manage Spaces and buckets. | `mlops/huggingface-hub` |
|
||||
| [`llama-cpp`](/docs/user-guide/skills/bundled/mlops/mlops-inference-llama-cpp) | llama.cpp local GGUF inference + HF Hub model discovery. | `mlops/inference/llama-cpp` |
|
||||
| [`evaluating-llms-harness`](/docs/user-guide/skills/bundled/mlops/mlops-evaluation-lm-evaluation-harness) | Evaluates LLMs across 60+ academic benchmarks (MMLU, HumanEval, GSM8K, TruthfulQA, HellaSwag). Use when benchmarking model quality, comparing models, reporting academic results, or tracking training progress. Industry standard used by El... | `mlops/evaluation/lm-evaluation-harness` |
|
||||
| [`obliteratus`](/docs/user-guide/skills/bundled/mlops/mlops-inference-obliteratus) | Remove refusal behaviors from open-weight LLMs using OBLITERATUS — mechanistic interpretability techniques (diff-in-means, SVD, whitened SVD, LEACE, SAE decomposition, etc.) to excise guardrails while preserving reasoning. 9 CLI methods,... | `mlops/inference/obliteratus` |
|
||||
| [`outlines`](/docs/user-guide/skills/bundled/mlops/mlops-inference-outlines) | Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library | `mlops/inference/outlines` |
|
||||
| [`segment-anything-model`](/docs/user-guide/skills/bundled/mlops/mlops-models-segment-anything) | Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image. | `mlops/models/segment-anything` |
|
||||
| [`fine-tuning-with-trl`](/docs/user-guide/skills/bundled/mlops/mlops-training-trl-fine-tuning) | Fine-tune LLMs using reinforcement learning with TRL - SFT for instruction tuning, DPO for preference alignment, PPO/GRPO for reward optimization, and reward model training. Use when need RLHF, align model with preferences, or train from... | `mlops/training/trl-fine-tuning` |
|
||||
| [`unsloth`](/docs/user-guide/skills/bundled/mlops/mlops-training-unsloth) | Expert guidance for fast fine-tuning with Unsloth - 2-5x faster training, 50-80% less memory, LoRA/QLoRA optimization | `mlops/training/unsloth` |
|
||||
| [`serving-llms-vllm`](/docs/user-guide/skills/bundled/mlops/mlops-inference-vllm) | Serves LLMs with high throughput using vLLM's PagedAttention and continuous batching. Use when deploying production LLM APIs, optimizing inference latency/throughput, or serving models with limited GPU memory. Supports OpenAI-compatible... | `mlops/inference/vllm` |
|
||||
| [`weights-and-biases`](/docs/user-guide/skills/bundled/mlops/mlops-evaluation-weights-and-biases) | Track ML experiments with automatic logging, visualize training in real-time, optimize hyperparameters with sweeps, and manage model registry with W&B - collaborative MLOps platform | `mlops/evaluation/weights-and-biases` |
|
||||
|
||||
## note-taking
|
||||
|
||||
Note taking skills, to save information, assist with research, and collaborate on multi-session planning.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `obsidian` | Read, search, and create notes in the Obsidian vault. | `note-taking/obsidian` |
|
||||
| [`obsidian`](/docs/user-guide/skills/bundled/note-taking/note-taking-obsidian) | Read, search, and create notes in the Obsidian vault. | `note-taking/obsidian` |
|
||||
|
||||
## productivity
|
||||
|
||||
Skills for document creation, presentations, spreadsheets, and other productivity workflows.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `google-workspace` | Gmail, Calendar, Drive, Contacts, Sheets, and Docs integration for Hermes. Uses Hermes-managed OAuth2 setup, prefers the Google Workspace CLI (`gws`) when available for broader API coverage, and falls back to the Python client libraries otherwise. | `productivity/google-workspace` |
|
||||
| `linear` | Manage Linear issues, projects, and teams via the GraphQL API. Create, update, search, and organize issues. Uses API key auth (no OAuth needed). All operations via curl — no dependencies. | `productivity/linear` |
|
||||
| `maps` | Location intelligence — geocode, reverse-geocode, nearby POI search (44 categories, coordinates or address via `--near`), driving/walking/cycling distance + time, turn-by-turn directions, timezone, bounding box + area, POI search in a rectangle. Uses OpenStreetMap + Overpass + OSRM. No API key needed. Telegram location-pin friendly. | `productivity/maps` |
|
||||
| `nano-pdf` | Edit PDFs with natural-language instructions using the nano-pdf CLI. Modify text, fix typos, update titles, and make content changes to specific pages without manual editing. | `productivity/nano-pdf` |
|
||||
| `notion` | Notion API for creating and managing pages, databases, and blocks via curl. Search, create, update, and query Notion workspaces directly from the terminal. | `productivity/notion` |
|
||||
| `ocr-and-documents` | Extract text from PDFs and scanned documents. Use web_extract for remote URLs, pymupdf for local text-based PDFs, marker-pdf for OCR/scanned docs. For DOCX use python-docx, for PPTX see the powerpoint skill. | `productivity/ocr-and-documents` |
|
||||
| `powerpoint` | Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in … | `productivity/powerpoint` |
|
||||
| [`google-workspace`](/docs/user-guide/skills/bundled/productivity/productivity-google-workspace) | Gmail, Calendar, Drive, Contacts, Sheets, and Docs integration for Hermes. Uses Hermes-managed OAuth2 setup, prefers the Google Workspace CLI (`gws`) when available for broader API coverage, and falls back to the Python client libraries... | `productivity/google-workspace` |
|
||||
| [`linear`](/docs/user-guide/skills/bundled/productivity/productivity-linear) | Manage Linear issues, projects, and teams via the GraphQL API. Create, update, search, and organize issues. Uses API key auth (no OAuth needed). All operations via curl — no dependencies. | `productivity/linear` |
|
||||
| [`maps`](/docs/user-guide/skills/bundled/productivity/productivity-maps) | Location intelligence — geocode a place, reverse-geocode coordinates, find nearby places (46 POI categories), driving/walking/cycling distance + time, turn-by-turn directions, timezone lookup, bounding box + area for a named place, and P... | `productivity/maps` |
|
||||
| [`nano-pdf`](/docs/user-guide/skills/bundled/productivity/productivity-nano-pdf) | Edit PDFs with natural-language instructions using the nano-pdf CLI. Modify text, fix typos, update titles, and make content changes to specific pages without manual editing. | `productivity/nano-pdf` |
|
||||
| [`notion`](/docs/user-guide/skills/bundled/productivity/productivity-notion) | Notion API for creating and managing pages, databases, and blocks via curl. Search, create, update, and query Notion workspaces directly from the terminal. | `productivity/notion` |
|
||||
| [`ocr-and-documents`](/docs/user-guide/skills/bundled/productivity/productivity-ocr-and-documents) | Extract text from PDFs and scanned documents. Use web_extract for remote URLs, pymupdf for local text-based PDFs, marker-pdf for OCR/scanned docs. For DOCX use python-docx, for PPTX see the powerpoint skill. | `productivity/ocr-and-documents` |
|
||||
| [`powerpoint`](/docs/user-guide/skills/bundled/productivity/productivity-powerpoint) | Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted... | `productivity/powerpoint` |
|
||||
|
||||
## red-teaming
|
||||
|
||||
Skills for LLM red-teaming, jailbreaking, and safety filter bypass research.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `godmode` | Jailbreak API-served LLMs using G0DM0D3 techniques — Parseltongue input obfuscation (33 techniques), GODMODE CLASSIC system prompt templates, ULTRAPLINIAN multi-model racing, encoding escalation, and Hermes-native prefill/system prompt integration. Use when a user wants to byp… | `red-teaming/godmode` |
|
||||
| [`godmode`](/docs/user-guide/skills/bundled/red-teaming/red-teaming-godmode) | Jailbreak API-served LLMs using G0DM0D3 techniques — Parseltongue input obfuscation (33 techniques), GODMODE CLASSIC system prompt templates, ULTRAPLINIAN multi-model racing, encoding escalation, and Hermes-native prefill/system prompt i... | `red-teaming/godmode` |
|
||||
|
||||
## research
|
||||
|
||||
Skills for academic research, paper discovery, literature review, market data, content monitoring, and scientific knowledge retrieval.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `arxiv` | Search and retrieve academic papers from arXiv using their free REST API. No API key needed. Search by keyword, author, category, or ID. Combine with web_extract or the ocr-and-documents skill to read full paper content. | `research/arxiv` |
|
||||
| `blogwatcher` | Monitor blogs and RSS/Atom feeds for updates using the blogwatcher-cli tool. Add blogs, scan for new articles, track read status, and filter by category. | `research/blogwatcher` |
|
||||
| `llm-wiki` | Karpathy's LLM Wiki — build and maintain a persistent, interlinked markdown knowledge base. Ingest sources, query compiled knowledge, and lint for consistency. | `research/llm-wiki` |
|
||||
| `polymarket` | Query Polymarket prediction market data — search markets, get prices, orderbooks, and price history. Read-only via public REST APIs, no API key needed. | `research/polymarket` |
|
||||
| `research-paper-writing` | End-to-end pipeline for writing ML/AI research papers — from experiment design through analysis, drafting, revision, and submission. Covers NeurIPS, ICML, ICLR, ACL, AAAI, COLM. Integrates automated experiment monitoring, statistical analysis, iterative writing, and citation v… | `research/research-paper-writing` |
|
||||
| [`arxiv`](/docs/user-guide/skills/bundled/research/research-arxiv) | Search and retrieve academic papers from arXiv using their free REST API. No API key needed. Search by keyword, author, category, or ID. Combine with web_extract or the ocr-and-documents skill to read full paper content. | `research/arxiv` |
|
||||
| [`blogwatcher`](/docs/user-guide/skills/bundled/research/research-blogwatcher) | Monitor blogs and RSS/Atom feeds for updates using the blogwatcher-cli tool. Add blogs, scan for new articles, track read status, and filter by category. | `research/blogwatcher` |
|
||||
| [`llm-wiki`](/docs/user-guide/skills/bundled/research/research-llm-wiki) | Karpathy's LLM Wiki — build and maintain a persistent, interlinked markdown knowledge base. Ingest sources, query compiled knowledge, and lint for consistency. | `research/llm-wiki` |
|
||||
| [`polymarket`](/docs/user-guide/skills/bundled/research/research-polymarket) | Query Polymarket prediction market data — search markets, get prices, orderbooks, and price history. Read-only via public REST APIs, no API key needed. | `research/polymarket` |
|
||||
| [`research-paper-writing`](/docs/user-guide/skills/bundled/research/research-research-paper-writing) | End-to-end pipeline for writing ML/AI research papers — from experiment design through analysis, drafting, revision, and submission. Covers NeurIPS, ICML, ICLR, ACL, AAAI, COLM. Integrates automated experiment monitoring, statistical ana... | `research/research-paper-writing` |
|
||||
|
||||
## smart-home
|
||||
|
||||
Skills for controlling smart home devices — lights, switches, sensors, and home automation systems.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `openhue` | Control Philips Hue lights, rooms, and scenes via the OpenHue CLI. Turn lights on/off, adjust brightness, color, color temperature, and activate scenes. | `smart-home/openhue` |
|
||||
| [`openhue`](/docs/user-guide/skills/bundled/smart-home/smart-home-openhue) | Control Philips Hue lights, rooms, and scenes via the OpenHue CLI. Turn lights on/off, adjust brightness, color, color temperature, and activate scenes. | `smart-home/openhue` |
|
||||
|
||||
## social-media
|
||||
|
||||
Skills for interacting with social platforms — posting, reading, monitoring, and account operations.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `xurl` | Interact with X/Twitter via xurl, the official X API CLI. Use for posting, replying, quoting, searching, timelines, mentions, likes, reposts, bookmarks, follows, DMs, media upload, and raw v2 endpoint access. | `social-media/xurl` |
|
||||
| [`xurl`](/docs/user-guide/skills/bundled/social-media/social-media-xurl) | Interact with X/Twitter via xurl, the official X API CLI. Use for posting, replying, quoting, searching, timelines, mentions, likes, reposts, bookmarks, follows, DMs, media upload, and raw v2 endpoint access. | `social-media/xurl` |
|
||||
|
||||
## software-development
|
||||
|
||||
General software-engineering skills — planning, reviewing, debugging, and test-driven development.
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `plan` | Plan mode for Hermes — inspect context, write a markdown plan into the active workspace's `.hermes/plans/` directory, and do not execute the work. | `software-development/plan` |
|
||||
| `requesting-code-review` | Pre-commit verification pipeline — static security scan, baseline-aware quality gates, independent reviewer subagent, and auto-fix loop. Use after code changes and before committing, pushing, or opening a PR. | `software-development/requesting-code-review` |
|
||||
| `subagent-driven-development` | Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality). | `software-development/subagent-driven-development` |
|
||||
| `systematic-debugging` | Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first. | `software-development/systematic-debugging` |
|
||||
| `test-driven-development` | Use when implementing any feature or bugfix, before writing implementation code. Enforces RED-GREEN-REFACTOR cycle with test-first approach. | `software-development/test-driven-development` |
|
||||
| `writing-plans` | Use when you have a spec or requirements for a multi-step task. Creates comprehensive implementation plans with bite-sized tasks, exact file paths, and complete code examples. | `software-development/writing-plans` |
|
||||
|
||||
|
||||
---
|
||||
|
||||
# Optional Skills
|
||||
|
||||
Optional skills ship with the repository under `optional-skills/` but are **not active by default**. They cover heavier or niche use cases. Install them with:
|
||||
|
||||
```bash
|
||||
hermes skills install official/<category>/<skill>
|
||||
```
|
||||
|
||||
## autonomous-ai-agents
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `blackbox` | Delegate coding tasks to Blackbox AI CLI agent. Multi-model agent with built-in judge that runs tasks through multiple LLMs and picks the best result. Requires the blackbox CLI and a Blackbox AI API key. | `autonomous-ai-agents/blackbox` |
|
||||
|
||||
## blockchain
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `base` | Query Base (Ethereum L2) blockchain data with USD pricing — wallet balances, token info, transaction details, gas analysis, contract inspection, whale detection, and live network stats. Uses Base RPC + CoinGecko. No API key required. | `blockchain/base` |
|
||||
| `solana` | Query Solana blockchain data with USD pricing — wallet balances, token portfolios with values, transaction details, NFTs, whale detection, and live network stats. Uses Solana RPC + CoinGecko. No API key required. | `blockchain/solana` |
|
||||
|
||||
## creative
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `blender-mcp` | Control Blender directly from Hermes via socket connection to the blender-mcp addon. Create 3D objects, materials, animations, and run arbitrary Blender Python (bpy) code. | `creative/blender-mcp` |
|
||||
| `meme-generation` | Generate real meme images by picking a template and overlaying text with Pillow. Produces actual .png meme files. | `creative/meme-generation` |
|
||||
| `touchdesigner-mcp` | Control a running TouchDesigner instance via the twozero MCP plugin — create operators, set parameters, wire connections, execute Python, build real-time audio-reactive visuals and GLSL networks. 36 native tools. | `creative/touchdesigner-mcp` |
|
||||
|
||||
## devops
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `docker-management` | Manage Docker containers, images, volumes, networks, and Compose stacks — lifecycle ops, debugging, cleanup, and Dockerfile optimization. | `devops/docker-management` |
|
||||
|
||||
## email
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `agentmail` | Give the agent its own dedicated email inbox via AgentMail. Send, receive, and manage email autonomously using agent-owned email addresses (e.g. hermes-agent@agentmail.to). | `email/agentmail` |
|
||||
|
||||
## health
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `neuroskill-bci` | Connect to a running NeuroSkill instance and incorporate the user's real-time cognitive and emotional state (focus, relaxation, mood, cognitive load, drowsiness, heart rate, HRV, sleep staging, and 40+ derived EXG scores) into responses. Requires a BCI wearable (Muse 2/S or OpenBCI) and the NeuroSkill desktop app. | `health/neuroskill-bci` |
|
||||
|
||||
## mcp
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `fastmcp` | Build, test, inspect, install, and deploy MCP servers with FastMCP in Python. Use when creating a new MCP server, wrapping an API or database as MCP tools, exposing resources or prompts, or preparing a FastMCP server for HTTP deployment. | `mcp/fastmcp` |
|
||||
|
||||
## migration
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `openclaw-migration` | Migrate a user's OpenClaw customization footprint into Hermes Agent. Imports Hermes-compatible memories, SOUL.md, command allowlists, user skills, and selected workspace assets from ~/.openclaw, then reports what could not be migrated and why. | `migration/openclaw-migration` |
|
||||
|
||||
## productivity
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `telephony` | Give Hermes phone capabilities — provision and persist a Twilio number, send and receive SMS/MMS, make direct calls, and place AI-driven outbound calls through Bland.ai or Vapi. | `productivity/telephony` |
|
||||
|
||||
## research
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `bioinformatics` | Gateway to 400+ bioinformatics skills from bioSkills and ClawBio. Covers genomics, transcriptomics, single-cell, variant calling, pharmacogenomics, metagenomics, structural biology, and more. | `research/bioinformatics` |
|
||||
| `qmd` | Search personal knowledge bases, notes, docs, and meeting transcripts locally using qmd — a hybrid retrieval engine with BM25, vector search, and LLM reranking. Supports CLI and MCP integration. | `research/qmd` |
|
||||
|
||||
## security
|
||||
|
||||
| Skill | Description | Path |
|
||||
|-------|-------------|------|
|
||||
| `1password` | Set up and use 1Password CLI (op). Use when installing the CLI, enabling desktop app integration, signing in, and reading/injecting secrets for commands. | `security/1password` |
|
||||
| `oss-forensics` | Supply chain investigation, evidence recovery, and forensic analysis for GitHub repositories. Covers deleted commit recovery, force-push detection, IOC extraction, multi-source evidence collection, and structured forensic reporting. | `security/oss-forensics` |
|
||||
| `sherlock` | OSINT username search across 400+ social networks. Hunt down social media accounts by username. | `security/sherlock` |
|
||||
| [`plan`](/docs/user-guide/skills/bundled/software-development/software-development-plan) | Plan mode for Hermes — inspect context, write a markdown plan into the active workspace's `.hermes/plans/` directory, and do not execute the work. | `software-development/plan` |
|
||||
| [`requesting-code-review`](/docs/user-guide/skills/bundled/software-development/software-development-requesting-code-review) | Pre-commit verification pipeline — static security scan, baseline-aware quality gates, independent reviewer subagent, and auto-fix loop. Use after code changes and before committing, pushing, or opening a PR. | `software-development/requesting-code-review` |
|
||||
| [`subagent-driven-development`](/docs/user-guide/skills/bundled/software-development/software-development-subagent-driven-development) | Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality). | `software-development/subagent-driven-development` |
|
||||
| [`systematic-debugging`](/docs/user-guide/skills/bundled/software-development/software-development-systematic-debugging) | Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first. | `software-development/systematic-debugging` |
|
||||
| [`test-driven-development`](/docs/user-guide/skills/bundled/software-development/software-development-test-driven-development) | Use when implementing any feature or bugfix, before writing implementation code. Enforces RED-GREEN-REFACTOR cycle with test-first approach. | `software-development/test-driven-development` |
|
||||
| [`writing-plans`](/docs/user-guide/skills/bundled/software-development/software-development-writing-plans) | Use when you have a spec or requirements for a multi-step task. Creates comprehensive implementation plans with bite-sized tasks, exact file paths, and complete code examples. | `software-development/writing-plans` |
|
||||
|
||||
@@ -6,9 +6,9 @@ description: "Authoritative reference for Hermes built-in tools, grouped by tool
|
||||
|
||||
# Built-in Tools Reference
|
||||
|
||||
This page documents all 53 built-in tools in the Hermes tool registry, grouped by toolset. Availability varies by platform, credentials, and enabled toolsets.
|
||||
This page documents all 55 built-in tools in the Hermes tool registry, grouped by toolset. Availability varies by platform, credentials, and enabled toolsets.
|
||||
|
||||
**Quick counts:** 11 browser tools, 4 file tools, 10 RL tools, 4 Home Assistant tools, 2 terminal tools, 2 web tools, 5 Feishu tools, and 15 standalone tools across other toolsets.
|
||||
**Quick counts:** 12 browser tools, 4 file tools, 10 RL tools, 4 Home Assistant tools, 2 terminal tools, 2 web tools, 5 Feishu tools, and 15 standalone tools across other toolsets.
|
||||
|
||||
:::tip MCP Tools
|
||||
In addition to built-in tools, Hermes can load tools dynamically from MCP servers. MCP tools appear with a server-name prefix (e.g., `github_create_issue` for the `github` MCP server). See [MCP Integration](/docs/user-guide/features/mcp) for configuration.
|
||||
@@ -20,6 +20,7 @@ In addition to built-in tools, Hermes can load tools dynamically from MCP server
|
||||
|------|-------------|----------------------|
|
||||
| `browser_back` | Navigate back to the previous page in browser history. Requires browser_navigate to be called first. | — |
|
||||
| `browser_cdp` | Send a raw Chrome DevTools Protocol (CDP) command. Escape hatch for browser operations not covered by browser_navigate, browser_click, browser_console, etc. Only available when a CDP endpoint is reachable at session start — via `/browser connect` or `browser.cdp_url` config. See https://chromedevtools.github.io/devtools-protocol/ | — |
|
||||
| `browser_dialog` | Respond to a native JavaScript dialog (alert / confirm / prompt / beforeunload). Call `browser_snapshot` first — pending dialogs appear in its `pending_dialogs` field. Then call `browser_dialog(action='accept'|'dismiss')`. Same availability as `browser_cdp` (Browserbase or `/browser connect`). | — |
|
||||
| `browser_click` | Click on an element identified by its ref ID from the snapshot (e.g., '@e5'). The ref IDs are shown in square brackets in the snapshot output. Requires browser_navigate and browser_snapshot to be called first. | — |
|
||||
| `browser_console` | Get browser console output and JavaScript errors from the current page. Returns console.log/warn/error/info messages and uncaught JS exceptions. Use this to detect silent JavaScript errors, failed API calls, and application warnings. Requi… | — |
|
||||
| `browser_get_images` | Get a list of all images on the current page with their URLs and alt text. Useful for finding images to analyze with the vision tool. Requires browser_navigate to be called first. | — |
|
||||
|
||||
@@ -52,7 +52,7 @@ Or in-session:
|
||||
|
||||
| Toolset | Tools | Purpose |
|
||||
|---------|-------|---------|
|
||||
| `browser` | `browser_back`, `browser_cdp`, `browser_click`, `browser_console`, `browser_get_images`, `browser_navigate`, `browser_press`, `browser_scroll`, `browser_snapshot`, `browser_type`, `browser_vision`, `web_search` | Full browser automation. Includes `web_search` as a fallback for quick lookups. `browser_cdp` is a raw CDP passthrough gated on a reachable CDP endpoint — it only appears when `/browser connect` is active or `browser.cdp_url` is set. |
|
||||
| `browser` | `browser_back`, `browser_cdp`, `browser_click`, `browser_console`, `browser_dialog`, `browser_get_images`, `browser_navigate`, `browser_press`, `browser_scroll`, `browser_snapshot`, `browser_type`, `browser_vision`, `web_search` | Full browser automation. Includes `web_search` as a fallback for quick lookups. `browser_cdp` and `browser_dialog` are gated on a reachable CDP endpoint — they only appear when `/browser connect` is active, `browser.cdp_url` is set, or a Browserbase session is active. `browser_dialog` works together with the `pending_dialogs` and `frame_tree` fields that `browser_snapshot` adds when a CDP supervisor is attached. |
|
||||
| `clarify` | `clarify` | Ask the user a question when the agent needs clarification. |
|
||||
| `code_execution` | `execute_code` | Run Python scripts that call Hermes tools programmatically. |
|
||||
| `cronjob` | `cronjob` | Schedule and manage recurring tasks. |
|
||||
|
||||
@@ -431,6 +431,35 @@ file_read_max_chars: 30000
|
||||
|
||||
The agent also deduplicates file reads automatically — if the same file region is read twice and the file hasn't changed, a lightweight stub is returned instead of re-sending the content. This resets on context compression so the agent can re-read files after their content is summarized away.
|
||||
|
||||
## Tool Output Truncation Limits
|
||||
|
||||
Three related caps control how much raw output a tool can return before Hermes truncates it:
|
||||
|
||||
```yaml
|
||||
tool_output:
|
||||
max_bytes: 50000 # terminal output cap (chars)
|
||||
max_lines: 2000 # read_file pagination cap
|
||||
max_line_length: 2000 # per-line cap in read_file's line-numbered view
|
||||
```
|
||||
|
||||
- **`max_bytes`** — When a `terminal` command produces more than this many characters of combined stdout/stderr, Hermes keeps the first 40% and last 60% and inserts a `[OUTPUT TRUNCATED]` notice between them. Default `50000` (≈12-15K tokens across typical tokenisers).
|
||||
- **`max_lines`** — Upper bound on the `limit` parameter of a single `read_file` call. Requests above this are clamped so a single read can't flood the context window. Default `2000`.
|
||||
- **`max_line_length`** — Per-line cap applied when `read_file` emits the line-numbered view. Lines longer than this are truncated to this many chars followed by `... [truncated]`. Default `2000`.
|
||||
|
||||
Raise the limits on models with large context windows that can afford more raw output per call. Lower them for small-context models to keep tool results compact:
|
||||
|
||||
```yaml
|
||||
# Large context model (200K+)
|
||||
tool_output:
|
||||
max_bytes: 150000
|
||||
max_lines: 5000
|
||||
|
||||
# Small local model (16K context)
|
||||
tool_output:
|
||||
max_bytes: 20000
|
||||
max_lines: 500
|
||||
```
|
||||
|
||||
## Git Worktree Isolation
|
||||
|
||||
Enable isolated git worktrees for running multiple agents in parallel on the same repo:
|
||||
@@ -1211,10 +1240,26 @@ browser:
|
||||
inactivity_timeout: 120 # Seconds before auto-closing idle sessions
|
||||
command_timeout: 30 # Timeout in seconds for browser commands (screenshot, navigate, etc.)
|
||||
record_sessions: false # Auto-record browser sessions as WebM videos to ~/.hermes/browser_recordings/
|
||||
# Optional CDP override — when set, Hermes attaches directly to your own
|
||||
# Chrome (via /browser connect) rather than starting a headless browser.
|
||||
cdp_url: ""
|
||||
# Dialog supervisor — controls how native JS dialogs (alert / confirm / prompt)
|
||||
# are handled when a CDP backend is attached (Browserbase, local Chrome via
|
||||
# /browser connect). Ignored on Camofox and default local agent-browser mode.
|
||||
dialog_policy: must_respond # must_respond | auto_dismiss | auto_accept
|
||||
dialog_timeout_s: 300 # Safety auto-dismiss under must_respond (seconds)
|
||||
camofox:
|
||||
managed_persistence: false # When true, Camofox sessions persist cookies/logins across restarts
|
||||
```
|
||||
|
||||
**Dialog policies:**
|
||||
|
||||
- `must_respond` (default) — capture the dialog, surface it in `browser_snapshot.pending_dialogs`, and wait for the agent to call `browser_dialog(action=...)`. After `dialog_timeout_s` seconds with no response, the dialog is auto-dismissed to prevent the page's JS thread from stalling forever.
|
||||
- `auto_dismiss` — capture, dismiss immediately. The agent still sees the dialog record in `browser_snapshot.recent_dialogs` with `closed_by="auto_policy"` after the fact.
|
||||
- `auto_accept` — capture, accept immediately. Useful for pages with aggressive `beforeunload` prompts.
|
||||
|
||||
See the [browser feature page](./features/browser.md#browser_dialog) for the full dialog workflow.
|
||||
|
||||
The browser toolset supports multiple providers. See the [Browser feature page](/docs/user-guide/features/browser) for details on Browserbase, Browser Use, and local Chrome CDP setup.
|
||||
|
||||
## Timezone
|
||||
|
||||
@@ -355,7 +355,50 @@ browser_cdp(method="Runtime.evaluate",
|
||||
browser_cdp(method="Network.getAllCookies")
|
||||
```
|
||||
|
||||
Browser-level methods (`Target.*`, `Browser.*`, `Storage.*`) omit `target_id`. Page-level methods (`Page.*`, `Runtime.*`, `DOM.*`, `Emulation.*`) require a `target_id` from `Target.getTargets`. Each call is independent — sessions do not persist between calls.
|
||||
Browser-level methods (`Target.*`, `Browser.*`, `Storage.*`) omit `target_id`. Page-level methods (`Page.*`, `Runtime.*`, `DOM.*`, `Emulation.*`) require a `target_id` from `Target.getTargets`. Each stateless call is independent — sessions do not persist between calls.
|
||||
|
||||
**Cross-origin iframes:** pass `frame_id` (from `browser_snapshot.frame_tree.children[]` where `is_oopif=true`) to route the CDP call through the supervisor's live session for that iframe. This is how `Runtime.evaluate` inside a cross-origin iframe works on Browserbase, where stateless CDP connections would hit signed-URL expiry. Example:
|
||||
|
||||
```
|
||||
browser_cdp(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression": "document.title", "returnByValue": True},
|
||||
frame_id="<frame_id from browser_snapshot>",
|
||||
)
|
||||
```
|
||||
|
||||
Same-origin iframes don't need `frame_id` — use `document.querySelector('iframe').contentDocument` from a top-level `Runtime.evaluate` instead.
|
||||
|
||||
### `browser_dialog`
|
||||
|
||||
Responds to a native JS dialog (`alert` / `confirm` / `prompt` / `beforeunload`). Before this tool existed, dialogs would silently block the page's JavaScript thread and subsequent `browser_*` calls would hang or throw; now the agent sees pending dialogs in `browser_snapshot` output and responds explicitly.
|
||||
|
||||
**Workflow:**
|
||||
1. Call `browser_snapshot`. If a dialog is blocking the page, it shows up as `pending_dialogs: [{"id": "d-1", "type": "alert", "message": "..."}]`.
|
||||
2. Call `browser_dialog(action="accept")` or `browser_dialog(action="dismiss")`. For `prompt()` dialogs, pass `prompt_text="..."` to supply the response.
|
||||
3. Re-snapshot — `pending_dialogs` is empty; the page's JS thread has resumed.
|
||||
|
||||
**Detection happens automatically** via a persistent CDP supervisor — one WebSocket per task that subscribes to Page/Runtime/Target events. The supervisor also populates a `frame_tree` field in the snapshot so the agent can see the iframe structure of the current page, including cross-origin (OOPIF) iframes.
|
||||
|
||||
**Availability matrix:**
|
||||
|
||||
| Backend | Detection via `pending_dialogs` | Response (`browser_dialog` tool) |
|
||||
|---|---|---|
|
||||
| Local Chrome via `/browser connect` or `browser.cdp_url` | ✓ | ✓ full workflow |
|
||||
| Browserbase | ✓ | ✓ full workflow (via injected XHR bridge) |
|
||||
| Camofox / default local agent-browser | ✗ | ✗ (no CDP endpoint) |
|
||||
|
||||
**How it works on Browserbase.** Browserbase's CDP proxy auto-dismisses real native dialogs server-side within ~10ms, so we can't use `Page.handleJavaScriptDialog`. The supervisor injects a small script via `Page.addScriptToEvaluateOnNewDocument` that overrides `window.alert`/`confirm`/`prompt` with a synchronous XHR. We intercept those XHRs via `Fetch.enable` — the page's JS thread stays blocked on the XHR until we call `Fetch.fulfillRequest` with the agent's response. `prompt()` return values round-trip back into page JS unchanged.
|
||||
|
||||
**Dialog policy** is configured in `config.yaml` under `browser.dialog_policy`:
|
||||
|
||||
| Policy | Behavior |
|
||||
|--------|----------|
|
||||
| `must_respond` (default) | Capture, surface in snapshot, wait for explicit `browser_dialog()` call. Safety auto-dismiss after `browser.dialog_timeout_s` (default 300s) so a buggy agent can't stall forever. |
|
||||
| `auto_dismiss` | Capture, dismiss immediately. Agent still sees the dialog in `browser_state` history but doesn't have to act. |
|
||||
| `auto_accept` | Capture, accept immediately. Useful when navigating pages with aggressive `beforeunload` prompts. |
|
||||
|
||||
**Frame tree** inside `browser_snapshot.frame_tree` is capped to 30 frames and OOPIF depth 2 to keep payloads bounded on ad-heavy pages. A `truncated: true` flag surfaces when limits were hit; agents needing the full tree can use `browser_cdp` with `Page.getFrameTree`.
|
||||
|
||||
## Practical Examples
|
||||
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
---
|
||||
title: "Apple Notes — Manage Apple Notes via the memo CLI on macOS (create, view, search, edit)"
|
||||
sidebar_label: "Apple Notes"
|
||||
description: "Manage Apple Notes via the memo CLI on macOS (create, view, search, edit)"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Apple Notes
|
||||
|
||||
Manage Apple Notes via the memo CLI on macOS (create, view, search, edit).
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/apple/apple-notes` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Platforms | macos |
|
||||
| Tags | `Notes`, `Apple`, `macOS`, `note-taking` |
|
||||
| Related skills | [`obsidian`](/docs/user-guide/skills/bundled/note-taking/note-taking-obsidian) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Apple Notes
|
||||
|
||||
Use `memo` to manage Apple Notes directly from the terminal. Notes sync across all Apple devices via iCloud.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **macOS** with Notes.app
|
||||
- Install: `brew tap antoniorodr/memo && brew install antoniorodr/memo/memo`
|
||||
- Grant Automation access to Notes.app when prompted (System Settings → Privacy → Automation)
|
||||
|
||||
## When to Use
|
||||
|
||||
- User asks to create, view, or search Apple Notes
|
||||
- Saving information to Notes.app for cross-device access
|
||||
- Organizing notes into folders
|
||||
- Exporting notes to Markdown/HTML
|
||||
|
||||
## When NOT to Use
|
||||
|
||||
- Obsidian vault management → use the `obsidian` skill
|
||||
- Bear Notes → separate app (not supported here)
|
||||
- Quick agent-only notes → use the `memory` tool instead
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### View Notes
|
||||
|
||||
```bash
|
||||
memo notes # List all notes
|
||||
memo notes -f "Folder Name" # Filter by folder
|
||||
memo notes -s "query" # Search notes (fuzzy)
|
||||
```
|
||||
|
||||
### Create Notes
|
||||
|
||||
```bash
|
||||
memo notes -a # Interactive editor
|
||||
memo notes -a "Note Title" # Quick add with title
|
||||
```
|
||||
|
||||
### Edit Notes
|
||||
|
||||
```bash
|
||||
memo notes -e # Interactive selection to edit
|
||||
```
|
||||
|
||||
### Delete Notes
|
||||
|
||||
```bash
|
||||
memo notes -d # Interactive selection to delete
|
||||
```
|
||||
|
||||
### Move Notes
|
||||
|
||||
```bash
|
||||
memo notes -m # Move note to folder (interactive)
|
||||
```
|
||||
|
||||
### Export Notes
|
||||
|
||||
```bash
|
||||
memo notes -ex # Export to HTML/Markdown
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Cannot edit notes containing images or attachments
|
||||
- Interactive prompts require terminal access (use pty=true if needed)
|
||||
- macOS only — requires Apple Notes.app
|
||||
|
||||
## Rules
|
||||
|
||||
1. Prefer Apple Notes when user wants cross-device sync (iPhone/iPad/Mac)
|
||||
2. Use the `memory` tool for agent-internal notes that don't need to sync
|
||||
3. Use the `obsidian` skill for Markdown-native knowledge management
|
||||
@@ -0,0 +1,114 @@
|
||||
---
|
||||
title: "Apple Reminders — Manage Apple Reminders via remindctl CLI (list, add, complete, delete)"
|
||||
sidebar_label: "Apple Reminders"
|
||||
description: "Manage Apple Reminders via remindctl CLI (list, add, complete, delete)"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Apple Reminders
|
||||
|
||||
Manage Apple Reminders via remindctl CLI (list, add, complete, delete).
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/apple/apple-reminders` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Platforms | macos |
|
||||
| Tags | `Reminders`, `tasks`, `todo`, `macOS`, `Apple` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Apple Reminders
|
||||
|
||||
Use `remindctl` to manage Apple Reminders directly from the terminal. Tasks sync across all Apple devices via iCloud.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **macOS** with Reminders.app
|
||||
- Install: `brew install steipete/tap/remindctl`
|
||||
- Grant Reminders permission when prompted
|
||||
- Check: `remindctl status` / Request: `remindctl authorize`
|
||||
|
||||
## When to Use
|
||||
|
||||
- User mentions "reminder" or "Reminders app"
|
||||
- Creating personal to-dos with due dates that sync to iOS
|
||||
- Managing Apple Reminders lists
|
||||
- User wants tasks to appear on their iPhone/iPad
|
||||
|
||||
## When NOT to Use
|
||||
|
||||
- Scheduling agent alerts → use the cronjob tool instead
|
||||
- Calendar events → use Apple Calendar or Google Calendar
|
||||
- Project task management → use GitHub Issues, Notion, etc.
|
||||
- If user says "remind me" but means an agent alert → clarify first
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### View Reminders
|
||||
|
||||
```bash
|
||||
remindctl # Today's reminders
|
||||
remindctl today # Today
|
||||
remindctl tomorrow # Tomorrow
|
||||
remindctl week # This week
|
||||
remindctl overdue # Past due
|
||||
remindctl all # Everything
|
||||
remindctl 2026-01-04 # Specific date
|
||||
```
|
||||
|
||||
### Manage Lists
|
||||
|
||||
```bash
|
||||
remindctl list # List all lists
|
||||
remindctl list Work # Show specific list
|
||||
remindctl list Projects --create # Create list
|
||||
remindctl list Work --delete # Delete list
|
||||
```
|
||||
|
||||
### Create Reminders
|
||||
|
||||
```bash
|
||||
remindctl add "Buy milk"
|
||||
remindctl add --title "Call mom" --list Personal --due tomorrow
|
||||
remindctl add --title "Meeting prep" --due "2026-02-15 09:00"
|
||||
```
|
||||
|
||||
### Complete / Delete
|
||||
|
||||
```bash
|
||||
remindctl complete 1 2 3 # Complete by ID
|
||||
remindctl delete 4A83 --force # Delete by ID
|
||||
```
|
||||
|
||||
### Output Formats
|
||||
|
||||
```bash
|
||||
remindctl today --json # JSON for scripting
|
||||
remindctl today --plain # TSV format
|
||||
remindctl today --quiet # Counts only
|
||||
```
|
||||
|
||||
## Date Formats
|
||||
|
||||
Accepted by `--due` and date filters:
|
||||
- `today`, `tomorrow`, `yesterday`
|
||||
- `YYYY-MM-DD`
|
||||
- `YYYY-MM-DD HH:mm`
|
||||
- ISO 8601 (`2026-01-04T12:34:56Z`)
|
||||
|
||||
## Rules
|
||||
|
||||
1. When user says "remind me", clarify: Apple Reminders (syncs to phone) vs agent cronjob alert
|
||||
2. Always confirm reminder content and due date before creating
|
||||
3. Use `--json` for programmatic parsing
|
||||
@@ -0,0 +1,149 @@
|
||||
---
|
||||
title: "Findmy — Track Apple devices and AirTags via FindMy"
|
||||
sidebar_label: "Findmy"
|
||||
description: "Track Apple devices and AirTags via FindMy"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Findmy
|
||||
|
||||
Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/apple/findmy` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Platforms | macos |
|
||||
| Tags | `FindMy`, `AirTag`, `location`, `tracking`, `macOS`, `Apple` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Find My (Apple)
|
||||
|
||||
Track Apple devices and AirTags via the FindMy.app on macOS. Since Apple doesn't
|
||||
provide a CLI for FindMy, this skill uses AppleScript to open the app and
|
||||
screen capture to read device locations.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **macOS** with Find My app and iCloud signed in
|
||||
- Devices/AirTags already registered in Find My
|
||||
- Screen Recording permission for terminal (System Settings → Privacy → Screen Recording)
|
||||
- **Optional but recommended**: Install `peekaboo` for better UI automation:
|
||||
`brew install steipete/tap/peekaboo`
|
||||
|
||||
## When to Use
|
||||
|
||||
- User asks "where is my [device/cat/keys/bag]?"
|
||||
- Tracking AirTag locations
|
||||
- Checking device locations (iPhone, iPad, Mac, AirPods)
|
||||
- Monitoring pet or item movement over time (AirTag patrol routes)
|
||||
|
||||
## Method 1: AppleScript + Screenshot (Basic)
|
||||
|
||||
### Open FindMy and Navigate
|
||||
|
||||
```bash
|
||||
# Open Find My app
|
||||
osascript -e 'tell application "FindMy" to activate'
|
||||
|
||||
# Wait for it to load
|
||||
sleep 3
|
||||
|
||||
# Take a screenshot of the Find My window
|
||||
screencapture -w -o /tmp/findmy.png
|
||||
```
|
||||
|
||||
Then use `vision_analyze` to read the screenshot:
|
||||
```
|
||||
vision_analyze(image_url="/tmp/findmy.png", question="What devices/items are shown and what are their locations?")
|
||||
```
|
||||
|
||||
### Switch Between Tabs
|
||||
|
||||
```bash
|
||||
# Switch to Devices tab
|
||||
osascript -e '
|
||||
tell application "System Events"
|
||||
tell process "FindMy"
|
||||
click button "Devices" of toolbar 1 of window 1
|
||||
end tell
|
||||
end tell'
|
||||
|
||||
# Switch to Items tab (AirTags)
|
||||
osascript -e '
|
||||
tell application "System Events"
|
||||
tell process "FindMy"
|
||||
click button "Items" of toolbar 1 of window 1
|
||||
end tell
|
||||
end tell'
|
||||
```
|
||||
|
||||
## Method 2: Peekaboo UI Automation (Recommended)
|
||||
|
||||
If `peekaboo` is installed, use it for more reliable UI interaction:
|
||||
|
||||
```bash
|
||||
# Open Find My
|
||||
osascript -e 'tell application "FindMy" to activate'
|
||||
sleep 3
|
||||
|
||||
# Capture and annotate the UI
|
||||
peekaboo see --app "FindMy" --annotate --path /tmp/findmy-ui.png
|
||||
|
||||
# Click on a specific device/item by element ID
|
||||
peekaboo click --on B3 --app "FindMy"
|
||||
|
||||
# Capture the detail view
|
||||
peekaboo image --app "FindMy" --path /tmp/findmy-detail.png
|
||||
```
|
||||
|
||||
Then analyze with vision:
|
||||
```
|
||||
vision_analyze(image_url="/tmp/findmy-detail.png", question="What is the location shown for this device/item? Include address and coordinates if visible.")
|
||||
```
|
||||
|
||||
## Workflow: Track AirTag Location Over Time
|
||||
|
||||
For monitoring an AirTag (e.g., tracking a cat's patrol route):
|
||||
|
||||
```bash
|
||||
# 1. Open FindMy to Items tab
|
||||
osascript -e 'tell application "FindMy" to activate'
|
||||
sleep 3
|
||||
|
||||
# 2. Click on the AirTag item (stay on page — AirTag only updates when page is open)
|
||||
|
||||
# 3. Periodically capture location
|
||||
while true; do
|
||||
screencapture -w -o /tmp/findmy-$(date +%H%M%S).png
|
||||
sleep 300 # Every 5 minutes
|
||||
done
|
||||
```
|
||||
|
||||
Analyze each screenshot with vision to extract coordinates, then compile a route.
|
||||
|
||||
## Limitations
|
||||
|
||||
- FindMy has **no CLI or API** — must use UI automation
|
||||
- AirTags only update location while the FindMy page is actively displayed
|
||||
- Location accuracy depends on nearby Apple devices in the FindMy network
|
||||
- Screen Recording permission required for screenshots
|
||||
- AppleScript UI automation may break across macOS versions
|
||||
|
||||
## Rules
|
||||
|
||||
1. Keep FindMy app in the foreground when tracking AirTags (updates stop when minimized)
|
||||
2. Use `vision_analyze` to read screenshot content — don't try to parse pixels
|
||||
3. For ongoing tracking, use a cronjob to periodically capture and log locations
|
||||
4. Respect privacy — only track devices/items the user owns
|
||||
@@ -0,0 +1,118 @@
|
||||
---
|
||||
title: "Imessage — Send and receive iMessages/SMS via the imsg CLI on macOS"
|
||||
sidebar_label: "Imessage"
|
||||
description: "Send and receive iMessages/SMS via the imsg CLI on macOS"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Imessage
|
||||
|
||||
Send and receive iMessages/SMS via the imsg CLI on macOS.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/apple/imessage` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Platforms | macos |
|
||||
| Tags | `iMessage`, `SMS`, `messaging`, `macOS`, `Apple` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# iMessage
|
||||
|
||||
Use `imsg` to read and send iMessage/SMS via macOS Messages.app.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **macOS** with Messages.app signed in
|
||||
- Install: `brew install steipete/tap/imsg`
|
||||
- Grant Full Disk Access for terminal (System Settings → Privacy → Full Disk Access)
|
||||
- Grant Automation permission for Messages.app when prompted
|
||||
|
||||
## When to Use
|
||||
|
||||
- User asks to send an iMessage or text message
|
||||
- Reading iMessage conversation history
|
||||
- Checking recent Messages.app chats
|
||||
- Sending to phone numbers or Apple IDs
|
||||
|
||||
## When NOT to Use
|
||||
|
||||
- Telegram/Discord/Slack/WhatsApp messages → use the appropriate gateway channel
|
||||
- Group chat management (adding/removing members) → not supported
|
||||
- Bulk/mass messaging → always confirm with user first
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### List Chats
|
||||
|
||||
```bash
|
||||
imsg chats --limit 10 --json
|
||||
```
|
||||
|
||||
### View History
|
||||
|
||||
```bash
|
||||
# By chat ID
|
||||
imsg history --chat-id 1 --limit 20 --json
|
||||
|
||||
# With attachments info
|
||||
imsg history --chat-id 1 --limit 20 --attachments --json
|
||||
```
|
||||
|
||||
### Send Messages
|
||||
|
||||
```bash
|
||||
# Text only
|
||||
imsg send --to "+14155551212" --text "Hello!"
|
||||
|
||||
# With attachment
|
||||
imsg send --to "+14155551212" --text "Check this out" --file /path/to/image.jpg
|
||||
|
||||
# Force iMessage or SMS
|
||||
imsg send --to "+14155551212" --text "Hi" --service imessage
|
||||
imsg send --to "+14155551212" --text "Hi" --service sms
|
||||
```
|
||||
|
||||
### Watch for New Messages
|
||||
|
||||
```bash
|
||||
imsg watch --chat-id 1 --attachments
|
||||
```
|
||||
|
||||
## Service Options
|
||||
|
||||
- `--service imessage` — Force iMessage (requires recipient has iMessage)
|
||||
- `--service sms` — Force SMS (green bubble)
|
||||
- `--service auto` — Let Messages.app decide (default)
|
||||
|
||||
## Rules
|
||||
|
||||
1. **Always confirm recipient and message content** before sending
|
||||
2. **Never send to unknown numbers** without explicit user approval
|
||||
3. **Verify file paths** exist before attaching
|
||||
4. **Don't spam** — rate-limit yourself
|
||||
|
||||
## Example Workflow
|
||||
|
||||
User: "Text mom that I'll be late"
|
||||
|
||||
```bash
|
||||
# 1. Find mom's chat
|
||||
imsg chats --limit 20 --json | jq '.[] | select(.displayName | contains("Mom"))'
|
||||
|
||||
# 2. Confirm with user: "Found Mom at +1555123456. Send 'I'll be late' via iMessage?"
|
||||
|
||||
# 3. Send after confirmation
|
||||
imsg send --to "+1555123456" --text "I'll be late"
|
||||
```
|
||||
+762
@@ -0,0 +1,762 @@
|
||||
---
|
||||
title: "Claude Code — Delegate coding tasks to Claude Code (Anthropic's CLI agent)"
|
||||
sidebar_label: "Claude Code"
|
||||
description: "Delegate coding tasks to Claude Code (Anthropic's CLI agent)"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Claude Code
|
||||
|
||||
Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/autonomous-ai-agents/claude-code` |
|
||||
| Version | `2.2.0` |
|
||||
| Author | Hermes Agent + Teknium |
|
||||
| License | MIT |
|
||||
| Tags | `Coding-Agent`, `Claude`, `Anthropic`, `Code-Review`, `Refactoring`, `PTY`, `Automation` |
|
||||
| Related skills | [`codex`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-codex), [`hermes-agent`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-hermes-agent), [`opencode`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-opencode) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Claude Code — Hermes Orchestration Guide
|
||||
|
||||
Delegate coding tasks to [Claude Code](https://code.claude.com/docs/en/cli-reference) (Anthropic's autonomous coding agent CLI) via the Hermes terminal. Claude Code v2.x can read files, write code, run shell commands, spawn subagents, and manage git workflows autonomously.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **Install:** `npm install -g @anthropic-ai/claude-code`
|
||||
- **Auth:** run `claude` once to log in (browser OAuth for Pro/Max, or set `ANTHROPIC_API_KEY`)
|
||||
- **Console auth:** `claude auth login --console` for API key billing
|
||||
- **SSO auth:** `claude auth login --sso` for Enterprise
|
||||
- **Check status:** `claude auth status` (JSON) or `claude auth status --text` (human-readable)
|
||||
- **Health check:** `claude doctor` — checks auto-updater and installation health
|
||||
- **Version check:** `claude --version` (requires v2.x+)
|
||||
- **Update:** `claude update` or `claude upgrade`
|
||||
|
||||
## Two Orchestration Modes
|
||||
|
||||
Hermes interacts with Claude Code in two fundamentally different ways. Choose based on the task.
|
||||
|
||||
### Mode 1: Print Mode (`-p`) — Non-Interactive (PREFERRED for most tasks)
|
||||
|
||||
Print mode runs a one-shot task, returns the result, and exits. No PTY needed. No interactive prompts. This is the cleanest integration path.
|
||||
|
||||
```
|
||||
terminal(command="claude -p 'Add error handling to all API calls in src/' --allowedTools 'Read,Edit' --max-turns 10", workdir="/path/to/project", timeout=120)
|
||||
```
|
||||
|
||||
**When to use print mode:**
|
||||
- One-shot coding tasks (fix a bug, add a feature, refactor)
|
||||
- CI/CD automation and scripting
|
||||
- Structured data extraction with `--json-schema`
|
||||
- Piped input processing (`cat file | claude -p "analyze this"`)
|
||||
- Any task where you don't need multi-turn conversation
|
||||
|
||||
**Print mode skips ALL interactive dialogs** — no workspace trust prompt, no permission confirmations. This makes it ideal for automation.
|
||||
|
||||
### Mode 2: Interactive PTY via tmux — Multi-Turn Sessions
|
||||
|
||||
Interactive mode gives you a full conversational REPL where you can send follow-up prompts, use slash commands, and watch Claude work in real time. **Requires tmux orchestration.**
|
||||
|
||||
```
|
||||
# Start a tmux session
|
||||
terminal(command="tmux new-session -d -s claude-work -x 140 -y 40")
|
||||
|
||||
# Launch Claude Code inside it
|
||||
terminal(command="tmux send-keys -t claude-work 'cd /path/to/project && claude' Enter")
|
||||
|
||||
# Wait for startup, then send your task
|
||||
# (after ~3-5 seconds for the welcome screen)
|
||||
terminal(command="sleep 5 && tmux send-keys -t claude-work 'Refactor the auth module to use JWT tokens' Enter")
|
||||
|
||||
# Monitor progress by capturing the pane
|
||||
terminal(command="sleep 15 && tmux capture-pane -t claude-work -p -S -50")
|
||||
|
||||
# Send follow-up tasks
|
||||
terminal(command="tmux send-keys -t claude-work 'Now add unit tests for the new JWT code' Enter")
|
||||
|
||||
# Exit when done
|
||||
terminal(command="tmux send-keys -t claude-work '/exit' Enter")
|
||||
```
|
||||
|
||||
**When to use interactive mode:**
|
||||
- Multi-turn iterative work (refactor → review → fix → test cycle)
|
||||
- Tasks requiring human-in-the-loop decisions
|
||||
- Exploratory coding sessions
|
||||
- When you need to use Claude's slash commands (`/compact`, `/review`, `/model`)
|
||||
|
||||
## PTY Dialog Handling (CRITICAL for Interactive Mode)
|
||||
|
||||
Claude Code presents up to two confirmation dialogs on first launch. You MUST handle these via tmux send-keys:
|
||||
|
||||
### Dialog 1: Workspace Trust (first visit to a directory)
|
||||
```
|
||||
❯ 1. Yes, I trust this folder ← DEFAULT (just press Enter)
|
||||
2. No, exit
|
||||
```
|
||||
**Handling:** `tmux send-keys -t <session> Enter` — default selection is correct.
|
||||
|
||||
### Dialog 2: Bypass Permissions Warning (only with --dangerously-skip-permissions)
|
||||
```
|
||||
❯ 1. No, exit ← DEFAULT (WRONG choice!)
|
||||
2. Yes, I accept
|
||||
```
|
||||
**Handling:** Must navigate DOWN first, then Enter:
|
||||
```
|
||||
tmux send-keys -t <session> Down && sleep 0.3 && tmux send-keys -t <session> Enter
|
||||
```
|
||||
|
||||
### Robust Dialog Handling Pattern
|
||||
```
|
||||
# Launch with permissions bypass
|
||||
terminal(command="tmux send-keys -t claude-work 'claude --dangerously-skip-permissions \"your task\"' Enter")
|
||||
|
||||
# Handle trust dialog (Enter for default "Yes")
|
||||
terminal(command="sleep 4 && tmux send-keys -t claude-work Enter")
|
||||
|
||||
# Handle permissions dialog (Down then Enter for "Yes, I accept")
|
||||
terminal(command="sleep 3 && tmux send-keys -t claude-work Down && sleep 0.3 && tmux send-keys -t claude-work Enter")
|
||||
|
||||
# Now wait for Claude to work
|
||||
terminal(command="sleep 15 && tmux capture-pane -t claude-work -p -S -60")
|
||||
```
|
||||
|
||||
**Note:** After the first trust acceptance for a directory, the trust dialog won't appear again. Only the permissions dialog recurs each time you use `--dangerously-skip-permissions`.
|
||||
|
||||
## CLI Subcommands
|
||||
|
||||
| Subcommand | Purpose |
|
||||
|------------|---------|
|
||||
| `claude` | Start interactive REPL |
|
||||
| `claude "query"` | Start REPL with initial prompt |
|
||||
| `claude -p "query"` | Print mode (non-interactive, exits when done) |
|
||||
| `cat file \| claude -p "query"` | Pipe content as stdin context |
|
||||
| `claude -c` | Continue the most recent conversation in this directory |
|
||||
| `claude -r "id"` | Resume a specific session by ID or name |
|
||||
| `claude auth login` | Sign in (add `--console` for API billing, `--sso` for Enterprise) |
|
||||
| `claude auth status` | Check login status (returns JSON; `--text` for human-readable) |
|
||||
| `claude mcp add <name> -- <cmd>` | Add an MCP server |
|
||||
| `claude mcp list` | List configured MCP servers |
|
||||
| `claude mcp remove <name>` | Remove an MCP server |
|
||||
| `claude agents` | List configured agents |
|
||||
| `claude doctor` | Run health checks on installation and auto-updater |
|
||||
| `claude update` / `claude upgrade` | Update Claude Code to latest version |
|
||||
| `claude remote-control` | Start server to control Claude from claude.ai or mobile app |
|
||||
| `claude install [target]` | Install native build (stable, latest, or specific version) |
|
||||
| `claude setup-token` | Set up long-lived auth token (requires subscription) |
|
||||
| `claude plugin` / `claude plugins` | Manage Claude Code plugins |
|
||||
| `claude auto-mode` | Inspect auto mode classifier configuration |
|
||||
|
||||
## Print Mode Deep Dive
|
||||
|
||||
### Structured JSON Output
|
||||
```
|
||||
terminal(command="claude -p 'Analyze auth.py for security issues' --output-format json --max-turns 5", workdir="/project", timeout=120)
|
||||
```
|
||||
|
||||
Returns a JSON object with:
|
||||
```json
|
||||
{
|
||||
"type": "result",
|
||||
"subtype": "success",
|
||||
"result": "The analysis text...",
|
||||
"session_id": "75e2167f-...",
|
||||
"num_turns": 3,
|
||||
"total_cost_usd": 0.0787,
|
||||
"duration_ms": 10276,
|
||||
"stop_reason": "end_turn",
|
||||
"terminal_reason": "completed",
|
||||
"usage": { "input_tokens": 5, "output_tokens": 603, ... },
|
||||
"modelUsage": { "claude-sonnet-4-6": { "costUSD": 0.078, "contextWindow": 200000 } }
|
||||
}
|
||||
```
|
||||
|
||||
**Key fields:** `session_id` for resumption, `num_turns` for agentic loop count, `total_cost_usd` for spend tracking, `subtype` for success/error detection (`success`, `error_max_turns`, `error_budget`).
|
||||
|
||||
### Streaming JSON Output
|
||||
For real-time token streaming, use `stream-json` with `--verbose`:
|
||||
```
|
||||
terminal(command="claude -p 'Write a summary' --output-format stream-json --verbose --include-partial-messages", timeout=60)
|
||||
```
|
||||
|
||||
Returns newline-delimited JSON events. Filter with jq for live text:
|
||||
```
|
||||
claude -p "Explain X" --output-format stream-json --verbose --include-partial-messages | \
|
||||
jq -rj 'select(.type == "stream_event" and .event.delta.type? == "text_delta") | .event.delta.text'
|
||||
```
|
||||
|
||||
Stream events include `system/api_retry` with `attempt`, `max_retries`, and `error` fields (e.g., `rate_limit`, `billing_error`).
|
||||
|
||||
### Bidirectional Streaming
|
||||
For real-time input AND output streaming:
|
||||
```
|
||||
claude -p "task" --input-format stream-json --output-format stream-json --replay-user-messages
|
||||
```
|
||||
`--replay-user-messages` re-emits user messages on stdout for acknowledgment.
|
||||
|
||||
### Piped Input
|
||||
```
|
||||
# Pipe a file for analysis
|
||||
terminal(command="cat src/auth.py | claude -p 'Review this code for bugs' --max-turns 1", timeout=60)
|
||||
|
||||
# Pipe multiple files
|
||||
terminal(command="cat src/*.py | claude -p 'Find all TODO comments' --max-turns 1", timeout=60)
|
||||
|
||||
# Pipe command output
|
||||
terminal(command="git diff HEAD~3 | claude -p 'Summarize these changes' --max-turns 1", timeout=60)
|
||||
```
|
||||
|
||||
### JSON Schema for Structured Extraction
|
||||
```
|
||||
terminal(command="claude -p 'List all functions in src/' --output-format json --json-schema '{\"type\":\"object\",\"properties\":{\"functions\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"functions\"]}' --max-turns 5", workdir="/project", timeout=90)
|
||||
```
|
||||
|
||||
Parse `structured_output` from the JSON result. Claude validates output against the schema before returning.
|
||||
|
||||
### Session Continuation
|
||||
```
|
||||
# Start a task
|
||||
terminal(command="claude -p 'Start refactoring the database layer' --output-format json --max-turns 10 > /tmp/session.json", workdir="/project", timeout=180)
|
||||
|
||||
# Resume with session ID
|
||||
terminal(command="claude -p 'Continue and add connection pooling' --resume $(cat /tmp/session.json | python3 -c 'import json,sys; print(json.load(sys.stdin)[\"session_id\"])') --max-turns 5", workdir="/project", timeout=120)
|
||||
|
||||
# Or resume the most recent session in the same directory
|
||||
terminal(command="claude -p 'What did you do last time?' --continue --max-turns 1", workdir="/project", timeout=30)
|
||||
|
||||
# Fork a session (new ID, keeps history)
|
||||
terminal(command="claude -p 'Try a different approach' --resume <id> --fork-session --max-turns 10", workdir="/project", timeout=120)
|
||||
```
|
||||
|
||||
### Bare Mode for CI/Scripting
|
||||
```
|
||||
terminal(command="claude --bare -p 'Run all tests and report failures' --allowedTools 'Read,Bash' --max-turns 10", workdir="/project", timeout=180)
|
||||
```
|
||||
|
||||
`--bare` skips hooks, plugins, MCP discovery, and CLAUDE.md loading. Fastest startup. Requires `ANTHROPIC_API_KEY` (skips OAuth).
|
||||
|
||||
To selectively load context in bare mode:
|
||||
| To load | Flag |
|
||||
|---------|------|
|
||||
| System prompt additions | `--append-system-prompt "text"` or `--append-system-prompt-file path` |
|
||||
| Settings | `--settings <file-or-json>` |
|
||||
| MCP servers | `--mcp-config <file-or-json>` |
|
||||
| Custom agents | `--agents '<json>'` |
|
||||
|
||||
### Fallback Model for Overload
|
||||
```
|
||||
terminal(command="claude -p 'task' --fallback-model haiku --max-turns 5", timeout=90)
|
||||
```
|
||||
Automatically falls back to the specified model when the default is overloaded (print mode only).
|
||||
|
||||
## Complete CLI Flags Reference
|
||||
|
||||
### Session & Environment
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `-p, --print` | Non-interactive one-shot mode (exits when done) |
|
||||
| `-c, --continue` | Resume most recent conversation in current directory |
|
||||
| `-r, --resume <id>` | Resume specific session by ID or name (interactive picker if no ID) |
|
||||
| `--fork-session` | When resuming, create new session ID instead of reusing original |
|
||||
| `--session-id <uuid>` | Use a specific UUID for the conversation |
|
||||
| `--no-session-persistence` | Don't save session to disk (print mode only) |
|
||||
| `--add-dir <paths...>` | Grant Claude access to additional working directories |
|
||||
| `-w, --worktree [name]` | Run in an isolated git worktree at `.claude/worktrees/<name>` |
|
||||
| `--tmux` | Create a tmux session for the worktree (requires `--worktree`) |
|
||||
| `--ide` | Auto-connect to a valid IDE on startup |
|
||||
| `--chrome` / `--no-chrome` | Enable/disable Chrome browser integration for web testing |
|
||||
| `--from-pr [number]` | Resume session linked to a specific GitHub PR |
|
||||
| `--file <specs...>` | File resources to download at startup (format: `file_id:relative_path`) |
|
||||
|
||||
### Model & Performance
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--model <alias>` | Model selection: `sonnet`, `opus`, `haiku`, or full name like `claude-sonnet-4-6` |
|
||||
| `--effort <level>` | Reasoning depth: `low`, `medium`, `high`, `max`, `auto` | Both |
|
||||
| `--max-turns <n>` | Limit agentic loops (print mode only; prevents runaway) |
|
||||
| `--max-budget-usd <n>` | Cap API spend in dollars (print mode only) |
|
||||
| `--fallback-model <model>` | Auto-fallback when default model is overloaded (print mode only) |
|
||||
| `--betas <betas...>` | Beta headers to include in API requests (API key users only) |
|
||||
|
||||
### Permission & Safety
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--dangerously-skip-permissions` | Auto-approve ALL tool use (file writes, bash, network, etc.) |
|
||||
| `--allow-dangerously-skip-permissions` | Enable bypass as an *option* without enabling it by default |
|
||||
| `--permission-mode <mode>` | `default`, `acceptEdits`, `plan`, `auto`, `dontAsk`, `bypassPermissions` |
|
||||
| `--allowedTools <tools...>` | Whitelist specific tools (comma or space-separated) |
|
||||
| `--disallowedTools <tools...>` | Blacklist specific tools |
|
||||
| `--tools <tools...>` | Override built-in tool set (`""` = none, `"default"` = all, or tool names) |
|
||||
|
||||
### Output & Input Format
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--output-format <fmt>` | `text` (default), `json` (single result object), `stream-json` (newline-delimited) |
|
||||
| `--input-format <fmt>` | `text` (default) or `stream-json` (real-time streaming input) |
|
||||
| `--json-schema <schema>` | Force structured JSON output matching a schema |
|
||||
| `--verbose` | Full turn-by-turn output |
|
||||
| `--include-partial-messages` | Include partial message chunks as they arrive (stream-json + print) |
|
||||
| `--replay-user-messages` | Re-emit user messages on stdout (stream-json bidirectional) |
|
||||
|
||||
### System Prompt & Context
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--append-system-prompt <text>` | **Add** to the default system prompt (preserves built-in capabilities) |
|
||||
| `--append-system-prompt-file <path>` | **Add** file contents to the default system prompt |
|
||||
| `--system-prompt <text>` | **Replace** the entire system prompt (use --append instead usually) |
|
||||
| `--system-prompt-file <path>` | **Replace** the system prompt with file contents |
|
||||
| `--bare` | Skip hooks, plugins, MCP discovery, CLAUDE.md, OAuth (fastest startup) |
|
||||
| `--agents '<json>'` | Define custom subagents dynamically as JSON |
|
||||
| `--mcp-config <path>` | Load MCP servers from JSON file (repeatable) |
|
||||
| `--strict-mcp-config` | Only use MCP servers from `--mcp-config`, ignoring all other MCP configs |
|
||||
| `--settings <file-or-json>` | Load additional settings from a JSON file or inline JSON |
|
||||
| `--setting-sources <sources>` | Comma-separated sources to load: `user`, `project`, `local` |
|
||||
| `--plugin-dir <paths...>` | Load plugins from directories for this session only |
|
||||
| `--disable-slash-commands` | Disable all skills/slash commands |
|
||||
|
||||
### Debugging
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `-d, --debug [filter]` | Enable debug logging with optional category filter (e.g., `"api,hooks"`, `"!1p,!file"`) |
|
||||
| `--debug-file <path>` | Write debug logs to file (implicitly enables debug mode) |
|
||||
|
||||
### Agent Teams
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--teammate-mode <mode>` | How agent teams display: `auto`, `in-process`, or `tmux` |
|
||||
| `--brief` | Enable `SendUserMessage` tool for agent-to-user communication |
|
||||
|
||||
### Tool Name Syntax for --allowedTools / --disallowedTools
|
||||
```
|
||||
Read # All file reading
|
||||
Edit # File editing (existing files)
|
||||
Write # File creation (new files)
|
||||
Bash # All shell commands
|
||||
Bash(git *) # Only git commands
|
||||
Bash(git commit *) # Only git commit commands
|
||||
Bash(npm run lint:*) # Pattern matching with wildcards
|
||||
WebSearch # Web search capability
|
||||
WebFetch # Web page fetching
|
||||
mcp__<server>__<tool> # Specific MCP tool
|
||||
```
|
||||
|
||||
## Settings & Configuration
|
||||
|
||||
### Settings Hierarchy (highest to lowest priority)
|
||||
1. **CLI flags** — override everything
|
||||
2. **Local project:** `.claude/settings.local.json` (personal, gitignored)
|
||||
3. **Project:** `.claude/settings.json` (shared, git-tracked)
|
||||
4. **User:** `~/.claude/settings.json` (global)
|
||||
|
||||
### Permissions in Settings
|
||||
```json
|
||||
{
|
||||
"permissions": {
|
||||
"allow": ["Bash(npm run lint:*)", "WebSearch", "Read"],
|
||||
"ask": ["Write(*.ts)", "Bash(git push*)"],
|
||||
"deny": ["Read(.env)", "Bash(rm -rf *)"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Memory Files (CLAUDE.md) Hierarchy
|
||||
1. **Global:** `~/.claude/CLAUDE.md` — applies to all projects
|
||||
2. **Project:** `./CLAUDE.md` — project-specific context (git-tracked)
|
||||
3. **Local:** `.claude/CLAUDE.local.md` — personal project overrides (gitignored)
|
||||
|
||||
Use the `#` prefix in interactive mode to quickly add to memory: `# Always use 2-space indentation`.
|
||||
|
||||
## Interactive Session: Slash Commands
|
||||
|
||||
### Session & Context
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `/help` | Show all commands (including custom and MCP commands) |
|
||||
| `/compact [focus]` | Compress context to save tokens; CLAUDE.md survives compaction. E.g., `/compact focus on auth logic` |
|
||||
| `/clear` | Wipe conversation history for a fresh start |
|
||||
| `/context` | Visualize context usage as a colored grid with optimization tips |
|
||||
| `/cost` | View token usage with per-model and cache-hit breakdowns |
|
||||
| `/resume` | Switch to or resume a different session |
|
||||
| `/rewind` | Revert to a previous checkpoint in conversation or code |
|
||||
| `/btw <question>` | Ask a side question without adding to context cost |
|
||||
| `/status` | Show version, connectivity, and session info |
|
||||
| `/todos` | List tracked action items from the conversation |
|
||||
| `/exit` or `Ctrl+D` | End session |
|
||||
|
||||
### Development & Review
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `/review` | Request code review of current changes |
|
||||
| `/security-review` | Perform security analysis of current changes |
|
||||
| `/plan [description]` | Enter Plan mode with auto-start for task planning |
|
||||
| `/loop [interval]` | Schedule recurring tasks within the session |
|
||||
| `/batch` | Auto-create worktrees for large parallel changes (5-30 worktrees) |
|
||||
|
||||
### Configuration & Tools
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `/model [model]` | Switch models mid-session (use arrow keys to adjust effort) |
|
||||
| `/effort [level]` | Set reasoning effort: `low`, `medium`, `high`, `max`, or `auto` |
|
||||
| `/init` | Create a CLAUDE.md file for project memory |
|
||||
| `/memory` | Open CLAUDE.md for editing |
|
||||
| `/config` | Open interactive settings configuration |
|
||||
| `/permissions` | View/update tool permissions |
|
||||
| `/agents` | Manage specialized subagents |
|
||||
| `/mcp` | Interactive UI to manage MCP servers |
|
||||
| `/add-dir` | Add additional working directories (useful for monorepos) |
|
||||
| `/usage` | Show plan limits and rate limit status |
|
||||
| `/voice` | Enable push-to-talk voice mode (20 languages; hold Space to record, release to send) |
|
||||
| `/release-notes` | Interactive picker for version release notes |
|
||||
|
||||
### Custom Slash Commands
|
||||
Create `.claude/commands/<name>.md` (project-shared) or `~/.claude/commands/<name>.md` (personal):
|
||||
|
||||
```markdown
|
||||
# .claude/commands/deploy.md
|
||||
Run the deploy pipeline:
|
||||
1. Run all tests
|
||||
2. Build the Docker image
|
||||
3. Push to registry
|
||||
4. Update the $ARGUMENTS environment (default: staging)
|
||||
```
|
||||
|
||||
Usage: `/deploy production` — `$ARGUMENTS` is replaced with the user's input.
|
||||
|
||||
### Skills (Natural Language Invocation)
|
||||
Unlike slash commands (manually invoked), skills in `.claude/skills/` are markdown guides that Claude invokes automatically via natural language when the task matches:
|
||||
|
||||
```markdown
|
||||
# .claude/skills/database-migration.md
|
||||
When asked to create or modify database migrations:
|
||||
1. Use Alembic for migration generation
|
||||
2. Always create a rollback function
|
||||
3. Test migrations against a local database copy
|
||||
```
|
||||
|
||||
## Interactive Session: Keyboard Shortcuts
|
||||
|
||||
### General Controls
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `Ctrl+C` | Cancel current input or generation |
|
||||
| `Ctrl+D` | Exit session |
|
||||
| `Ctrl+R` | Reverse search command history |
|
||||
| `Ctrl+B` | Background a running task |
|
||||
| `Ctrl+V` | Paste image into conversation |
|
||||
| `Ctrl+O` | Transcript mode — see Claude's thinking process |
|
||||
| `Ctrl+G` or `Ctrl+X Ctrl+E` | Open prompt in external editor |
|
||||
| `Esc Esc` | Rewind conversation or code state / summarize |
|
||||
|
||||
### Mode Toggles
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `Shift+Tab` | Cycle permission modes (Normal → Auto-Accept → Plan) |
|
||||
| `Alt+P` | Switch model |
|
||||
| `Alt+T` | Toggle thinking mode |
|
||||
| `Alt+O` | Toggle Fast Mode |
|
||||
|
||||
### Multiline Input
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `\` + `Enter` | Quick newline |
|
||||
| `Shift+Enter` | Newline (alternative) |
|
||||
| `Ctrl+J` | Newline (alternative) |
|
||||
|
||||
### Input Prefixes
|
||||
| Prefix | Action |
|
||||
|--------|--------|
|
||||
| `!` | Execute bash directly, bypassing AI (e.g., `!npm test`). Use `!` alone to toggle shell mode. |
|
||||
| `@` | Reference files/directories with autocomplete (e.g., `@./src/api/`) |
|
||||
| `#` | Quick add to CLAUDE.md memory (e.g., `# Use 2-space indentation`) |
|
||||
| `/` | Slash commands |
|
||||
|
||||
### Pro Tip: "ultrathink"
|
||||
Use the keyword "ultrathink" in your prompt for maximum reasoning effort on a specific turn. This triggers the deepest thinking mode regardless of the current `/effort` setting.
|
||||
|
||||
## PR Review Pattern
|
||||
|
||||
### Quick Review (Print Mode)
|
||||
```
|
||||
terminal(command="cd /path/to/repo && git diff main...feature-branch | claude -p 'Review this diff for bugs, security issues, and style problems. Be thorough.' --max-turns 1", timeout=60)
|
||||
```
|
||||
|
||||
### Deep Review (Interactive + Worktree)
|
||||
```
|
||||
terminal(command="tmux new-session -d -s review -x 140 -y 40")
|
||||
terminal(command="tmux send-keys -t review 'cd /path/to/repo && claude -w pr-review' Enter")
|
||||
terminal(command="sleep 5 && tmux send-keys -t review Enter") # Trust dialog
|
||||
terminal(command="sleep 2 && tmux send-keys -t review 'Review all changes vs main. Check for bugs, security issues, race conditions, and missing tests.' Enter")
|
||||
terminal(command="sleep 30 && tmux capture-pane -t review -p -S -60")
|
||||
```
|
||||
|
||||
### PR Review from Number
|
||||
```
|
||||
terminal(command="claude -p 'Review this PR thoroughly' --from-pr 42 --max-turns 10", workdir="/path/to/repo", timeout=120)
|
||||
```
|
||||
|
||||
### Claude Worktree with tmux
|
||||
```
|
||||
terminal(command="claude -w feature-x --tmux", workdir="/path/to/repo")
|
||||
```
|
||||
Creates an isolated git worktree at `.claude/worktrees/feature-x` AND a tmux session for it. Uses iTerm2 native panes when available; add `--tmux=classic` for traditional tmux.
|
||||
|
||||
## Parallel Claude Instances
|
||||
|
||||
Run multiple independent Claude tasks simultaneously:
|
||||
|
||||
```
|
||||
# Task 1: Fix backend
|
||||
terminal(command="tmux new-session -d -s task1 -x 140 -y 40 && tmux send-keys -t task1 'cd ~/project && claude -p \"Fix the auth bug in src/auth.py\" --allowedTools \"Read,Edit\" --max-turns 10' Enter")
|
||||
|
||||
# Task 2: Write tests
|
||||
terminal(command="tmux new-session -d -s task2 -x 140 -y 40 && tmux send-keys -t task2 'cd ~/project && claude -p \"Write integration tests for the API endpoints\" --allowedTools \"Read,Write,Bash\" --max-turns 15' Enter")
|
||||
|
||||
# Task 3: Update docs
|
||||
terminal(command="tmux new-session -d -s task3 -x 140 -y 40 && tmux send-keys -t task3 'cd ~/project && claude -p \"Update README.md with the new API endpoints\" --allowedTools \"Read,Edit\" --max-turns 5' Enter")
|
||||
|
||||
# Monitor all
|
||||
terminal(command="sleep 30 && for s in task1 task2 task3; do echo '=== '$s' ==='; tmux capture-pane -t $s -p -S -5 2>/dev/null; done")
|
||||
```
|
||||
|
||||
## CLAUDE.md — Project Context File
|
||||
|
||||
Claude Code auto-loads `CLAUDE.md` from the project root. Use it to persist project context:
|
||||
|
||||
```markdown
|
||||
# Project: My API
|
||||
|
||||
## Architecture
|
||||
- FastAPI backend with SQLAlchemy ORM
|
||||
- PostgreSQL database, Redis cache
|
||||
- pytest for testing with 90% coverage target
|
||||
|
||||
## Key Commands
|
||||
- `make test` — run full test suite
|
||||
- `make lint` — ruff + mypy
|
||||
- `make dev` — start dev server on :8000
|
||||
|
||||
## Code Standards
|
||||
- Type hints on all public functions
|
||||
- Docstrings in Google style
|
||||
- 2-space indentation for YAML, 4-space for Python
|
||||
- No wildcard imports
|
||||
```
|
||||
|
||||
**Be specific.** Instead of "Write good code", use "Use 2-space indentation for JS" or "Name test files with `.test.ts` suffix." Specific instructions save correction cycles.
|
||||
|
||||
### Rules Directory (Modular CLAUDE.md)
|
||||
For projects with many rules, use the rules directory instead of one massive CLAUDE.md:
|
||||
- **Project rules:** `.claude/rules/*.md` — team-shared, git-tracked
|
||||
- **User rules:** `~/.claude/rules/*.md` — personal, global
|
||||
|
||||
Each `.md` file in the rules directory is loaded as additional context. This is cleaner than cramming everything into a single CLAUDE.md.
|
||||
|
||||
### Auto-Memory
|
||||
Claude automatically stores learned project context in `~/.claude/projects/<project>/memory/`.
|
||||
- **Limit:** 25KB or 200 lines per project
|
||||
- This is separate from CLAUDE.md — it's Claude's own notes about the project, accumulated across sessions
|
||||
|
||||
## Custom Subagents
|
||||
|
||||
Define specialized agents in `.claude/agents/` (project), `~/.claude/agents/` (personal), or via `--agents` CLI flag (session):
|
||||
|
||||
### Agent Location Priority
|
||||
1. `.claude/agents/` — project-level, team-shared
|
||||
2. `--agents` CLI flag — session-specific, dynamic
|
||||
3. `~/.claude/agents/` — user-level, personal
|
||||
|
||||
### Creating an Agent
|
||||
```markdown
|
||||
# .claude/agents/security-reviewer.md
|
||||
---
|
||||
name: security-reviewer
|
||||
description: Security-focused code review
|
||||
model: opus
|
||||
tools: [Read, Bash]
|
||||
---
|
||||
You are a senior security engineer. Review code for:
|
||||
- Injection vulnerabilities (SQL, XSS, command injection)
|
||||
- Authentication/authorization flaws
|
||||
- Secrets in code
|
||||
- Unsafe deserialization
|
||||
```
|
||||
|
||||
Invoke via: `@security-reviewer review the auth module`
|
||||
|
||||
### Dynamic Agents via CLI
|
||||
```
|
||||
terminal(command="claude --agents '{\"reviewer\": {\"description\": \"Reviews code\", \"prompt\": \"You are a code reviewer focused on performance\"}}' -p 'Use @reviewer to check auth.py'", timeout=120)
|
||||
```
|
||||
|
||||
Claude can orchestrate multiple agents: "Use @db-expert to optimize queries, then @security to audit the changes."
|
||||
|
||||
## Hooks — Automation on Events
|
||||
|
||||
Configure in `.claude/settings.json` (project) or `~/.claude/settings.json` (global):
|
||||
|
||||
```json
|
||||
{
|
||||
"hooks": {
|
||||
"PostToolUse": [{
|
||||
"matcher": "Write(*.py)",
|
||||
"hooks": [{"type": "command", "command": "ruff check --fix $CLAUDE_FILE_PATHS"}]
|
||||
}],
|
||||
"PreToolUse": [{
|
||||
"matcher": "Bash",
|
||||
"hooks": [{"type": "command", "command": "if echo \"$CLAUDE_TOOL_INPUT\" | grep -q 'rm -rf'; then echo 'Blocked!' && exit 2; fi"}]
|
||||
}],
|
||||
"Stop": [{
|
||||
"hooks": [{"type": "command", "command": "echo 'Claude finished a response' >> /tmp/claude-activity.log"}]
|
||||
}]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### All 8 Hook Types
|
||||
| Hook | When it fires | Common use |
|
||||
|------|--------------|------------|
|
||||
| `UserPromptSubmit` | Before Claude processes a user prompt | Input validation, logging |
|
||||
| `PreToolUse` | Before tool execution | Security gates, block dangerous commands (exit 2 = block) |
|
||||
| `PostToolUse` | After a tool finishes | Auto-format code, run linters |
|
||||
| `Notification` | On permission requests or input waits | Desktop notifications, alerts |
|
||||
| `Stop` | When Claude finishes a response | Completion logging, status updates |
|
||||
| `SubagentStop` | When a subagent completes | Agent orchestration |
|
||||
| `PreCompact` | Before context memory is cleared | Backup session transcripts |
|
||||
| `SessionStart` | When a session begins | Load dev context (e.g., `git status`) |
|
||||
|
||||
### Hook Environment Variables
|
||||
| Variable | Content |
|
||||
|----------|---------|
|
||||
| `CLAUDE_PROJECT_DIR` | Current project path |
|
||||
| `CLAUDE_FILE_PATHS` | Files being modified |
|
||||
| `CLAUDE_TOOL_INPUT` | Tool parameters as JSON |
|
||||
|
||||
### Security Hook Examples
|
||||
```json
|
||||
{
|
||||
"PreToolUse": [{
|
||||
"matcher": "Bash",
|
||||
"hooks": [{"type": "command", "command": "if echo \"$CLAUDE_TOOL_INPUT\" | grep -qE 'rm -rf|git push.*--force|:(){ :|:& };:'; then echo 'Dangerous command blocked!' && exit 2; fi"}]
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
## MCP Integration
|
||||
|
||||
Add external tool servers for databases, APIs, and services:
|
||||
|
||||
```
|
||||
# GitHub integration
|
||||
terminal(command="claude mcp add -s user github -- npx @modelcontextprotocol/server-github", timeout=30)
|
||||
|
||||
# PostgreSQL queries
|
||||
terminal(command="claude mcp add -s local postgres -- npx @anthropic-ai/server-postgres --connection-string postgresql://localhost/mydb", timeout=30)
|
||||
|
||||
# Puppeteer for web testing
|
||||
terminal(command="claude mcp add puppeteer -- npx @anthropic-ai/server-puppeteer", timeout=30)
|
||||
```
|
||||
|
||||
### MCP Scopes
|
||||
| Flag | Scope | Storage |
|
||||
|------|-------|---------|
|
||||
| `-s user` | Global (all projects) | `~/.claude.json` |
|
||||
| `-s local` | This project (personal) | `.claude/settings.local.json` (gitignored) |
|
||||
| `-s project` | This project (team-shared) | `.claude/settings.json` (git-tracked) |
|
||||
|
||||
### MCP in Print/CI Mode
|
||||
```
|
||||
terminal(command="claude --bare -p 'Query database' --mcp-config mcp-servers.json --strict-mcp-config", timeout=60)
|
||||
```
|
||||
`--strict-mcp-config` ignores all MCP servers except those from `--mcp-config`.
|
||||
|
||||
Reference MCP resources in chat: `@github:issue://123`
|
||||
|
||||
### MCP Limits & Tuning
|
||||
- **Tool descriptions:** 2KB cap per server for tool descriptions and server instructions
|
||||
- **Result size:** Default capped; use `maxResultSizeChars` annotation to allow up to **500K** characters for large outputs
|
||||
- **Output tokens:** `export MAX_MCP_OUTPUT_TOKENS=50000` — cap output from MCP servers to prevent context flooding
|
||||
- **Transports:** `stdio` (local process), `http` (remote), `sse` (server-sent events)
|
||||
|
||||
## Monitoring Interactive Sessions
|
||||
|
||||
### Reading the TUI Status
|
||||
```
|
||||
# Periodic capture to check if Claude is still working or waiting for input
|
||||
terminal(command="tmux capture-pane -t dev -p -S -10")
|
||||
```
|
||||
|
||||
Look for these indicators:
|
||||
- `❯` at bottom = waiting for your input (Claude is done or asking a question)
|
||||
- `●` lines = Claude is actively using tools (reading, writing, running commands)
|
||||
- `⏵⏵ bypass permissions on` = status bar showing permissions mode
|
||||
- `◐ medium · /effort` = current effort level in status bar
|
||||
- `ctrl+o to expand` = tool output was truncated (can be expanded interactively)
|
||||
|
||||
### Context Window Health
|
||||
Use `/context` in interactive mode to see a colored grid of context usage. Key thresholds:
|
||||
- **< 70%** — Normal operation, full precision
|
||||
- **70-85%** — Precision starts dropping, consider `/compact`
|
||||
- **> 85%** — Hallucination risk spikes significantly, use `/compact` or `/clear`
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Effect |
|
||||
|----------|--------|
|
||||
| `ANTHROPIC_API_KEY` | API key for authentication (alternative to OAuth) |
|
||||
| `CLAUDE_CODE_EFFORT_LEVEL` | Default effort: `low`, `medium`, `high`, `max`, or `auto` |
|
||||
| `MAX_THINKING_TOKENS` | Cap thinking tokens (set to `0` to disable thinking entirely) |
|
||||
| `MAX_MCP_OUTPUT_TOKENS` | Cap output from MCP servers (default varies; set e.g., `50000`) |
|
||||
| `CLAUDE_CODE_NO_FLICKER=1` | Enable alt-screen rendering to eliminate terminal flicker |
|
||||
| `CLAUDE_CODE_SUBPROCESS_ENV_SCRUB` | Strip credentials from sub-processes for security |
|
||||
|
||||
## Cost & Performance Tips
|
||||
|
||||
1. **Use `--max-turns`** in print mode to prevent runaway loops. Start with 5-10 for most tasks.
|
||||
2. **Use `--max-budget-usd`** for cost caps. Note: minimum ~$0.05 for system prompt cache creation.
|
||||
3. **Use `--effort low`** for simple tasks (faster, cheaper). `high` or `max` for complex reasoning.
|
||||
4. **Use `--bare`** for CI/scripting to skip plugin/hook discovery overhead.
|
||||
5. **Use `--allowedTools`** to restrict to only what's needed (e.g., `Read` only for reviews).
|
||||
6. **Use `/compact`** in interactive sessions when context gets large.
|
||||
7. **Pipe input** instead of having Claude read files when you just need analysis of known content.
|
||||
8. **Use `--model haiku`** for simple tasks (cheaper) and `--model opus` for complex multi-step work.
|
||||
9. **Use `--fallback-model haiku`** in print mode to gracefully handle model overload.
|
||||
10. **Start new sessions for distinct tasks** — sessions last 5 hours; fresh context is more efficient.
|
||||
11. **Use `--no-session-persistence`** in CI to avoid accumulating saved sessions on disk.
|
||||
|
||||
## Pitfalls & Gotchas
|
||||
|
||||
1. **Interactive mode REQUIRES tmux** — Claude Code is a full TUI app. Using `pty=true` alone in Hermes terminal works but tmux gives you `capture-pane` for monitoring and `send-keys` for input, which is essential for orchestration.
|
||||
2. **`--dangerously-skip-permissions` dialog defaults to "No, exit"** — you must send Down then Enter to accept. Print mode (`-p`) skips this entirely.
|
||||
3. **`--max-budget-usd` minimum is ~$0.05** — system prompt cache creation alone costs this much. Setting lower will error immediately.
|
||||
4. **`--max-turns` is print-mode only** — ignored in interactive sessions.
|
||||
5. **Claude may use `python` instead of `python3`** — on systems without a `python` symlink, Claude's bash commands will fail on first try but it self-corrects.
|
||||
6. **Session resumption requires same directory** — `--continue` finds the most recent session for the current working directory.
|
||||
7. **`--json-schema` needs enough `--max-turns`** — Claude must read files before producing structured output, which takes multiple turns.
|
||||
8. **Trust dialog only appears once per directory** — first-time only, then cached.
|
||||
9. **Background tmux sessions persist** — always clean up with `tmux kill-session -t <name>` when done.
|
||||
10. **Slash commands (like `/commit`) only work in interactive mode** — in `-p` mode, describe the task in natural language instead.
|
||||
11. **`--bare` skips OAuth** — requires `ANTHROPIC_API_KEY` env var or an `apiKeyHelper` in settings.
|
||||
12. **Context degradation is real** — AI output quality measurably degrades above 70% context window usage. Monitor with `/context` and proactively `/compact`.
|
||||
|
||||
## Rules for Hermes Agents
|
||||
|
||||
1. **Prefer print mode (`-p`) for single tasks** — cleaner, no dialog handling, structured output
|
||||
2. **Use tmux for multi-turn interactive work** — the only reliable way to orchestrate the TUI
|
||||
3. **Always set `workdir`** — keep Claude focused on the right project directory
|
||||
4. **Set `--max-turns` in print mode** — prevents infinite loops and runaway costs
|
||||
5. **Monitor tmux sessions** — use `tmux capture-pane -t <session> -p -S -50` to check progress
|
||||
6. **Look for the `❯` prompt** — indicates Claude is waiting for input (done or asking a question)
|
||||
7. **Clean up tmux sessions** — kill them when done to avoid resource leaks
|
||||
8. **Report results to user** — after completion, summarize what Claude did and what changed
|
||||
9. **Don't kill slow sessions** — Claude may be doing multi-step work; check progress instead
|
||||
10. **Use `--allowedTools`** — restrict capabilities to what the task actually needs
|
||||
+131
@@ -0,0 +1,131 @@
|
||||
---
|
||||
title: "Codex — Delegate coding tasks to OpenAI Codex CLI agent"
|
||||
sidebar_label: "Codex"
|
||||
description: "Delegate coding tasks to OpenAI Codex CLI agent"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Codex
|
||||
|
||||
Delegate coding tasks to OpenAI Codex CLI agent. Use for building features, refactoring, PR reviews, and batch issue fixing. Requires the codex CLI and a git repository.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/autonomous-ai-agents/codex` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `Coding-Agent`, `Codex`, `OpenAI`, `Code-Review`, `Refactoring` |
|
||||
| Related skills | [`claude-code`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-claude-code), [`hermes-agent`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-hermes-agent) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Codex CLI
|
||||
|
||||
Delegate coding tasks to [Codex](https://github.com/openai/codex) via the Hermes terminal. Codex is OpenAI's autonomous coding agent CLI.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Codex installed: `npm install -g @openai/codex`
|
||||
- OpenAI API key configured
|
||||
- **Must run inside a git repository** — Codex refuses to run outside one
|
||||
- Use `pty=true` in terminal calls — Codex is an interactive terminal app
|
||||
|
||||
## One-Shot Tasks
|
||||
|
||||
```
|
||||
terminal(command="codex exec 'Add dark mode toggle to settings'", workdir="~/project", pty=true)
|
||||
```
|
||||
|
||||
For scratch work (Codex needs a git repo):
|
||||
```
|
||||
terminal(command="cd $(mktemp -d) && git init && codex exec 'Build a snake game in Python'", pty=true)
|
||||
```
|
||||
|
||||
## Background Mode (Long Tasks)
|
||||
|
||||
```
|
||||
# Start in background with PTY
|
||||
terminal(command="codex exec --full-auto 'Refactor the auth module'", workdir="~/project", background=true, pty=true)
|
||||
# Returns session_id
|
||||
|
||||
# Monitor progress
|
||||
process(action="poll", session_id="<id>")
|
||||
process(action="log", session_id="<id>")
|
||||
|
||||
# Send input if Codex asks a question
|
||||
process(action="submit", session_id="<id>", data="yes")
|
||||
|
||||
# Kill if needed
|
||||
process(action="kill", session_id="<id>")
|
||||
```
|
||||
|
||||
## Key Flags
|
||||
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `exec "prompt"` | One-shot execution, exits when done |
|
||||
| `--full-auto` | Sandboxed but auto-approves file changes in workspace |
|
||||
| `--yolo` | No sandbox, no approvals (fastest, most dangerous) |
|
||||
|
||||
## PR Reviews
|
||||
|
||||
Clone to a temp directory for safe review:
|
||||
|
||||
```
|
||||
terminal(command="REVIEW=$(mktemp -d) && git clone https://github.com/user/repo.git $REVIEW && cd $REVIEW && gh pr checkout 42 && codex review --base origin/main", pty=true)
|
||||
```
|
||||
|
||||
## Parallel Issue Fixing with Worktrees
|
||||
|
||||
```
|
||||
# Create worktrees
|
||||
terminal(command="git worktree add -b fix/issue-78 /tmp/issue-78 main", workdir="~/project")
|
||||
terminal(command="git worktree add -b fix/issue-99 /tmp/issue-99 main", workdir="~/project")
|
||||
|
||||
# Launch Codex in each
|
||||
terminal(command="codex --yolo exec 'Fix issue #78: <description>. Commit when done.'", workdir="/tmp/issue-78", background=true, pty=true)
|
||||
terminal(command="codex --yolo exec 'Fix issue #99: <description>. Commit when done.'", workdir="/tmp/issue-99", background=true, pty=true)
|
||||
|
||||
# Monitor
|
||||
process(action="list")
|
||||
|
||||
# After completion, push and create PRs
|
||||
terminal(command="cd /tmp/issue-78 && git push -u origin fix/issue-78")
|
||||
terminal(command="gh pr create --repo user/repo --head fix/issue-78 --title 'fix: ...' --body '...'")
|
||||
|
||||
# Cleanup
|
||||
terminal(command="git worktree remove /tmp/issue-78", workdir="~/project")
|
||||
```
|
||||
|
||||
## Batch PR Reviews
|
||||
|
||||
```
|
||||
# Fetch all PR refs
|
||||
terminal(command="git fetch origin '+refs/pull/*/head:refs/remotes/origin/pr/*'", workdir="~/project")
|
||||
|
||||
# Review multiple PRs in parallel
|
||||
terminal(command="codex exec 'Review PR #86. git diff origin/main...origin/pr/86'", workdir="~/project", background=true, pty=true)
|
||||
terminal(command="codex exec 'Review PR #87. git diff origin/main...origin/pr/87'", workdir="~/project", background=true, pty=true)
|
||||
|
||||
# Post results
|
||||
terminal(command="gh pr comment 86 --body '<review>'", workdir="~/project")
|
||||
```
|
||||
|
||||
## Rules
|
||||
|
||||
1. **Always use `pty=true`** — Codex is an interactive terminal app and hangs without a PTY
|
||||
2. **Git repo required** — Codex won't run outside a git directory. Use `mktemp -d && git init` for scratch
|
||||
3. **Use `exec` for one-shots** — `codex exec "prompt"` runs and exits cleanly
|
||||
4. **`--full-auto` for building** — auto-approves changes within the sandbox
|
||||
5. **Background for long tasks** — use `background=true` and monitor with `process` tool
|
||||
6. **Don't interfere** — monitor with `poll`/`log`, be patient with long-running tasks
|
||||
7. **Parallel is fine** — run multiple Codex processes at once for batch work
|
||||
+722
@@ -0,0 +1,722 @@
|
||||
---
|
||||
title: "Hermes Agent"
|
||||
sidebar_label: "Hermes Agent"
|
||||
description: "Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, pr..."
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Hermes Agent
|
||||
|
||||
Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/autonomous-ai-agents/hermes-agent` |
|
||||
| Version | `2.0.0` |
|
||||
| Author | Hermes Agent + Teknium |
|
||||
| License | MIT |
|
||||
| Tags | `hermes`, `setup`, `configuration`, `multi-agent`, `spawning`, `cli`, `gateway`, `development` |
|
||||
| Related skills | [`claude-code`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-claude-code), [`codex`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-codex), [`opencode`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-opencode) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Hermes Agent
|
||||
|
||||
Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
|
||||
|
||||
What makes Hermes different:
|
||||
|
||||
- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
|
||||
- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
|
||||
- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
|
||||
- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
|
||||
- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
|
||||
- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
|
||||
|
||||
People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
|
||||
|
||||
**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
|
||||
|
||||
**Docs:** https://hermes-agent.nousresearch.com/docs/
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Install
|
||||
curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
|
||||
|
||||
# Interactive chat (default)
|
||||
hermes
|
||||
|
||||
# Single query
|
||||
hermes chat -q "What is the capital of France?"
|
||||
|
||||
# Setup wizard
|
||||
hermes setup
|
||||
|
||||
# Change model/provider
|
||||
hermes model
|
||||
|
||||
# Check health
|
||||
hermes doctor
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## CLI Reference
|
||||
|
||||
### Global Flags
|
||||
|
||||
```
|
||||
hermes [flags] [command]
|
||||
|
||||
--version, -V Show version
|
||||
--resume, -r SESSION Resume session by ID or title
|
||||
--continue, -c [NAME] Resume by name, or most recent session
|
||||
--worktree, -w Isolated git worktree mode (parallel agents)
|
||||
--skills, -s SKILL Preload skills (comma-separate or repeat)
|
||||
--profile, -p NAME Use a named profile
|
||||
--yolo Skip dangerous command approval
|
||||
--pass-session-id Include session ID in system prompt
|
||||
```
|
||||
|
||||
No subcommand defaults to `chat`.
|
||||
|
||||
### Chat
|
||||
|
||||
```
|
||||
hermes chat [flags]
|
||||
-q, --query TEXT Single query, non-interactive
|
||||
-m, --model MODEL Model (e.g. anthropic/claude-sonnet-4)
|
||||
-t, --toolsets LIST Comma-separated toolsets
|
||||
--provider PROVIDER Force provider (openrouter, anthropic, nous, etc.)
|
||||
-v, --verbose Verbose output
|
||||
-Q, --quiet Suppress banner, spinner, tool previews
|
||||
--checkpoints Enable filesystem checkpoints (/rollback)
|
||||
--source TAG Session source tag (default: cli)
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
```
|
||||
hermes setup [section] Interactive wizard (model|terminal|gateway|tools|agent)
|
||||
hermes model Interactive model/provider picker
|
||||
hermes config View current config
|
||||
hermes config edit Open config.yaml in $EDITOR
|
||||
hermes config set KEY VAL Set a config value
|
||||
hermes config path Print config.yaml path
|
||||
hermes config env-path Print .env path
|
||||
hermes config check Check for missing/outdated config
|
||||
hermes config migrate Update config with new options
|
||||
hermes login [--provider P] OAuth login (nous, openai-codex)
|
||||
hermes logout Clear stored auth
|
||||
hermes doctor [--fix] Check dependencies and config
|
||||
hermes status [--all] Show component status
|
||||
```
|
||||
|
||||
### Tools & Skills
|
||||
|
||||
```
|
||||
hermes tools Interactive tool enable/disable (curses UI)
|
||||
hermes tools list Show all tools and status
|
||||
hermes tools enable NAME Enable a toolset
|
||||
hermes tools disable NAME Disable a toolset
|
||||
|
||||
hermes skills list List installed skills
|
||||
hermes skills search QUERY Search the skills hub
|
||||
hermes skills install ID Install a skill
|
||||
hermes skills inspect ID Preview without installing
|
||||
hermes skills config Enable/disable skills per platform
|
||||
hermes skills check Check for updates
|
||||
hermes skills update Update outdated skills
|
||||
hermes skills uninstall N Remove a hub skill
|
||||
hermes skills publish PATH Publish to registry
|
||||
hermes skills browse Browse all available skills
|
||||
hermes skills tap add REPO Add a GitHub repo as skill source
|
||||
```
|
||||
|
||||
### MCP Servers
|
||||
|
||||
```
|
||||
hermes mcp serve Run Hermes as an MCP server
|
||||
hermes mcp add NAME Add an MCP server (--url or --command)
|
||||
hermes mcp remove NAME Remove an MCP server
|
||||
hermes mcp list List configured servers
|
||||
hermes mcp test NAME Test connection
|
||||
hermes mcp configure NAME Toggle tool selection
|
||||
```
|
||||
|
||||
### Gateway (Messaging Platforms)
|
||||
|
||||
```
|
||||
hermes gateway run Start gateway foreground
|
||||
hermes gateway install Install as background service
|
||||
hermes gateway start/stop Control the service
|
||||
hermes gateway restart Restart the service
|
||||
hermes gateway status Check status
|
||||
hermes gateway setup Configure platforms
|
||||
```
|
||||
|
||||
Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
|
||||
|
||||
Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
|
||||
|
||||
### Sessions
|
||||
|
||||
```
|
||||
hermes sessions list List recent sessions
|
||||
hermes sessions browse Interactive picker
|
||||
hermes sessions export OUT Export to JSONL
|
||||
hermes sessions rename ID T Rename a session
|
||||
hermes sessions delete ID Delete a session
|
||||
hermes sessions prune Clean up old sessions (--older-than N days)
|
||||
hermes sessions stats Session store statistics
|
||||
```
|
||||
|
||||
### Cron Jobs
|
||||
|
||||
```
|
||||
hermes cron list List jobs (--all for disabled)
|
||||
hermes cron create SCHED Create: '30m', 'every 2h', '0 9 * * *'
|
||||
hermes cron edit ID Edit schedule, prompt, delivery
|
||||
hermes cron pause/resume ID Control job state
|
||||
hermes cron run ID Trigger on next tick
|
||||
hermes cron remove ID Delete a job
|
||||
hermes cron status Scheduler status
|
||||
```
|
||||
|
||||
### Webhooks
|
||||
|
||||
```
|
||||
hermes webhook subscribe N Create route at /webhooks/<name>
|
||||
hermes webhook list List subscriptions
|
||||
hermes webhook remove NAME Remove a subscription
|
||||
hermes webhook test NAME Send a test POST
|
||||
```
|
||||
|
||||
### Profiles
|
||||
|
||||
```
|
||||
hermes profile list List all profiles
|
||||
hermes profile create NAME Create (--clone, --clone-all, --clone-from)
|
||||
hermes profile use NAME Set sticky default
|
||||
hermes profile delete NAME Delete a profile
|
||||
hermes profile show NAME Show details
|
||||
hermes profile alias NAME Manage wrapper scripts
|
||||
hermes profile rename A B Rename a profile
|
||||
hermes profile export NAME Export to tar.gz
|
||||
hermes profile import FILE Import from archive
|
||||
```
|
||||
|
||||
### Credential Pools
|
||||
|
||||
```
|
||||
hermes auth add Interactive credential wizard
|
||||
hermes auth list [PROVIDER] List pooled credentials
|
||||
hermes auth remove P INDEX Remove by provider + index
|
||||
hermes auth reset PROVIDER Clear exhaustion status
|
||||
```
|
||||
|
||||
### Other
|
||||
|
||||
```
|
||||
hermes insights [--days N] Usage analytics
|
||||
hermes update Update to latest version
|
||||
hermes pairing list/approve/revoke DM authorization
|
||||
hermes plugins list/install/remove Plugin management
|
||||
hermes honcho setup/status Honcho memory integration (requires honcho plugin)
|
||||
hermes memory setup/status/off Memory provider config
|
||||
hermes completion bash|zsh Shell completions
|
||||
hermes acp ACP server (IDE integration)
|
||||
hermes claw migrate Migrate from OpenClaw
|
||||
hermes uninstall Uninstall Hermes
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Slash Commands (In-Session)
|
||||
|
||||
Type these during an interactive chat session.
|
||||
|
||||
### Session Control
|
||||
```
|
||||
/new (/reset) Fresh session
|
||||
/clear Clear screen + new session (CLI)
|
||||
/retry Resend last message
|
||||
/undo Remove last exchange
|
||||
/title [name] Name the session
|
||||
/compress Manually compress context
|
||||
/stop Kill background processes
|
||||
/rollback [N] Restore filesystem checkpoint
|
||||
/background <prompt> Run prompt in background
|
||||
/queue <prompt> Queue for next turn
|
||||
/resume [name] Resume a named session
|
||||
```
|
||||
|
||||
### Configuration
|
||||
```
|
||||
/config Show config (CLI)
|
||||
/model [name] Show or change model
|
||||
/provider Show provider info
|
||||
/personality [name] Set personality
|
||||
/reasoning [level] Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
|
||||
/verbose Cycle: off → new → all → verbose
|
||||
/voice [on|off|tts] Voice mode
|
||||
/yolo Toggle approval bypass
|
||||
/skin [name] Change theme (CLI)
|
||||
/statusbar Toggle status bar (CLI)
|
||||
```
|
||||
|
||||
### Tools & Skills
|
||||
```
|
||||
/tools Manage tools (CLI)
|
||||
/toolsets List toolsets (CLI)
|
||||
/skills Search/install skills (CLI)
|
||||
/skill <name> Load a skill into session
|
||||
/cron Manage cron jobs (CLI)
|
||||
/reload-mcp Reload MCP servers
|
||||
/plugins List plugins (CLI)
|
||||
```
|
||||
|
||||
### Gateway
|
||||
```
|
||||
/approve Approve a pending command (gateway)
|
||||
/deny Deny a pending command (gateway)
|
||||
/restart Restart gateway (gateway)
|
||||
/sethome Set current chat as home channel (gateway)
|
||||
/update Update Hermes to latest (gateway)
|
||||
/platforms (/gateway) Show platform connection status (gateway)
|
||||
```
|
||||
|
||||
### Utility
|
||||
```
|
||||
/branch (/fork) Branch the current session
|
||||
/btw Ephemeral side question (doesn't interrupt main task)
|
||||
/fast Toggle priority/fast processing
|
||||
/browser Open CDP browser connection
|
||||
/history Show conversation history (CLI)
|
||||
/save Save conversation to file (CLI)
|
||||
/paste Attach clipboard image (CLI)
|
||||
/image Attach local image file (CLI)
|
||||
```
|
||||
|
||||
### Info
|
||||
```
|
||||
/help Show commands
|
||||
/commands [page] Browse all commands (gateway)
|
||||
/usage Token usage
|
||||
/insights [days] Usage analytics
|
||||
/status Session info (gateway)
|
||||
/profile Active profile info
|
||||
```
|
||||
|
||||
### Exit
|
||||
```
|
||||
/quit (/exit, /q) Exit CLI
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Paths & Config
|
||||
|
||||
```
|
||||
~/.hermes/config.yaml Main configuration
|
||||
~/.hermes/.env API keys and secrets
|
||||
$HERMES_HOME/skills/ Installed skills
|
||||
~/.hermes/sessions/ Session transcripts
|
||||
~/.hermes/logs/ Gateway and error logs
|
||||
~/.hermes/auth.json OAuth tokens and credential pools
|
||||
~/.hermes/hermes-agent/ Source code (if git-installed)
|
||||
```
|
||||
|
||||
Profiles use `~/.hermes/profiles/<name>/` with the same layout.
|
||||
|
||||
### Config Sections
|
||||
|
||||
Edit with `hermes config edit` or `hermes config set section.key value`.
|
||||
|
||||
| Section | Key options |
|
||||
|---------|-------------|
|
||||
| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
|
||||
| `agent` | `max_turns` (90), `tool_use_enforcement` |
|
||||
| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
|
||||
| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
|
||||
| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
|
||||
| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
|
||||
| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
|
||||
| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
|
||||
| `security` | `tirith_enabled`, `website_blocklist` |
|
||||
| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
|
||||
| `checkpoints` | `enabled`, `max_snapshots` (50) |
|
||||
|
||||
Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
|
||||
|
||||
### Providers
|
||||
|
||||
20+ providers supported. Set via `hermes model` or `hermes setup`.
|
||||
|
||||
| Provider | Auth | Key env var |
|
||||
|----------|------|-------------|
|
||||
| OpenRouter | API key | `OPENROUTER_API_KEY` |
|
||||
| Anthropic | API key | `ANTHROPIC_API_KEY` |
|
||||
| Nous Portal | OAuth | `hermes auth` |
|
||||
| OpenAI Codex | OAuth | `hermes auth` |
|
||||
| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
|
||||
| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
|
||||
| DeepSeek | API key | `DEEPSEEK_API_KEY` |
|
||||
| xAI / Grok | API key | `XAI_API_KEY` |
|
||||
| Hugging Face | Token | `HF_TOKEN` |
|
||||
| Z.AI / GLM | API key | `GLM_API_KEY` |
|
||||
| MiniMax | API key | `MINIMAX_API_KEY` |
|
||||
| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
|
||||
| Kimi / Moonshot | API key | `KIMI_API_KEY` |
|
||||
| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
|
||||
| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
|
||||
| Kilo Code | API key | `KILOCODE_API_KEY` |
|
||||
| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
|
||||
| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
|
||||
| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
|
||||
| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
|
||||
| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
|
||||
| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
|
||||
|
||||
Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
|
||||
|
||||
### Toolsets
|
||||
|
||||
Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
|
||||
|
||||
| Toolset | What it provides |
|
||||
|---------|-----------------|
|
||||
| `web` | Web search and content extraction |
|
||||
| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
|
||||
| `terminal` | Shell commands and process management |
|
||||
| `file` | File read/write/search/patch |
|
||||
| `code_execution` | Sandboxed Python execution |
|
||||
| `vision` | Image analysis |
|
||||
| `image_gen` | AI image generation |
|
||||
| `tts` | Text-to-speech |
|
||||
| `skills` | Skill browsing and management |
|
||||
| `memory` | Persistent cross-session memory |
|
||||
| `session_search` | Search past conversations |
|
||||
| `delegation` | Subagent task delegation |
|
||||
| `cronjob` | Scheduled task management |
|
||||
| `clarify` | Ask user clarifying questions |
|
||||
| `messaging` | Cross-platform message sending |
|
||||
| `search` | Web search only (subset of `web`) |
|
||||
| `todo` | In-session task planning and tracking |
|
||||
| `rl` | Reinforcement learning tools (off by default) |
|
||||
| `moa` | Mixture of Agents (off by default) |
|
||||
| `homeassistant` | Smart home control (off by default) |
|
||||
|
||||
Tool changes take effect on `/reset` (new session). They do NOT apply mid-conversation to preserve prompt caching.
|
||||
|
||||
---
|
||||
|
||||
## Voice & Transcription
|
||||
|
||||
### STT (Voice → Text)
|
||||
|
||||
Voice messages from messaging platforms are auto-transcribed.
|
||||
|
||||
Provider priority (auto-detected):
|
||||
1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
|
||||
2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
|
||||
3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
|
||||
4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
|
||||
|
||||
Config:
|
||||
```yaml
|
||||
stt:
|
||||
enabled: true
|
||||
provider: local # local, groq, openai, mistral
|
||||
local:
|
||||
model: base # tiny, base, small, medium, large-v3
|
||||
```
|
||||
|
||||
### TTS (Text → Voice)
|
||||
|
||||
| Provider | Env var | Free? |
|
||||
|----------|---------|-------|
|
||||
| Edge TTS | None | Yes (default) |
|
||||
| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
|
||||
| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
|
||||
| MiniMax | `MINIMAX_API_KEY` | Paid |
|
||||
| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
|
||||
| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
|
||||
|
||||
Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
|
||||
|
||||
---
|
||||
|
||||
## Spawning Additional Hermes Instances
|
||||
|
||||
Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
|
||||
|
||||
### When to Use This vs delegate_task
|
||||
|
||||
| | `delegate_task` | Spawning `hermes` process |
|
||||
|-|-----------------|--------------------------|
|
||||
| Isolation | Separate conversation, shared process | Fully independent process |
|
||||
| Duration | Minutes (bounded by parent loop) | Hours/days |
|
||||
| Tool access | Subset of parent's tools | Full tool access |
|
||||
| Interactive | No | Yes (PTY mode) |
|
||||
| Use case | Quick parallel subtasks | Long autonomous missions |
|
||||
|
||||
### One-Shot Mode
|
||||
|
||||
```
|
||||
terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
|
||||
|
||||
# Background for long tasks:
|
||||
terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
|
||||
```
|
||||
|
||||
### Interactive PTY Mode (via tmux)
|
||||
|
||||
Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
|
||||
|
||||
```
|
||||
# Start
|
||||
terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
|
||||
|
||||
# Wait for startup, then send a message
|
||||
terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
|
||||
|
||||
# Read output
|
||||
terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
|
||||
|
||||
# Send follow-up
|
||||
terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
|
||||
|
||||
# Exit
|
||||
terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
|
||||
```
|
||||
|
||||
### Multi-Agent Coordination
|
||||
|
||||
```
|
||||
# Agent A: backend
|
||||
terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
|
||||
terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
|
||||
|
||||
# Agent B: frontend
|
||||
terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
|
||||
terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
|
||||
|
||||
# Check progress, relay context between them
|
||||
terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
|
||||
terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
|
||||
```
|
||||
|
||||
### Session Resume
|
||||
|
||||
```
|
||||
# Resume most recent session
|
||||
terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
|
||||
|
||||
# Resume specific session
|
||||
terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
|
||||
```
|
||||
|
||||
### Tips
|
||||
|
||||
- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
|
||||
- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
|
||||
- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
|
||||
- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
|
||||
- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
|
||||
- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Voice not working
|
||||
1. Check `stt.enabled: true` in config.yaml
|
||||
2. Verify provider: `pip install faster-whisper` or set API key
|
||||
3. In gateway: `/restart`. In CLI: exit and relaunch.
|
||||
|
||||
### Tool not available
|
||||
1. `hermes tools` — check if toolset is enabled for your platform
|
||||
2. Some tools need env vars (check `.env`)
|
||||
3. `/reset` after enabling tools
|
||||
|
||||
### Model/provider issues
|
||||
1. `hermes doctor` — check config and dependencies
|
||||
2. `hermes login` — re-authenticate OAuth providers
|
||||
3. Check `.env` has the right API key
|
||||
4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
|
||||
|
||||
### Changes not taking effect
|
||||
- **Tools/skills:** `/reset` starts a new session with updated toolset
|
||||
- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
|
||||
- **Code changes:** Restart the CLI or gateway process
|
||||
|
||||
### Skills not showing
|
||||
1. `hermes skills list` — verify installed
|
||||
2. `hermes skills config` — check platform enablement
|
||||
3. Load explicitly: `/skill name` or `hermes -s name`
|
||||
|
||||
### Gateway issues
|
||||
Check logs first:
|
||||
```bash
|
||||
grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
|
||||
```
|
||||
|
||||
Common gateway problems:
|
||||
- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
|
||||
- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` in `/etc/wsl.conf` for systemd services to work. Without it, gateway falls back to `nohup` (dies when session closes).
|
||||
- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
|
||||
|
||||
### Platform-specific issues
|
||||
- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
|
||||
- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
|
||||
- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
|
||||
|
||||
### Auxiliary models not working
|
||||
If `auxiliary` tasks (vision, compression, session_search) fail silently, the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
|
||||
```bash
|
||||
hermes config set auxiliary.vision.provider <your_provider>
|
||||
hermes config set auxiliary.vision.model <model_name>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Where to Find Things
|
||||
|
||||
| Looking for... | Location |
|
||||
|----------------|----------|
|
||||
| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
|
||||
| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
|
||||
| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
|
||||
| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
|
||||
| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
|
||||
| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
|
||||
| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
|
||||
| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
|
||||
| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
|
||||
| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
|
||||
| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
|
||||
| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
|
||||
| Gateway logs | `~/.hermes/logs/gateway.log` |
|
||||
| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
|
||||
| Source code | `~/.hermes/hermes-agent/` |
|
||||
|
||||
---
|
||||
|
||||
## Contributor Quick Reference
|
||||
|
||||
For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
|
||||
|
||||
### Project Layout
|
||||
|
||||
```
|
||||
hermes-agent/
|
||||
├── run_agent.py # AIAgent — core conversation loop
|
||||
├── model_tools.py # Tool discovery and dispatch
|
||||
├── toolsets.py # Toolset definitions
|
||||
├── cli.py # Interactive CLI (HermesCLI)
|
||||
├── hermes_state.py # SQLite session store
|
||||
├── agent/ # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
|
||||
├── hermes_cli/ # CLI subcommands, config, setup, commands
|
||||
│ ├── commands.py # Slash command registry (CommandDef)
|
||||
│ ├── config.py # DEFAULT_CONFIG, env var definitions
|
||||
│ └── main.py # CLI entry point and argparse
|
||||
├── tools/ # One file per tool
|
||||
│ └── registry.py # Central tool registry
|
||||
├── gateway/ # Messaging gateway
|
||||
│ └── platforms/ # Platform adapters (telegram, discord, etc.)
|
||||
├── cron/ # Job scheduler
|
||||
├── tests/ # ~3000 pytest tests
|
||||
└── website/ # Docusaurus docs site
|
||||
```
|
||||
|
||||
Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
|
||||
|
||||
### Adding a Tool (3 files)
|
||||
|
||||
**1. Create `tools/your_tool.py`:**
|
||||
```python
|
||||
import json, os
|
||||
from tools.registry import registry
|
||||
|
||||
def check_requirements() -> bool:
|
||||
return bool(os.getenv("EXAMPLE_API_KEY"))
|
||||
|
||||
def example_tool(param: str, task_id: str = None) -> str:
|
||||
return json.dumps({"success": True, "data": "..."})
|
||||
|
||||
registry.register(
|
||||
name="example_tool",
|
||||
toolset="example",
|
||||
schema={"name": "example_tool", "description": "...", "parameters": {...}},
|
||||
handler=lambda args, **kw: example_tool(
|
||||
param=args.get("param", ""), task_id=kw.get("task_id")),
|
||||
check_fn=check_requirements,
|
||||
requires_env=["EXAMPLE_API_KEY"],
|
||||
)
|
||||
```
|
||||
|
||||
**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
|
||||
|
||||
Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual list needed.
|
||||
|
||||
All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
|
||||
|
||||
### Adding a Slash Command
|
||||
|
||||
1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
|
||||
2. Add handler in `cli.py` → `process_command()`
|
||||
3. (Optional) Add gateway handler in `gateway/run.py`
|
||||
|
||||
All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
|
||||
|
||||
### Agent Loop (High Level)
|
||||
|
||||
```
|
||||
run_conversation():
|
||||
1. Build system prompt
|
||||
2. Loop while iterations < max:
|
||||
a. Call LLM (OpenAI-format messages + tool schemas)
|
||||
b. If tool_calls → dispatch each via handle_function_call() → append results → continue
|
||||
c. If text response → return
|
||||
3. Context compression triggers automatically near token limit
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
python -m pytest tests/ -o 'addopts=' -q # Full suite
|
||||
python -m pytest tests/tools/ -q # Specific area
|
||||
```
|
||||
|
||||
- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
|
||||
- Run full suite before pushing any change
|
||||
- Use `-o 'addopts='` to clear any baked-in pytest flags
|
||||
|
||||
### Commit Conventions
|
||||
|
||||
```
|
||||
type: concise subject line
|
||||
|
||||
Optional body.
|
||||
```
|
||||
|
||||
Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
|
||||
|
||||
### Key Rules
|
||||
|
||||
- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
|
||||
- **Message role alternation** — never two assistant or two user messages in a row
|
||||
- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
|
||||
- Config values go in `config.yaml`, secrets go in `.env`
|
||||
- New tools need a `check_fn` so they only appear when requirements are met
|
||||
+236
@@ -0,0 +1,236 @@
|
||||
---
|
||||
title: "Opencode"
|
||||
sidebar_label: "Opencode"
|
||||
description: "Delegate coding tasks to OpenCode CLI agent for feature implementation, refactoring, PR review, and long-running autonomous sessions"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Opencode
|
||||
|
||||
Delegate coding tasks to OpenCode CLI agent for feature implementation, refactoring, PR review, and long-running autonomous sessions. Requires the opencode CLI installed and authenticated.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/autonomous-ai-agents/opencode` |
|
||||
| Version | `1.2.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `Coding-Agent`, `OpenCode`, `Autonomous`, `Refactoring`, `Code-Review` |
|
||||
| Related skills | [`claude-code`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-claude-code), [`codex`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-codex), [`hermes-agent`](/docs/user-guide/skills/bundled/autonomous-ai-agents/autonomous-ai-agents-hermes-agent) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# OpenCode CLI
|
||||
|
||||
Use [OpenCode](https://opencode.ai) as an autonomous coding worker orchestrated by Hermes terminal/process tools. OpenCode is a provider-agnostic, open-source AI coding agent with a TUI and CLI.
|
||||
|
||||
## When to Use
|
||||
|
||||
- User explicitly asks to use OpenCode
|
||||
- You want an external coding agent to implement/refactor/review code
|
||||
- You need long-running coding sessions with progress checks
|
||||
- You want parallel task execution in isolated workdirs/worktrees
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- OpenCode installed: `npm i -g opencode-ai@latest` or `brew install anomalyco/tap/opencode`
|
||||
- Auth configured: `opencode auth login` or set provider env vars (OPENROUTER_API_KEY, etc.)
|
||||
- Verify: `opencode auth list` should show at least one provider
|
||||
- Git repository for code tasks (recommended)
|
||||
- `pty=true` for interactive TUI sessions
|
||||
|
||||
## Binary Resolution (Important)
|
||||
|
||||
Shell environments may resolve different OpenCode binaries. If behavior differs between your terminal and Hermes, check:
|
||||
|
||||
```
|
||||
terminal(command="which -a opencode")
|
||||
terminal(command="opencode --version")
|
||||
```
|
||||
|
||||
If needed, pin an explicit binary path:
|
||||
|
||||
```
|
||||
terminal(command="$HOME/.opencode/bin/opencode run '...'", workdir="~/project", pty=true)
|
||||
```
|
||||
|
||||
## One-Shot Tasks
|
||||
|
||||
Use `opencode run` for bounded, non-interactive tasks:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Add retry logic to API calls and update tests'", workdir="~/project")
|
||||
```
|
||||
|
||||
Attach context files with `-f`:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Review this config for security issues' -f config.yaml -f .env.example", workdir="~/project")
|
||||
```
|
||||
|
||||
Show model thinking with `--thinking`:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Debug why tests fail in CI' --thinking", workdir="~/project")
|
||||
```
|
||||
|
||||
Force a specific model:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Refactor auth module' --model openrouter/anthropic/claude-sonnet-4", workdir="~/project")
|
||||
```
|
||||
|
||||
## Interactive Sessions (Background)
|
||||
|
||||
For iterative work requiring multiple exchanges, start the TUI in background:
|
||||
|
||||
```
|
||||
terminal(command="opencode", workdir="~/project", background=true, pty=true)
|
||||
# Returns session_id
|
||||
|
||||
# Send a prompt
|
||||
process(action="submit", session_id="<id>", data="Implement OAuth refresh flow and add tests")
|
||||
|
||||
# Monitor progress
|
||||
process(action="poll", session_id="<id>")
|
||||
process(action="log", session_id="<id>")
|
||||
|
||||
# Send follow-up input
|
||||
process(action="submit", session_id="<id>", data="Now add error handling for token expiry")
|
||||
|
||||
# Exit cleanly — Ctrl+C
|
||||
process(action="write", session_id="<id>", data="\x03")
|
||||
# Or just kill the process
|
||||
process(action="kill", session_id="<id>")
|
||||
```
|
||||
|
||||
**Important:** Do NOT use `/exit` — it is not a valid OpenCode command and will open an agent selector dialog instead. Use Ctrl+C (`\x03`) or `process(action="kill")` to exit.
|
||||
|
||||
### TUI Keybindings
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `Enter` | Submit message (press twice if needed) |
|
||||
| `Tab` | Switch between agents (build/plan) |
|
||||
| `Ctrl+P` | Open command palette |
|
||||
| `Ctrl+X L` | Switch session |
|
||||
| `Ctrl+X M` | Switch model |
|
||||
| `Ctrl+X N` | New session |
|
||||
| `Ctrl+X E` | Open editor |
|
||||
| `Ctrl+C` | Exit OpenCode |
|
||||
|
||||
### Resuming Sessions
|
||||
|
||||
After exiting, OpenCode prints a session ID. Resume with:
|
||||
|
||||
```
|
||||
terminal(command="opencode -c", workdir="~/project", background=true, pty=true) # Continue last session
|
||||
terminal(command="opencode -s ses_abc123", workdir="~/project", background=true, pty=true) # Specific session
|
||||
```
|
||||
|
||||
## Common Flags
|
||||
|
||||
| Flag | Use |
|
||||
|------|-----|
|
||||
| `run 'prompt'` | One-shot execution and exit |
|
||||
| `--continue` / `-c` | Continue the last OpenCode session |
|
||||
| `--session <id>` / `-s` | Continue a specific session |
|
||||
| `--agent <name>` | Choose OpenCode agent (build or plan) |
|
||||
| `--model provider/model` | Force specific model |
|
||||
| `--format json` | Machine-readable output/events |
|
||||
| `--file <path>` / `-f` | Attach file(s) to the message |
|
||||
| `--thinking` | Show model thinking blocks |
|
||||
| `--variant <level>` | Reasoning effort (high, max, minimal) |
|
||||
| `--title <name>` | Name the session |
|
||||
| `--attach <url>` | Connect to a running opencode server |
|
||||
|
||||
## Procedure
|
||||
|
||||
1. Verify tool readiness:
|
||||
- `terminal(command="opencode --version")`
|
||||
- `terminal(command="opencode auth list")`
|
||||
2. For bounded tasks, use `opencode run '...'` (no pty needed).
|
||||
3. For iterative tasks, start `opencode` with `background=true, pty=true`.
|
||||
4. Monitor long tasks with `process(action="poll"|"log")`.
|
||||
5. If OpenCode asks for input, respond via `process(action="submit", ...)`.
|
||||
6. Exit with `process(action="write", data="\x03")` or `process(action="kill")`.
|
||||
7. Summarize file changes, test results, and next steps back to user.
|
||||
|
||||
## PR Review Workflow
|
||||
|
||||
OpenCode has a built-in PR command:
|
||||
|
||||
```
|
||||
terminal(command="opencode pr 42", workdir="~/project", pty=true)
|
||||
```
|
||||
|
||||
Or review in a temporary clone for isolation:
|
||||
|
||||
```
|
||||
terminal(command="REVIEW=$(mktemp -d) && git clone https://github.com/user/repo.git $REVIEW && cd $REVIEW && opencode run 'Review this PR vs main. Report bugs, security risks, test gaps, and style issues.' -f $(git diff origin/main --name-only | head -20 | tr '\n' ' ')", pty=true)
|
||||
```
|
||||
|
||||
## Parallel Work Pattern
|
||||
|
||||
Use separate workdirs/worktrees to avoid collisions:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Fix issue #101 and commit'", workdir="/tmp/issue-101", background=true, pty=true)
|
||||
terminal(command="opencode run 'Add parser regression tests and commit'", workdir="/tmp/issue-102", background=true, pty=true)
|
||||
process(action="list")
|
||||
```
|
||||
|
||||
## Session & Cost Management
|
||||
|
||||
List past sessions:
|
||||
|
||||
```
|
||||
terminal(command="opencode session list")
|
||||
```
|
||||
|
||||
Check token usage and costs:
|
||||
|
||||
```
|
||||
terminal(command="opencode stats")
|
||||
terminal(command="opencode stats --days 7 --models anthropic/claude-sonnet-4")
|
||||
```
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- Interactive `opencode` (TUI) sessions require `pty=true`. The `opencode run` command does NOT need pty.
|
||||
- `/exit` is NOT a valid command — it opens an agent selector. Use Ctrl+C to exit the TUI.
|
||||
- PATH mismatch can select the wrong OpenCode binary/model config.
|
||||
- If OpenCode appears stuck, inspect logs before killing:
|
||||
- `process(action="log", session_id="<id>")`
|
||||
- Avoid sharing one working directory across parallel OpenCode sessions.
|
||||
- Enter may need to be pressed twice to submit in the TUI (once to finalize text, once to send).
|
||||
|
||||
## Verification
|
||||
|
||||
Smoke test:
|
||||
|
||||
```
|
||||
terminal(command="opencode run 'Respond with exactly: OPENCODE_SMOKE_OK'")
|
||||
```
|
||||
|
||||
Success criteria:
|
||||
- Output includes `OPENCODE_SMOKE_OK`
|
||||
- Command exits without provider/model errors
|
||||
- For code tasks: expected files changed and tests pass
|
||||
|
||||
## Rules
|
||||
|
||||
1. Prefer `opencode run` for one-shot automation — it's simpler and doesn't need pty.
|
||||
2. Use interactive background mode only when iteration is needed.
|
||||
3. Always scope OpenCode sessions to a single repo/workdir.
|
||||
4. For long tasks, provide progress updates from `process` logs.
|
||||
5. Report concrete outcomes (files changed, tests, remaining risks).
|
||||
6. Exit interactive sessions with Ctrl+C or kill, never `/exit`.
|
||||
@@ -0,0 +1,164 @@
|
||||
---
|
||||
title: "Architecture Diagram"
|
||||
sidebar_label: "Architecture Diagram"
|
||||
description: "Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Architecture Diagram
|
||||
|
||||
Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security, orange=message bus), JetBrains Mono font, grid background. Best suited for software architecture, cloud/VPC topology, microservice maps, service-mesh diagrams, database + API layer diagrams, security groups, message buses — anything that fits a tech-infra deck with a dark aesthetic. If a more specialized diagramming skill exists for the subject (scientific, educational, hand-drawn, animated, etc.), prefer that — otherwise this skill can also serve as a general-purpose SVG diagram fallback. Based on Cocoon AI's architecture-diagram-generator (MIT).
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/architecture-diagram` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Cocoon AI (hello@cocoon-ai.com), ported by Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `architecture`, `diagrams`, `SVG`, `HTML`, `visualization`, `infrastructure`, `cloud` |
|
||||
| Related skills | [`concept-diagrams`](/docs/user-guide/skills/optional/creative/creative-concept-diagrams), [`excalidraw`](/docs/user-guide/skills/bundled/creative/creative-excalidraw) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Architecture Diagram Skill
|
||||
|
||||
Generate professional, dark-themed technical architecture diagrams as standalone HTML files with inline SVG graphics. No external tools, no API keys, no rendering libraries — just write the HTML file and open it in a browser.
|
||||
|
||||
## Scope
|
||||
|
||||
**Best suited for:**
|
||||
- Software system architecture (frontend / backend / database layers)
|
||||
- Cloud infrastructure (VPC, regions, subnets, managed services)
|
||||
- Microservice / service-mesh topology
|
||||
- Database + API map, deployment diagrams
|
||||
- Anything with a tech-infra subject that fits a dark, grid-backed aesthetic
|
||||
|
||||
**Look elsewhere first for:**
|
||||
- Physics, chemistry, math, biology, or other scientific subjects
|
||||
- Physical objects (vehicles, hardware, anatomy, cross-sections)
|
||||
- Floor plans, narrative journeys, educational / textbook-style visuals
|
||||
- Hand-drawn whiteboard sketches (consider `excalidraw`)
|
||||
- Animated explainers (consider an animation skill)
|
||||
|
||||
If a more specialized skill is available for the subject, prefer that. If none fits, this skill can also serve as a general SVG diagram fallback — the output will just carry the dark tech aesthetic described below.
|
||||
|
||||
Based on [Cocoon AI's architecture-diagram-generator](https://github.com/Cocoon-AI/architecture-diagram-generator) (MIT).
|
||||
|
||||
## Workflow
|
||||
|
||||
1. User describes their system architecture (components, connections, technologies)
|
||||
2. Generate the HTML file following the design system below
|
||||
3. Save with `write_file` to a `.html` file (e.g. `~/architecture-diagram.html`)
|
||||
4. User opens in any browser — works offline, no dependencies
|
||||
|
||||
### Output Location
|
||||
|
||||
Save diagrams to a user-specified path, or default to the current working directory:
|
||||
```
|
||||
./[project-name]-architecture.html
|
||||
```
|
||||
|
||||
### Preview
|
||||
|
||||
After saving, suggest the user open it:
|
||||
```bash
|
||||
# macOS
|
||||
open ./my-architecture.html
|
||||
# Linux
|
||||
xdg-open ./my-architecture.html
|
||||
```
|
||||
|
||||
## Design System & Visual Language
|
||||
|
||||
### Color Palette (Semantic Mapping)
|
||||
|
||||
Use specific `rgba` fills and hex strokes to categorize components:
|
||||
|
||||
| Component Type | Fill (rgba) | Stroke (Hex) |
|
||||
| :--- | :--- | :--- |
|
||||
| **Frontend** | `rgba(8, 51, 68, 0.4)` | `#22d3ee` (cyan-400) |
|
||||
| **Backend** | `rgba(6, 78, 59, 0.4)` | `#34d399` (emerald-400) |
|
||||
| **Database** | `rgba(76, 29, 149, 0.4)` | `#a78bfa` (violet-400) |
|
||||
| **AWS/Cloud** | `rgba(120, 53, 15, 0.3)` | `#fbbf24` (amber-400) |
|
||||
| **Security** | `rgba(136, 19, 55, 0.4)` | `#fb7185` (rose-400) |
|
||||
| **Message Bus** | `rgba(251, 146, 60, 0.3)` | `#fb923c` (orange-400) |
|
||||
| **External** | `rgba(30, 41, 59, 0.5)` | `#94a3b8` (slate-400) |
|
||||
|
||||
### Typography & Background
|
||||
- **Font:** JetBrains Mono (Monospace), loaded from Google Fonts
|
||||
- **Sizes:** 12px (Names), 9px (Sublabels), 8px (Annotations), 7px (Tiny labels)
|
||||
- **Background:** Slate-950 (`#020617`) with a subtle 40px grid pattern
|
||||
|
||||
```svg
|
||||
<!-- Background Grid Pattern -->
|
||||
<pattern id="grid" width="40" height="40" patternUnits="userSpaceOnUse">
|
||||
<path d="M 40 0 L 0 0 0 40" fill="none" stroke="#1e293b" stroke-width="0.5"/>
|
||||
</pattern>
|
||||
```
|
||||
|
||||
## Technical Implementation Details
|
||||
|
||||
### Component Rendering
|
||||
Components are rounded rectangles (`rx="6"`) with 1.5px strokes. To prevent arrows from showing through semi-transparent fills, use a **double-rect masking technique**:
|
||||
1. Draw an opaque background rect (`#0f172a`)
|
||||
2. Draw the semi-transparent styled rect on top
|
||||
|
||||
### Connection Rules
|
||||
- **Z-Order:** Draw arrows *early* in the SVG (after the grid) so they render behind component boxes
|
||||
- **Arrowheads:** Defined via SVG markers
|
||||
- **Security Flows:** Use dashed lines in rose color (`#fb7185`)
|
||||
- **Boundaries:**
|
||||
- *Security Groups:* Dashed (`4,4`), rose color
|
||||
- *Regions:* Large dashed (`8,4`), amber color, `rx="12"`
|
||||
|
||||
### Spacing & Layout Logic
|
||||
- **Standard Height:** 60px (Services); 80-120px (Large components)
|
||||
- **Vertical Gap:** Minimum 40px between components
|
||||
- **Message Buses:** Must be placed *in the gap* between services, not overlapping them
|
||||
- **Legend Placement:** **CRITICAL.** Must be placed outside all boundary boxes. Calculate the lowest Y-coordinate of all boundaries and place the legend at least 20px below it.
|
||||
|
||||
## Document Structure
|
||||
|
||||
The generated HTML file follows a four-part layout:
|
||||
1. **Header:** Title with a pulsing dot indicator and subtitle
|
||||
2. **Main SVG:** The diagram contained within a rounded border card
|
||||
3. **Summary Cards:** A grid of three cards below the diagram for high-level details
|
||||
4. **Footer:** Minimal metadata
|
||||
|
||||
### Info Card Pattern
|
||||
```html
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<div class="card-dot cyan"></div>
|
||||
<h3>Title</h3>
|
||||
</div>
|
||||
<ul>
|
||||
<li>• Item one</li>
|
||||
<li>• Item two</li>
|
||||
</ul>
|
||||
</div>
|
||||
```
|
||||
|
||||
## Output Requirements
|
||||
- **Single File:** One self-contained `.html` file
|
||||
- **No External Dependencies:** All CSS and SVG must be inline (except Google Fonts)
|
||||
- **No JavaScript:** Use pure CSS for any animations (like pulsing dots)
|
||||
- **Compatibility:** Must render correctly in any modern web browser
|
||||
|
||||
## Template Reference
|
||||
|
||||
Load the full HTML template for the exact structure, CSS, and SVG component examples:
|
||||
|
||||
```
|
||||
skill_view(name="architecture-diagram", file_path="templates/template.html")
|
||||
```
|
||||
|
||||
The template contains working examples of every component type (frontend, backend, database, cloud, security), arrow styles (standard, dashed, curved), security groups, region boundaries, and the legend — use it as your structural reference when generating diagrams.
|
||||
@@ -0,0 +1,337 @@
|
||||
---
|
||||
title: "Ascii Art"
|
||||
sidebar_label: "Ascii Art"
|
||||
description: "Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Ascii Art
|
||||
|
||||
Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/ascii-art` |
|
||||
| Version | `4.0.0` |
|
||||
| Author | 0xbyt4, Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `ASCII`, `Art`, `Banners`, `Creative`, `Unicode`, `Text-Art`, `pyfiglet`, `figlet`, `cowsay`, `boxes` |
|
||||
| Related skills | [`excalidraw`](/docs/user-guide/skills/bundled/creative/creative-excalidraw) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# ASCII Art Skill
|
||||
|
||||
Multiple tools for different ASCII art needs. All tools are local CLI programs or free REST APIs — no API keys required.
|
||||
|
||||
## Tool 1: Text Banners (pyfiglet — local)
|
||||
|
||||
Render text as large ASCII art banners. 571 built-in fonts.
|
||||
|
||||
### Setup
|
||||
|
||||
```bash
|
||||
pip install pyfiglet --break-system-packages -q
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
python3 -m pyfiglet "YOUR TEXT" -f slant
|
||||
python3 -m pyfiglet "TEXT" -f doom -w 80 # Set width
|
||||
python3 -m pyfiglet --list_fonts # List all 571 fonts
|
||||
```
|
||||
|
||||
### Recommended fonts
|
||||
|
||||
| Style | Font | Best for |
|
||||
|-------|------|----------|
|
||||
| Clean & modern | `slant` | Project names, headers |
|
||||
| Bold & blocky | `doom` | Titles, logos |
|
||||
| Big & readable | `big` | Banners |
|
||||
| Classic banner | `banner3` | Wide displays |
|
||||
| Compact | `small` | Subtitles |
|
||||
| Cyberpunk | `cyberlarge` | Tech themes |
|
||||
| 3D effect | `3-d` | Splash screens |
|
||||
| Gothic | `gothic` | Dramatic text |
|
||||
|
||||
### Tips
|
||||
|
||||
- Preview 2-3 fonts and let the user pick their favorite
|
||||
- Short text (1-8 chars) works best with detailed fonts like `doom` or `block`
|
||||
- Long text works better with compact fonts like `small` or `mini`
|
||||
|
||||
## Tool 2: Text Banners (asciified API — remote, no install)
|
||||
|
||||
Free REST API that converts text to ASCII art. 250+ FIGlet fonts. Returns plain text directly — no parsing needed. Use this when pyfiglet is not installed or as a quick alternative.
|
||||
|
||||
### Usage (via terminal curl)
|
||||
|
||||
```bash
|
||||
# Basic text banner (default font)
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello+World"
|
||||
|
||||
# With a specific font
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello&font=Slant"
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello&font=Doom"
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello&font=Star+Wars"
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello&font=3-D"
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=Hello&font=Banner3"
|
||||
|
||||
# List all available fonts (returns JSON array)
|
||||
curl -s "https://asciified.thelicato.io/api/v2/fonts"
|
||||
```
|
||||
|
||||
### Tips
|
||||
|
||||
- URL-encode spaces as `+` in the text parameter
|
||||
- The response is plain text ASCII art — no JSON wrapping, ready to display
|
||||
- Font names are case-sensitive; use the fonts endpoint to get exact names
|
||||
- Works from any terminal with curl — no Python or pip needed
|
||||
|
||||
## Tool 3: Cowsay (Message Art)
|
||||
|
||||
Classic tool that wraps text in a speech bubble with an ASCII character.
|
||||
|
||||
### Setup
|
||||
|
||||
```bash
|
||||
sudo apt install cowsay -y # Debian/Ubuntu
|
||||
# brew install cowsay # macOS
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
cowsay "Hello World"
|
||||
cowsay -f tux "Linux rules" # Tux the penguin
|
||||
cowsay -f dragon "Rawr!" # Dragon
|
||||
cowsay -f stegosaurus "Roar!" # Stegosaurus
|
||||
cowthink "Hmm..." # Thought bubble
|
||||
cowsay -l # List all characters
|
||||
```
|
||||
|
||||
### Available characters (50+)
|
||||
|
||||
`beavis.zen`, `bong`, `bunny`, `cheese`, `daemon`, `default`, `dragon`,
|
||||
`dragon-and-cow`, `elephant`, `eyes`, `flaming-skull`, `ghostbusters`,
|
||||
`hellokitty`, `kiss`, `kitty`, `koala`, `luke-koala`, `mech-and-cow`,
|
||||
`meow`, `moofasa`, `moose`, `ren`, `sheep`, `skeleton`, `small`,
|
||||
`stegosaurus`, `stimpy`, `supermilker`, `surgery`, `three-eyes`,
|
||||
`turkey`, `turtle`, `tux`, `udder`, `vader`, `vader-koala`, `www`
|
||||
|
||||
### Eye/tongue modifiers
|
||||
|
||||
```bash
|
||||
cowsay -b "Borg" # =_= eyes
|
||||
cowsay -d "Dead" # x_x eyes
|
||||
cowsay -g "Greedy" # $_$ eyes
|
||||
cowsay -p "Paranoid" # @_@ eyes
|
||||
cowsay -s "Stoned" # *_* eyes
|
||||
cowsay -w "Wired" # O_O eyes
|
||||
cowsay -e "OO" "Msg" # Custom eyes
|
||||
cowsay -T "U " "Msg" # Custom tongue
|
||||
```
|
||||
|
||||
## Tool 4: Boxes (Decorative Borders)
|
||||
|
||||
Draw decorative ASCII art borders/frames around any text. 70+ built-in designs.
|
||||
|
||||
### Setup
|
||||
|
||||
```bash
|
||||
sudo apt install boxes -y # Debian/Ubuntu
|
||||
# brew install boxes # macOS
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
echo "Hello World" | boxes # Default box
|
||||
echo "Hello World" | boxes -d stone # Stone border
|
||||
echo "Hello World" | boxes -d parchment # Parchment scroll
|
||||
echo "Hello World" | boxes -d cat # Cat border
|
||||
echo "Hello World" | boxes -d dog # Dog border
|
||||
echo "Hello World" | boxes -d unicornsay # Unicorn
|
||||
echo "Hello World" | boxes -d diamonds # Diamond pattern
|
||||
echo "Hello World" | boxes -d c-cmt # C-style comment
|
||||
echo "Hello World" | boxes -d html-cmt # HTML comment
|
||||
echo "Hello World" | boxes -a c # Center text
|
||||
boxes -l # List all 70+ designs
|
||||
```
|
||||
|
||||
### Combine with pyfiglet or asciified
|
||||
|
||||
```bash
|
||||
python3 -m pyfiglet "HERMES" -f slant | boxes -d stone
|
||||
# Or without pyfiglet installed:
|
||||
curl -s "https://asciified.thelicato.io/api/v2/ascii?text=HERMES&font=Slant" | boxes -d stone
|
||||
```
|
||||
|
||||
## Tool 5: TOIlet (Colored Text Art)
|
||||
|
||||
Like pyfiglet but with ANSI color effects and visual filters. Great for terminal eye candy.
|
||||
|
||||
### Setup
|
||||
|
||||
```bash
|
||||
sudo apt install toilet toilet-fonts -y # Debian/Ubuntu
|
||||
# brew install toilet # macOS
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
toilet "Hello World" # Basic text art
|
||||
toilet -f bigmono12 "Hello" # Specific font
|
||||
toilet --gay "Rainbow!" # Rainbow coloring
|
||||
toilet --metal "Metal!" # Metallic effect
|
||||
toilet -F border "Bordered" # Add border
|
||||
toilet -F border --gay "Fancy!" # Combined effects
|
||||
toilet -f pagga "Block" # Block-style font (unique to toilet)
|
||||
toilet -F list # List available filters
|
||||
```
|
||||
|
||||
### Filters
|
||||
|
||||
`crop`, `gay` (rainbow), `metal`, `flip`, `flop`, `180`, `left`, `right`, `border`
|
||||
|
||||
**Note**: toilet outputs ANSI escape codes for colors — works in terminals but may not render in all contexts (e.g., plain text files, some chat platforms).
|
||||
|
||||
## Tool 6: Image to ASCII Art
|
||||
|
||||
Convert images (PNG, JPEG, GIF, WEBP) to ASCII art.
|
||||
|
||||
### Option A: ascii-image-converter (recommended, modern)
|
||||
|
||||
```bash
|
||||
# Install
|
||||
sudo snap install ascii-image-converter
|
||||
# OR: go install github.com/TheZoraiz/ascii-image-converter@latest
|
||||
```
|
||||
|
||||
```bash
|
||||
ascii-image-converter image.png # Basic
|
||||
ascii-image-converter image.png -C # Color output
|
||||
ascii-image-converter image.png -d 60,30 # Set dimensions
|
||||
ascii-image-converter image.png -b # Braille characters
|
||||
ascii-image-converter image.png -n # Negative/inverted
|
||||
ascii-image-converter https://url/image.jpg # Direct URL
|
||||
ascii-image-converter image.png --save-txt out # Save as text
|
||||
```
|
||||
|
||||
### Option B: jp2a (lightweight, JPEG only)
|
||||
|
||||
```bash
|
||||
sudo apt install jp2a -y
|
||||
jp2a --width=80 image.jpg
|
||||
jp2a --colors image.jpg # Colorized
|
||||
```
|
||||
|
||||
## Tool 7: Search Pre-Made ASCII Art
|
||||
|
||||
Search curated ASCII art from the web. Use `terminal` with `curl`.
|
||||
|
||||
### Source A: ascii.co.uk (recommended for pre-made art)
|
||||
|
||||
Large collection of classic ASCII art organized by subject. Art is inside HTML `<pre>` tags. Fetch the page with curl, then extract art with a small Python snippet.
|
||||
|
||||
**URL pattern:** `https://ascii.co.uk/art/{subject}`
|
||||
|
||||
**Step 1 — Fetch the page:**
|
||||
|
||||
```bash
|
||||
curl -s 'https://ascii.co.uk/art/cat' -o /tmp/ascii_art.html
|
||||
```
|
||||
|
||||
**Step 2 — Extract art from pre tags:**
|
||||
|
||||
```python
|
||||
import re, html
|
||||
with open('/tmp/ascii_art.html') as f:
|
||||
text = f.read()
|
||||
arts = re.findall(r'<pre[^>]*>(.*?)</pre>', text, re.DOTALL)
|
||||
for art in arts:
|
||||
clean = re.sub(r'<[^>]+>', '', art)
|
||||
clean = html.unescape(clean).strip()
|
||||
if len(clean) > 30:
|
||||
print(clean)
|
||||
print('\n---\n')
|
||||
```
|
||||
|
||||
**Available subjects** (use as URL path):
|
||||
- Animals: `cat`, `dog`, `horse`, `bird`, `fish`, `dragon`, `snake`, `rabbit`, `elephant`, `dolphin`, `butterfly`, `owl`, `wolf`, `bear`, `penguin`, `turtle`
|
||||
- Objects: `car`, `ship`, `airplane`, `rocket`, `guitar`, `computer`, `coffee`, `beer`, `cake`, `house`, `castle`, `sword`, `crown`, `key`
|
||||
- Nature: `tree`, `flower`, `sun`, `moon`, `star`, `mountain`, `ocean`, `rainbow`
|
||||
- Characters: `skull`, `robot`, `angel`, `wizard`, `pirate`, `ninja`, `alien`
|
||||
- Holidays: `christmas`, `halloween`, `valentine`
|
||||
|
||||
**Tips:**
|
||||
- Preserve artist signatures/initials — important etiquette
|
||||
- Multiple art pieces per page — pick the best one for the user
|
||||
- Works reliably via curl, no JavaScript needed
|
||||
|
||||
### Source B: GitHub Octocat API (fun easter egg)
|
||||
|
||||
Returns a random GitHub Octocat with a wise quote. No auth needed.
|
||||
|
||||
```bash
|
||||
curl -s https://api.github.com/octocat
|
||||
```
|
||||
|
||||
## Tool 8: Fun ASCII Utilities (via curl)
|
||||
|
||||
These free services return ASCII art directly — great for fun extras.
|
||||
|
||||
### QR Codes as ASCII Art
|
||||
|
||||
```bash
|
||||
curl -s "qrenco.de/Hello+World"
|
||||
curl -s "qrenco.de/https://example.com"
|
||||
```
|
||||
|
||||
### Weather as ASCII Art
|
||||
|
||||
```bash
|
||||
curl -s "wttr.in/London" # Full weather report with ASCII graphics
|
||||
curl -s "wttr.in/Moon" # Moon phase in ASCII art
|
||||
curl -s "v2.wttr.in/London" # Detailed version
|
||||
```
|
||||
|
||||
## Tool 9: LLM-Generated Custom Art (Fallback)
|
||||
|
||||
When tools above don't have what's needed, generate ASCII art directly using these Unicode characters:
|
||||
|
||||
### Character Palette
|
||||
|
||||
**Box Drawing:** `╔ ╗ ╚ ╝ ║ ═ ╠ ╣ ╦ ╩ ╬ ┌ ┐ └ ┘ │ ─ ├ ┤ ┬ ┴ ┼ ╭ ╮ ╰ ╯`
|
||||
|
||||
**Block Elements:** `░ ▒ ▓ █ ▄ ▀ ▌ ▐ ▖ ▗ ▘ ▝ ▚ ▞`
|
||||
|
||||
**Geometric & Symbols:** `◆ ◇ ◈ ● ○ ◉ ■ □ ▲ △ ▼ ▽ ★ ☆ ✦ ✧ ◀ ▶ ◁ ▷ ⬡ ⬢ ⌂`
|
||||
|
||||
### Rules
|
||||
|
||||
- Max width: 60 characters per line (terminal-safe)
|
||||
- Max height: 15 lines for banners, 25 for scenes
|
||||
- Monospace only: output must render correctly in fixed-width fonts
|
||||
|
||||
## Decision Flow
|
||||
|
||||
1. **Text as a banner** → pyfiglet if installed, otherwise asciified API via curl
|
||||
2. **Wrap a message in fun character art** → cowsay
|
||||
3. **Add decorative border/frame** → boxes (can combine with pyfiglet/asciified)
|
||||
4. **Art of a specific thing** (cat, rocket, dragon) → ascii.co.uk via curl + parsing
|
||||
5. **Convert an image to ASCII** → ascii-image-converter or jp2a
|
||||
6. **QR code** → qrenco.de via curl
|
||||
7. **Weather/moon art** → wttr.in via curl
|
||||
8. **Something custom/creative** → LLM generation with Unicode palette
|
||||
9. **Any tool not installed** → install it, or fall back to next option
|
||||
@@ -0,0 +1,252 @@
|
||||
---
|
||||
title: "Ascii Video — Production pipeline for ASCII art video — any format"
|
||||
sidebar_label: "Ascii Video"
|
||||
description: "Production pipeline for ASCII art video — any format"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Ascii Video
|
||||
|
||||
Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering. Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/ascii-video` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# ASCII Video Production Pipeline
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is visual art. ASCII characters are the medium; cinema is the standard.
|
||||
|
||||
**Before writing a single line of code**, articulate the creative concept. What is the mood? What visual story does this tell? What makes THIS project different from every other ASCII video? The user's prompt is a starting point — interpret it with creative ambition, not literal transcription.
|
||||
|
||||
**First-render excellence is non-negotiable.** The output must be visually striking without requiring revision rounds. If something looks generic, flat, or like "AI-generated ASCII art," it is wrong — rethink the creative concept before shipping.
|
||||
|
||||
**Go beyond the reference vocabulary.** The effect catalogs, shader presets, and palette libraries in the references are a starting vocabulary. For every project, combine, modify, and invent new patterns. The catalog is a palette of paints — you write the painting.
|
||||
|
||||
**Be proactively creative.** Extend the skill's vocabulary when the project calls for it. If the references don't have what the vision demands, build it. Include at least one visual moment the user didn't ask for but will appreciate — a transition, an effect, a color choice that elevates the whole piece.
|
||||
|
||||
**Cohesive aesthetic over technical correctness.** All scenes in a video must feel connected by a unifying visual language — shared color temperature, related character palettes, consistent motion vocabulary. A technically correct video where every scene uses a random different effect is an aesthetic failure.
|
||||
|
||||
**Dense, layered, considered.** Every frame should reward viewing. Never flat black backgrounds. Always multi-grid composition. Always per-scene variation. Always intentional color.
|
||||
|
||||
## Modes
|
||||
|
||||
| Mode | Input | Output | Reference |
|
||||
|------|-------|--------|-----------|
|
||||
| **Video-to-ASCII** | Video file | ASCII recreation of source footage | `references/inputs.md` § Video Sampling |
|
||||
| **Audio-reactive** | Audio file | Generative visuals driven by audio features | `references/inputs.md` § Audio Analysis |
|
||||
| **Generative** | None (or seed params) | Procedural ASCII animation | `references/effects.md` |
|
||||
| **Hybrid** | Video + audio | ASCII video with audio-reactive overlays | Both input refs |
|
||||
| **Lyrics/text** | Audio + text/SRT | Timed text with visual effects | `references/inputs.md` § Text/Lyrics |
|
||||
| **TTS narration** | Text quotes + TTS API | Narrated testimonial/quote video with typed text | `references/inputs.md` § TTS Integration |
|
||||
|
||||
## Stack
|
||||
|
||||
Single self-contained Python script per project. No GPU required.
|
||||
|
||||
| Layer | Tool | Purpose |
|
||||
|-------|------|---------|
|
||||
| Core | Python 3.10+, NumPy | Math, array ops, vectorized effects |
|
||||
| Signal | SciPy | FFT, peak detection (audio modes) |
|
||||
| Imaging | Pillow (PIL) | Font rasterization, frame decoding, image I/O |
|
||||
| Video I/O | ffmpeg (CLI) | Decode input, encode output, mux audio |
|
||||
| Parallel | concurrent.futures | N workers for batch/clip rendering |
|
||||
| TTS | ElevenLabs API (optional) | Generate narration clips |
|
||||
| Optional | OpenCV | Video frame sampling, edge detection |
|
||||
|
||||
## Pipeline Architecture
|
||||
|
||||
Every mode follows the same 6-stage pipeline:
|
||||
|
||||
```
|
||||
INPUT → ANALYZE → SCENE_FN → TONEMAP → SHADE → ENCODE
|
||||
```
|
||||
|
||||
1. **INPUT** — Load/decode source material (video frames, audio samples, images, or nothing)
|
||||
2. **ANALYZE** — Extract per-frame features (audio bands, video luminance/edges, motion vectors)
|
||||
3. **SCENE_FN** — Scene function renders to pixel canvas (`uint8 H,W,3`). Composes multiple character grids via `_render_vf()` + pixel blend modes. See `references/composition.md`
|
||||
4. **TONEMAP** — Percentile-based adaptive brightness normalization. See `references/composition.md` § Adaptive Tonemap
|
||||
5. **SHADE** — Post-processing via `ShaderChain` + `FeedbackBuffer`. See `references/shaders.md`
|
||||
6. **ENCODE** — Pipe raw RGB frames to ffmpeg for H.264/GIF encoding
|
||||
|
||||
## Creative Direction
|
||||
|
||||
### Aesthetic Dimensions
|
||||
|
||||
| Dimension | Options | Reference |
|
||||
|-----------|---------|-----------|
|
||||
| **Character palette** | Density ramps, block elements, symbols, scripts (katakana, Greek, runes, braille), project-specific | `architecture.md` § Palettes |
|
||||
| **Color strategy** | HSV, OKLAB/OKLCH, discrete RGB palettes, auto-generated harmony, monochrome, temperature | `architecture.md` § Color System |
|
||||
| **Background texture** | Sine fields, fBM noise, domain warp, voronoi, reaction-diffusion, cellular automata, video | `effects.md` |
|
||||
| **Primary effects** | Rings, spirals, tunnel, vortex, waves, interference, aurora, fire, SDFs, strange attractors | `effects.md` |
|
||||
| **Particles** | Sparks, snow, rain, bubbles, runes, orbits, flocking boids, flow-field followers, trails | `effects.md` § Particles |
|
||||
| **Shader mood** | Retro CRT, clean modern, glitch art, cinematic, dreamy, industrial, psychedelic | `shaders.md` |
|
||||
| **Grid density** | xs(8px) through xxl(40px), mixed per layer | `architecture.md` § Grid System |
|
||||
| **Coordinate space** | Cartesian, polar, tiled, rotated, fisheye, Möbius, domain-warped | `effects.md` § Transforms |
|
||||
| **Feedback** | Zoom tunnel, rainbow trails, ghostly echo, rotating mandala, color evolution | `composition.md` § Feedback |
|
||||
| **Masking** | Circle, ring, gradient, text stencil, animated iris/wipe/dissolve | `composition.md` § Masking |
|
||||
| **Transitions** | Crossfade, wipe, dissolve, glitch cut, iris, mask-based reveal | `shaders.md` § Transitions |
|
||||
|
||||
### Per-Section Variation
|
||||
|
||||
Never use the same config for the entire video. For each section/scene:
|
||||
- **Different background effect** (or compose 2-3)
|
||||
- **Different character palette** (match the mood)
|
||||
- **Different color strategy** (or at minimum a different hue)
|
||||
- **Vary shader intensity** (more bloom during peaks, more grain during quiet)
|
||||
- **Different particle types** if particles are active
|
||||
|
||||
### Project-Specific Invention
|
||||
|
||||
For every project, invent at least one of:
|
||||
- A custom character palette matching the theme
|
||||
- A custom background effect (combine/modify existing building blocks)
|
||||
- A custom color palette (discrete RGB set matching the brand/mood)
|
||||
- A custom particle character set
|
||||
- A novel scene transition or visual moment
|
||||
|
||||
Don't just pick from the catalog. The catalog is vocabulary — you write the poem.
|
||||
|
||||
## Workflow
|
||||
|
||||
### Step 1: Creative Vision
|
||||
|
||||
Before any code, articulate the creative concept:
|
||||
|
||||
- **Mood/atmosphere**: What should the viewer feel? Energetic, meditative, chaotic, elegant, ominous?
|
||||
- **Visual story**: What happens over the duration? Build tension? Transform? Dissolve?
|
||||
- **Color world**: Warm/cool? Monochrome? Neon? Earth tones? What's the dominant hue?
|
||||
- **Character texture**: Dense data? Sparse stars? Organic dots? Geometric blocks?
|
||||
- **What makes THIS different**: What's the one thing that makes this project unique?
|
||||
- **Emotional arc**: How do scenes progress? Open with energy, build to climax, resolve?
|
||||
|
||||
Map the user's prompt to aesthetic choices. A "chill lo-fi visualizer" demands different everything from a "glitch cyberpunk data stream."
|
||||
|
||||
### Step 2: Technical Design
|
||||
|
||||
- **Mode** — which of the 6 modes above
|
||||
- **Resolution** — landscape 1920x1080 (default), portrait 1080x1920, square 1080x1080 @ 24fps
|
||||
- **Hardware detection** — auto-detect cores/RAM, set quality profile. See `references/optimization.md`
|
||||
- **Sections** — map timestamps to scene functions, each with its own effect/palette/color/shader config
|
||||
- **Output format** — MP4 (default), GIF (640x360 @ 15fps), PNG sequence
|
||||
|
||||
### Step 3: Build the Script
|
||||
|
||||
Single Python file. Components (with references):
|
||||
|
||||
1. **Hardware detection + quality profile** — `references/optimization.md`
|
||||
2. **Input loader** — mode-dependent; `references/inputs.md`
|
||||
3. **Feature analyzer** — audio FFT, video luminance, or synthetic
|
||||
4. **Grid + renderer** — multi-density grids with bitmap cache; `references/architecture.md`
|
||||
5. **Character palettes** — multiple per project; `references/architecture.md` § Palettes
|
||||
6. **Color system** — HSV + discrete RGB + harmony generation; `references/architecture.md` § Color
|
||||
7. **Scene functions** — each returns `canvas (uint8 H,W,3)`; `references/scenes.md`
|
||||
8. **Tonemap** — adaptive brightness normalization; `references/composition.md`
|
||||
9. **Shader pipeline** — `ShaderChain` + `FeedbackBuffer`; `references/shaders.md`
|
||||
10. **Scene table + dispatcher** — time → scene function + config; `references/scenes.md`
|
||||
11. **Parallel encoder** — N-worker clip rendering with ffmpeg pipes
|
||||
12. **Main** — orchestrate full pipeline
|
||||
|
||||
### Step 4: Quality Verification
|
||||
|
||||
- **Test frames first**: render single frames at key timestamps before full render
|
||||
- **Brightness check**: `canvas.mean() > 8` for all ASCII content. If dark, lower gamma
|
||||
- **Visual coherence**: do all scenes feel like they belong to the same video?
|
||||
- **Creative vision check**: does the output match the concept from Step 1? If it looks generic, go back
|
||||
|
||||
## Critical Implementation Notes
|
||||
|
||||
### Brightness — Use `tonemap()`, Not Linear Multipliers
|
||||
|
||||
This is the #1 visual issue. ASCII on black is inherently dark. **Never use `canvas * N` multipliers** — they clip highlights. Use adaptive tonemap:
|
||||
|
||||
```python
|
||||
def tonemap(canvas, gamma=0.75):
|
||||
f = canvas.astype(np.float32)
|
||||
lo, hi = np.percentile(f[::4, ::4], [1, 99.5])
|
||||
if hi - lo < 10: hi = lo + 10
|
||||
f = np.clip((f - lo) / (hi - lo), 0, 1) ** gamma
|
||||
return (f * 255).astype(np.uint8)
|
||||
```
|
||||
|
||||
Pipeline: `scene_fn() → tonemap() → FeedbackBuffer → ShaderChain → ffmpeg`
|
||||
|
||||
Per-scene gamma: default 0.75, solarize 0.55, posterize 0.50, bright scenes 0.85. Use `screen` blend (not `overlay`) for dark layers.
|
||||
|
||||
### Font Cell Height
|
||||
|
||||
macOS Pillow: `textbbox()` returns wrong height. Use `font.getmetrics()`: `cell_height = ascent + descent`. See `references/troubleshooting.md`.
|
||||
|
||||
### ffmpeg Pipe Deadlock
|
||||
|
||||
Never `stderr=subprocess.PIPE` with long-running ffmpeg — buffer fills at 64KB and deadlocks. Redirect to file. See `references/troubleshooting.md`.
|
||||
|
||||
### Font Compatibility
|
||||
|
||||
Not all Unicode chars render in all fonts. Validate palettes at init — render each char, check for blank output. See `references/troubleshooting.md`.
|
||||
|
||||
### Per-Clip Architecture
|
||||
|
||||
For segmented videos (quotes, scenes, chapters), render each as a separate clip file for parallel rendering and selective re-rendering. See `references/scenes.md`.
|
||||
|
||||
## Performance Targets
|
||||
|
||||
| Component | Budget |
|
||||
|-----------|--------|
|
||||
| Feature extraction | 1-5ms |
|
||||
| Effect function | 2-15ms |
|
||||
| Character render | 80-150ms (bottleneck) |
|
||||
| Shader pipeline | 5-25ms |
|
||||
| **Total** | ~100-200ms/frame |
|
||||
|
||||
## References
|
||||
|
||||
| File | Contents |
|
||||
|------|----------|
|
||||
| `references/architecture.md` | Grid system, resolution presets, font selection, character palettes (20+), color system (HSV + OKLAB + discrete RGB + harmony generation), `_render_vf()` helper, GridLayer class |
|
||||
| `references/composition.md` | Pixel blend modes (20 modes), `blend_canvas()`, multi-grid composition, adaptive `tonemap()`, `FeedbackBuffer`, `PixelBlendStack`, masking/stencil system |
|
||||
| `references/effects.md` | Effect building blocks: value field generators, hue fields, noise/fBM/domain warp, voronoi, reaction-diffusion, cellular automata, SDFs, strange attractors, particle systems, coordinate transforms, temporal coherence |
|
||||
| `references/shaders.md` | `ShaderChain`, `_apply_shader_step()` dispatch, 38 shader catalog, audio-reactive scaling, transitions, tint presets, output format encoding, terminal rendering |
|
||||
| `references/scenes.md` | Scene protocol, `Renderer` class, `SCENES` table, `render_clip()`, beat-synced cutting, parallel rendering, design patterns (layer hierarchy, directional arcs, visual metaphors, compositional techniques), complete scene examples at every complexity level, scene design checklist |
|
||||
| `references/inputs.md` | Audio analysis (FFT, bands, beats), video sampling, image conversion, text/lyrics, TTS integration (ElevenLabs, voice assignment, audio mixing) |
|
||||
| `references/optimization.md` | Hardware detection, quality profiles, vectorized patterns, parallel rendering, memory management, performance budgets |
|
||||
| `references/troubleshooting.md` | NumPy broadcasting traps, blend mode pitfalls, multiprocessing/pickling, brightness diagnostics, ffmpeg issues, font problems, common mistakes |
|
||||
|
||||
---
|
||||
|
||||
## Creative Divergence (use only when user requests experimental/creative/unique output)
|
||||
|
||||
If the user asks for creative, experimental, surprising, or unconventional output, select the strategy that best fits and reason through its steps BEFORE generating code.
|
||||
|
||||
- **Forced Connections** — when the user wants cross-domain inspiration ("make it look organic," "industrial aesthetic")
|
||||
- **Conceptual Blending** — when the user names two things to combine ("ocean meets music," "space + calligraphy")
|
||||
- **Oblique Strategies** — when the user is maximally open ("surprise me," "something I've never seen")
|
||||
|
||||
### Forced Connections
|
||||
1. Pick a domain unrelated to the visual goal (weather systems, microbiology, architecture, fluid dynamics, textile weaving)
|
||||
2. List its core visual/structural elements (erosion → gradual reveal; mitosis → splitting duplication; weaving → interlocking patterns)
|
||||
3. Map those elements onto ASCII characters and animation patterns
|
||||
4. Synthesize — what does "erosion" or "crystallization" look like in a character grid?
|
||||
|
||||
### Conceptual Blending
|
||||
1. Name two distinct visual/conceptual spaces (e.g., ocean waves + sheet music)
|
||||
2. Map correspondences (crests = high notes, troughs = rests, foam = staccato)
|
||||
3. Blend selectively — keep the most interesting mappings, discard forced ones
|
||||
4. Develop emergent properties that exist only in the blend
|
||||
|
||||
### Oblique Strategies
|
||||
1. Draw one: "Honor thy error as a hidden intention" / "Use an old idea" / "What would your closest friend do?" / "Emphasize the flaws" / "Turn it upside down" / "Only a part, not the whole" / "Reverse"
|
||||
2. Interpret the directive against the current ASCII animation challenge
|
||||
3. Apply the lateral insight to the visual design before writing code
|
||||
@@ -0,0 +1,263 @@
|
||||
---
|
||||
title: "Baoyu Comic — Knowledge comic creator supporting multiple art styles and tones"
|
||||
sidebar_label: "Baoyu Comic"
|
||||
description: "Knowledge comic creator supporting multiple art styles and tones"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Baoyu Comic
|
||||
|
||||
Knowledge comic creator supporting multiple art styles and tones. Creates original educational comics with detailed panel layouts and sequential image generation. Use when user asks to create "知识漫画", "教育漫画", "biography comic", "tutorial comic", or "Logicomix-style comic".
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/baoyu-comic` |
|
||||
| Version | `1.56.1` |
|
||||
| Author | 宝玉 (JimLiu) |
|
||||
| License | MIT |
|
||||
| Tags | `comic`, `knowledge-comic`, `creative`, `image-generation` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Knowledge Comic Creator
|
||||
|
||||
Adapted from [baoyu-comic](https://github.com/JimLiu/baoyu-skills) for Hermes Agent's tool ecosystem.
|
||||
|
||||
Create original knowledge comics with flexible art style × tone combinations.
|
||||
|
||||
## When to Use
|
||||
|
||||
Trigger this skill when the user asks to create a knowledge/educational comic, biography comic, tutorial comic, or uses terms like "知识漫画", "教育漫画", or "Logicomix-style". The user provides content (text, file path, URL, or topic) and optionally specifies art style, tone, layout, aspect ratio, or language.
|
||||
|
||||
## Reference Images
|
||||
|
||||
Hermes' `image_generate` tool is **prompt-only** — it accepts a text prompt and an aspect ratio, and returns an image URL. It does **NOT** accept reference images. When the user supplies a reference image, use it to **extract traits in text** that get embedded in every page prompt:
|
||||
|
||||
**Intake**: Accept file paths when the user provides them (or pastes images in conversation).
|
||||
- File path(s) → copy to `refs/NN-ref-{slug}.{ext}` alongside the comic output for provenance
|
||||
- Pasted image with no path → ask the user for the path via `clarify`, or extract style traits verbally as a text fallback
|
||||
- No reference → skip this section
|
||||
|
||||
**Usage modes** (per reference):
|
||||
|
||||
| Usage | Effect |
|
||||
|-------|--------|
|
||||
| `style` | Extract style traits (line treatment, texture, mood) and append to every page's prompt body |
|
||||
| `palette` | Extract hex colors and append to every page's prompt body |
|
||||
| `scene` | Extract scene composition or subject notes and append to the relevant page(s) |
|
||||
|
||||
**Record in each page's prompt frontmatter** when refs exist:
|
||||
|
||||
```yaml
|
||||
references:
|
||||
- ref_id: 01
|
||||
filename: 01-ref-scene.png
|
||||
usage: style
|
||||
traits: "muted earth tones, soft-edged ink wash, low-contrast backgrounds"
|
||||
```
|
||||
|
||||
Character consistency is driven by **text descriptions** in `characters/characters.md` (written in Step 3) that get embedded inline in every page prompt (Step 5). The optional PNG character sheet generated in Step 7.1 is a human-facing review artifact, not an input to `image_generate`.
|
||||
|
||||
## Options
|
||||
|
||||
### Visual Dimensions
|
||||
|
||||
| Option | Values | Description |
|
||||
|--------|--------|-------------|
|
||||
| Art | ligne-claire (default), manga, realistic, ink-brush, chalk, minimalist | Art style / rendering technique |
|
||||
| Tone | neutral (default), warm, dramatic, romantic, energetic, vintage, action | Mood / atmosphere |
|
||||
| Layout | standard (default), cinematic, dense, splash, mixed, webtoon, four-panel | Panel arrangement |
|
||||
| Aspect | 3:4 (default, portrait), 4:3 (landscape), 16:9 (widescreen) | Page aspect ratio |
|
||||
| Language | auto (default), zh, en, ja, etc. | Output language |
|
||||
| Refs | File paths | Reference images used for style / palette trait extraction (not passed to the image model). See [Reference Images](#reference-images) above. |
|
||||
|
||||
### Partial Workflow Options
|
||||
|
||||
| Option | Description |
|
||||
|--------|-------------|
|
||||
| Storyboard only | Generate storyboard only, skip prompts and images |
|
||||
| Prompts only | Generate storyboard + prompts, skip images |
|
||||
| Images only | Generate images from existing prompts directory |
|
||||
| Regenerate N | Regenerate specific page(s) only (e.g., `3` or `2,5,8`) |
|
||||
|
||||
Details: [references/partial-workflows.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/partial-workflows.md)
|
||||
|
||||
### Art, Tone & Preset Catalogue
|
||||
|
||||
- **Art styles** (6): `ligne-claire`, `manga`, `realistic`, `ink-brush`, `chalk`, `minimalist`. Full definitions at `references/art-styles/<style>.md`.
|
||||
- **Tones** (7): `neutral`, `warm`, `dramatic`, `romantic`, `energetic`, `vintage`, `action`. Full definitions at `references/tones/<tone>.md`.
|
||||
- **Presets** (5) with special rules beyond plain art+tone:
|
||||
|
||||
| Preset | Equivalent | Hook |
|
||||
|--------|-----------|------|
|
||||
| `ohmsha` | manga + neutral | Visual metaphors, no talking heads, gadget reveals |
|
||||
| `wuxia` | ink-brush + action | Qi effects, combat visuals, atmospheric |
|
||||
| `shoujo` | manga + romantic | Decorative elements, eye details, romantic beats |
|
||||
| `concept-story` | manga + warm | Visual symbol system, growth arc, dialogue+action balance |
|
||||
| `four-panel` | minimalist + neutral + four-panel layout | 起承转合 structure, B&W + spot color, stick-figure characters |
|
||||
|
||||
Full rules at `references/presets/<preset>.md` — load the file when a preset is picked.
|
||||
|
||||
- **Compatibility matrix** and **content-signal → preset** table live in [references/auto-selection.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/auto-selection.md). Read it before recommending combinations in Step 2.
|
||||
|
||||
## File Structure
|
||||
|
||||
Output directory: `comic/{topic-slug}/`
|
||||
- Slug: 2-4 words kebab-case from topic (e.g., `alan-turing-bio`)
|
||||
- Conflict: append timestamp (e.g., `turing-story-20260118-143052`)
|
||||
|
||||
**Contents**:
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `source-{slug}.md` | Saved source content (kebab-case slug matches the output directory) |
|
||||
| `analysis.md` | Content analysis |
|
||||
| `storyboard.md` | Storyboard with panel breakdown |
|
||||
| `characters/characters.md` | Character definitions |
|
||||
| `characters/characters.png` | Character reference sheet (downloaded from `image_generate`) |
|
||||
| `prompts/NN-{cover\|page}-[slug].md` | Generation prompts |
|
||||
| `NN-{cover\|page}-[slug].png` | Generated images (downloaded from `image_generate`) |
|
||||
| `refs/NN-ref-{slug}.{ext}` | User-supplied reference images (optional, for provenance) |
|
||||
|
||||
## Language Handling
|
||||
|
||||
**Detection Priority**:
|
||||
1. User-specified language (explicit option)
|
||||
2. User's conversation language
|
||||
3. Source content language
|
||||
|
||||
**Rule**: Use user's input language for ALL interactions:
|
||||
- Storyboard outlines and scene descriptions
|
||||
- Image generation prompts
|
||||
- User selection options and confirmations
|
||||
- Progress updates, questions, errors, summaries
|
||||
|
||||
Technical terms remain in English.
|
||||
|
||||
## Workflow
|
||||
|
||||
### Progress Checklist
|
||||
|
||||
```
|
||||
Comic Progress:
|
||||
- [ ] Step 1: Setup & Analyze
|
||||
- [ ] 1.1 Analyze content
|
||||
- [ ] 1.2 Check existing directory
|
||||
- [ ] Step 2: Confirmation - Style & options ⚠️ REQUIRED
|
||||
- [ ] Step 3: Generate storyboard + characters
|
||||
- [ ] Step 4: Review outline (conditional)
|
||||
- [ ] Step 5: Generate prompts
|
||||
- [ ] Step 6: Review prompts (conditional)
|
||||
- [ ] Step 7: Generate images
|
||||
- [ ] 7.1 Generate character sheet (if needed) → characters/characters.png
|
||||
- [ ] 7.2 Generate pages (with character descriptions embedded in prompt)
|
||||
- [ ] Step 8: Completion report
|
||||
```
|
||||
|
||||
### Flow
|
||||
|
||||
```
|
||||
Input → Analyze → [Check Existing?] → [Confirm: Style + Reviews] → Storyboard → [Review?] → Prompts → [Review?] → Images → Complete
|
||||
```
|
||||
|
||||
### Step Summary
|
||||
|
||||
| Step | Action | Key Output |
|
||||
|------|--------|------------|
|
||||
| 1.1 | Analyze content | `analysis.md`, `source-{slug}.md` |
|
||||
| 1.2 | Check existing directory | Handle conflicts |
|
||||
| 2 | Confirm style, focus, audience, reviews | User preferences |
|
||||
| 3 | Generate storyboard + characters | `storyboard.md`, `characters/` |
|
||||
| 4 | Review outline (if requested) | User approval |
|
||||
| 5 | Generate prompts | `prompts/*.md` |
|
||||
| 6 | Review prompts (if requested) | User approval |
|
||||
| 7.1 | Generate character sheet (if needed) | `characters/characters.png` |
|
||||
| 7.2 | Generate pages | `*.png` files |
|
||||
| 8 | Completion report | Summary |
|
||||
|
||||
### User Questions
|
||||
|
||||
Use the `clarify` tool to confirm options. Since `clarify` handles one question at a time, ask the most important question first and proceed sequentially. See [references/workflow.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/workflow.md) for the full Step 2 question set.
|
||||
|
||||
**Timeout handling (CRITICAL)**: `clarify` can return `"The user did not provide a response within the time limit. Use your best judgement to make the choice and proceed."` — this is NOT user consent to default everything.
|
||||
|
||||
- Treat it as a default **for that one question only**. Continue asking the remaining Step 2 questions in sequence; each question is an independent consent point.
|
||||
- **Surface the default to the user visibly** in your next message so they have a chance to correct it: e.g. `"Style: defaulted to ohmsha preset (clarify timed out). Say the word to switch."` — an unreported default is indistinguishable from never having asked.
|
||||
- Do NOT collapse Step 2 into a single "use all defaults" pass after one timeout. If the user is genuinely absent, they will be equally absent for all five questions — but they can correct visible defaults when they return, and cannot correct invisible ones.
|
||||
|
||||
### Step 7: Image Generation
|
||||
|
||||
Use Hermes' built-in `image_generate` tool for all image rendering. Its schema accepts only `prompt` and `aspect_ratio` (`landscape` | `portrait` | `square`); it **returns a URL**, not a local file. Every generated page or character sheet must therefore be downloaded to the output directory.
|
||||
|
||||
**Prompt file requirement (hard)**: write each image's full, final prompt to a standalone file under `prompts/` (naming: `NN-{type}-[slug].md`) BEFORE calling `image_generate`. The prompt file is the reproducibility record.
|
||||
|
||||
**Aspect ratio mapping** — the storyboard's `aspect_ratio` field maps to `image_generate`'s format as follows:
|
||||
|
||||
| Storyboard ratio | `image_generate` format |
|
||||
|------------------|-------------------------|
|
||||
| `3:4`, `9:16`, `2:3` | `portrait` |
|
||||
| `4:3`, `16:9`, `3:2` | `landscape` |
|
||||
| `1:1` | `square` |
|
||||
|
||||
**Download step** — after every `image_generate` call:
|
||||
1. Read the URL from the tool result
|
||||
2. Fetch the image bytes using an **absolute** output path, e.g.
|
||||
`curl -fsSL "<url>" -o /abs/path/to/comic/<slug>/NN-page-<slug>.png`
|
||||
3. Verify the file exists and is non-empty at that exact path before proceeding to the next page
|
||||
|
||||
**Never rely on shell CWD persistence for `-o` paths.** The terminal tool's persistent-shell CWD can change between batches (session expiry, `TERMINAL_LIFETIME_SECONDS`, a failed `cd` that leaves you in the wrong directory). `curl -o relative/path.png` is a silent footgun: if CWD has drifted, the file lands somewhere else with no error. **Always pass a fully-qualified absolute path to `-o`**, or pass `workdir=<abs path>` to the terminal tool. Incident Apr 2026: pages 06-09 of a 10-page comic landed at the repo root instead of `comic/<slug>/` because batch 3 inherited a stale CWD from batch 2 and `curl -o 06-page-skills.png` wrote to the wrong directory. The agent then spent several turns claiming the files existed where they didn't.
|
||||
|
||||
**7.1 Character sheet** — generate it (to `characters/characters.png`, aspect `landscape`) when the comic is multi-page with recurring characters. Skip for simple presets (e.g., four-panel minimalist) or single-page comics. The prompt file at `characters/characters.md` must exist before invoking `image_generate`. The rendered PNG is a **human-facing review artifact** (so the user can visually verify character design) and a reference for later regenerations or manual prompt edits — it does **not** drive Step 7.2. Page prompts are already written in Step 5 from the **text descriptions** in `characters/characters.md`; `image_generate` cannot accept images as visual input.
|
||||
|
||||
**7.2 Pages** — each page's prompt MUST already be at `prompts/NN-{cover|page}-[slug].md` before invoking `image_generate`. Because `image_generate` is prompt-only, character consistency is enforced by **embedding character descriptions (sourced from `characters/characters.md`) inline in every page prompt during Step 5**. The embedding is done uniformly whether or not a PNG sheet is produced in 7.1; the PNG is only a review/regeneration aid.
|
||||
|
||||
**Backup rule**: existing `prompts/…md` and `…png` files → rename with `-backup-YYYYMMDD-HHMMSS` suffix before regenerating.
|
||||
|
||||
Full step-by-step workflow (analysis, storyboard, review gates, regeneration variants): [references/workflow.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/workflow.md).
|
||||
|
||||
## References
|
||||
|
||||
**Core Templates**:
|
||||
- [analysis-framework.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/analysis-framework.md) - Deep content analysis
|
||||
- [character-template.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/character-template.md) - Character definition format
|
||||
- [storyboard-template.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/storyboard-template.md) - Storyboard structure
|
||||
- [ohmsha-guide.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/ohmsha-guide.md) - Ohmsha manga specifics
|
||||
|
||||
**Style Definitions**:
|
||||
- `references/art-styles/` - Art styles (ligne-claire, manga, realistic, ink-brush, chalk, minimalist)
|
||||
- `references/tones/` - Tones (neutral, warm, dramatic, romantic, energetic, vintage, action)
|
||||
- `references/presets/` - Presets with special rules (ohmsha, wuxia, shoujo, concept-story, four-panel)
|
||||
- `references/layouts/` - Layouts (standard, cinematic, dense, splash, mixed, webtoon, four-panel)
|
||||
|
||||
**Workflow**:
|
||||
- [workflow.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/workflow.md) - Full workflow details
|
||||
- [auto-selection.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/auto-selection.md) - Content signal analysis
|
||||
- [partial-workflows.md](https://github.com/NousResearch/hermes-agent/blob/main/skills/creative/baoyu-comic/references/partial-workflows.md) - Partial workflow options
|
||||
|
||||
## Page Modification
|
||||
|
||||
| Action | Steps |
|
||||
|--------|-------|
|
||||
| **Edit** | **Update prompt file FIRST** → regenerate image → download new PNG |
|
||||
| **Add** | Create prompt at position → generate with character descriptions embedded → renumber subsequent → update storyboard |
|
||||
| **Delete** | Remove files → renumber subsequent → update storyboard |
|
||||
|
||||
**IMPORTANT**: When updating pages, ALWAYS update the prompt file (`prompts/NN-{cover|page}-[slug].md`) FIRST before regenerating. This ensures changes are documented and reproducible.
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- Image generation: 10-30 seconds per page; auto-retry once on failure
|
||||
- **Always download** the URL returned by `image_generate` to a local PNG — downstream tooling (and the user's review) expects files in the output directory, not ephemeral URLs
|
||||
- **Use absolute paths for `curl -o`** — never rely on persistent-shell CWD across batches. Silent footgun: files land in the wrong directory and subsequent `ls` on the intended path shows nothing. See Step 7 "Download step".
|
||||
- Use stylized alternatives for sensitive public figures
|
||||
- **Step 2 confirmation required** - do not skip
|
||||
- **Steps 4/6 conditional** - only if user requested in Step 2
|
||||
- **Step 7.1 character sheet** - recommended for multi-page comics, optional for simple presets. The PNG is a review/regeneration aid; page prompts (written in Step 5) use the text descriptions in `characters/characters.md`, not the PNG. `image_generate` does not accept images as visual input
|
||||
- **Strip secrets** — scan source content for API keys, tokens, or credentials before writing any output file
|
||||
@@ -0,0 +1,253 @@
|
||||
---
|
||||
title: "Baoyu Infographic — Generate professional infographics with 21 layout types and 21 visual styles"
|
||||
sidebar_label: "Baoyu Infographic"
|
||||
description: "Generate professional infographics with 21 layout types and 21 visual styles"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Baoyu Infographic
|
||||
|
||||
Generate professional infographics with 21 layout types and 21 visual styles. Analyzes content, recommends layout×style combinations, and generates publication-ready infographics. Use when user asks to create "infographic", "visual summary", "信息图", "可视化", or "高密度信息大图".
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/baoyu-infographic` |
|
||||
| Version | `1.56.1` |
|
||||
| Author | 宝玉 (JimLiu) |
|
||||
| License | MIT |
|
||||
| Tags | `infographic`, `visual-summary`, `creative`, `image-generation` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Infographic Generator
|
||||
|
||||
Adapted from [baoyu-infographic](https://github.com/JimLiu/baoyu-skills) for Hermes Agent's tool ecosystem.
|
||||
|
||||
Two dimensions: **layout** (information structure) × **style** (visual aesthetics). Freely combine any layout with any style.
|
||||
|
||||
## When to Use
|
||||
|
||||
Trigger this skill when the user asks to create an infographic, visual summary, information graphic, or uses terms like "信息图", "可视化", or "高密度信息大图". The user provides content (text, file path, URL, or topic) and optionally specifies layout, style, aspect ratio, or language.
|
||||
|
||||
## Options
|
||||
|
||||
| Option | Values |
|
||||
|--------|--------|
|
||||
| Layout | 21 options (see Layout Gallery), default: bento-grid |
|
||||
| Style | 21 options (see Style Gallery), default: craft-handmade |
|
||||
| Aspect | Named: landscape (16:9), portrait (9:16), square (1:1). Custom: any W:H ratio (e.g., 3:4, 4:3, 2.35:1) |
|
||||
| Language | en, zh, ja, etc. |
|
||||
|
||||
## Layout Gallery
|
||||
|
||||
| Layout | Best For |
|
||||
|--------|----------|
|
||||
| `linear-progression` | Timelines, processes, tutorials |
|
||||
| `binary-comparison` | A vs B, before-after, pros-cons |
|
||||
| `comparison-matrix` | Multi-factor comparisons |
|
||||
| `hierarchical-layers` | Pyramids, priority levels |
|
||||
| `tree-branching` | Categories, taxonomies |
|
||||
| `hub-spoke` | Central concept with related items |
|
||||
| `structural-breakdown` | Exploded views, cross-sections |
|
||||
| `bento-grid` | Multiple topics, overview (default) |
|
||||
| `iceberg` | Surface vs hidden aspects |
|
||||
| `bridge` | Problem-solution |
|
||||
| `funnel` | Conversion, filtering |
|
||||
| `isometric-map` | Spatial relationships |
|
||||
| `dashboard` | Metrics, KPIs |
|
||||
| `periodic-table` | Categorized collections |
|
||||
| `comic-strip` | Narratives, sequences |
|
||||
| `story-mountain` | Plot structure, tension arcs |
|
||||
| `jigsaw` | Interconnected parts |
|
||||
| `venn-diagram` | Overlapping concepts |
|
||||
| `winding-roadmap` | Journey, milestones |
|
||||
| `circular-flow` | Cycles, recurring processes |
|
||||
| `dense-modules` | High-density modules, data-rich guides |
|
||||
|
||||
Full definitions: `references/layouts/<layout>.md`
|
||||
|
||||
## Style Gallery
|
||||
|
||||
| Style | Description |
|
||||
|-------|-------------|
|
||||
| `craft-handmade` | Hand-drawn, paper craft (default) |
|
||||
| `claymation` | 3D clay figures, stop-motion |
|
||||
| `kawaii` | Japanese cute, pastels |
|
||||
| `storybook-watercolor` | Soft painted, whimsical |
|
||||
| `chalkboard` | Chalk on black board |
|
||||
| `cyberpunk-neon` | Neon glow, futuristic |
|
||||
| `bold-graphic` | Comic style, halftone |
|
||||
| `aged-academia` | Vintage science, sepia |
|
||||
| `corporate-memphis` | Flat vector, vibrant |
|
||||
| `technical-schematic` | Blueprint, engineering |
|
||||
| `origami` | Folded paper, geometric |
|
||||
| `pixel-art` | Retro 8-bit |
|
||||
| `ui-wireframe` | Grayscale interface mockup |
|
||||
| `subway-map` | Transit diagram |
|
||||
| `ikea-manual` | Minimal line art |
|
||||
| `knolling` | Organized flat-lay |
|
||||
| `lego-brick` | Toy brick construction |
|
||||
| `pop-laboratory` | Blueprint grid, coordinate markers, lab precision |
|
||||
| `morandi-journal` | Hand-drawn doodle, warm Morandi tones |
|
||||
| `retro-pop-grid` | 1970s retro pop art, Swiss grid, thick outlines |
|
||||
| `hand-drawn-edu` | Macaron pastels, hand-drawn wobble, stick figures |
|
||||
|
||||
Full definitions: `references/styles/<style>.md`
|
||||
|
||||
## Recommended Combinations
|
||||
|
||||
| Content Type | Layout + Style |
|
||||
|--------------|----------------|
|
||||
| Timeline/History | `linear-progression` + `craft-handmade` |
|
||||
| Step-by-step | `linear-progression` + `ikea-manual` |
|
||||
| A vs B | `binary-comparison` + `corporate-memphis` |
|
||||
| Hierarchy | `hierarchical-layers` + `craft-handmade` |
|
||||
| Overlap | `venn-diagram` + `craft-handmade` |
|
||||
| Conversion | `funnel` + `corporate-memphis` |
|
||||
| Cycles | `circular-flow` + `craft-handmade` |
|
||||
| Technical | `structural-breakdown` + `technical-schematic` |
|
||||
| Metrics | `dashboard` + `corporate-memphis` |
|
||||
| Educational | `bento-grid` + `chalkboard` |
|
||||
| Journey | `winding-roadmap` + `storybook-watercolor` |
|
||||
| Categories | `periodic-table` + `bold-graphic` |
|
||||
| Product Guide | `dense-modules` + `morandi-journal` |
|
||||
| Technical Guide | `dense-modules` + `pop-laboratory` |
|
||||
| Trendy Guide | `dense-modules` + `retro-pop-grid` |
|
||||
| Educational Diagram | `hub-spoke` + `hand-drawn-edu` |
|
||||
| Process Tutorial | `linear-progression` + `hand-drawn-edu` |
|
||||
|
||||
Default: `bento-grid` + `craft-handmade`
|
||||
|
||||
## Keyword Shortcuts
|
||||
|
||||
When user input contains these keywords, **auto-select** the associated layout and offer associated styles as top recommendations in Step 3. Skip content-based layout inference for matched keywords.
|
||||
|
||||
If a shortcut has **Prompt Notes**, append them to the generated prompt (Step 5) as additional style instructions.
|
||||
|
||||
| User Keyword | Layout | Recommended Styles | Default Aspect | Prompt Notes |
|
||||
|--------------|--------|--------------------|----------------|--------------|
|
||||
| 高密度信息大图 / high-density-info | `dense-modules` | `morandi-journal`, `pop-laboratory`, `retro-pop-grid` | portrait | — |
|
||||
| 信息图 / infographic | `bento-grid` | `craft-handmade` | landscape | Minimalist: clean canvas, ample whitespace, no complex background textures. Simple cartoon elements and icons only. |
|
||||
|
||||
## Output Structure
|
||||
|
||||
```
|
||||
infographic/{topic-slug}/
|
||||
├── source-{slug}.{ext}
|
||||
├── analysis.md
|
||||
├── structured-content.md
|
||||
├── prompts/infographic.md
|
||||
└── infographic.png
|
||||
```
|
||||
|
||||
Slug: 2-4 words kebab-case from topic. Conflict: append `-YYYYMMDD-HHMMSS`.
|
||||
|
||||
## Core Principles
|
||||
|
||||
- Preserve source data faithfully — no summarization or rephrasing (but **strip any credentials, API keys, tokens, or secrets** before including in outputs)
|
||||
- Define learning objectives before structuring content
|
||||
- Structure for visual communication (headlines, labels, visual elements)
|
||||
|
||||
## Workflow
|
||||
|
||||
### Step 1: Analyze Content
|
||||
|
||||
**Load references**: Read `references/analysis-framework.md` from this skill.
|
||||
|
||||
1. Save source content (file path or paste → `source.md` using `write_file`)
|
||||
- **Backup rule**: If `source.md` exists, rename to `source-backup-YYYYMMDD-HHMMSS.md`
|
||||
2. Analyze: topic, data type, complexity, tone, audience
|
||||
3. Detect source language and user language
|
||||
4. Extract design instructions from user input
|
||||
5. Save analysis to `analysis.md`
|
||||
- **Backup rule**: If `analysis.md` exists, rename to `analysis-backup-YYYYMMDD-HHMMSS.md`
|
||||
|
||||
See `references/analysis-framework.md` for detailed format.
|
||||
|
||||
### Step 2: Generate Structured Content → `structured-content.md`
|
||||
|
||||
Transform content into infographic structure:
|
||||
1. Title and learning objectives
|
||||
2. Sections with: key concept, content (verbatim), visual element, text labels
|
||||
3. Data points (all statistics/quotes copied exactly)
|
||||
4. Design instructions from user
|
||||
|
||||
**Rules**: Markdown only. No new information. Preserve data faithfully. Strip any credentials or secrets from output.
|
||||
|
||||
See `references/structured-content-template.md` for detailed format.
|
||||
|
||||
### Step 3: Recommend Combinations
|
||||
|
||||
**3.1 Check Keyword Shortcuts first**: If user input matches a keyword from the **Keyword Shortcuts** table, auto-select the associated layout and prioritize associated styles as top recommendations. Skip content-based layout inference.
|
||||
|
||||
**3.2 Otherwise**, recommend 3-5 layout×style combinations based on:
|
||||
- Data structure → matching layout
|
||||
- Content tone → matching style
|
||||
- Audience expectations
|
||||
- User design instructions
|
||||
|
||||
### Step 4: Confirm Options
|
||||
|
||||
Use the `clarify` tool to confirm options with the user. Since `clarify` handles one question at a time, ask the most important question first:
|
||||
|
||||
**Q1 — Combination**: Present 3+ layout×style combos with rationale. Ask user to pick one.
|
||||
|
||||
**Q2 — Aspect**: Ask for aspect ratio preference (landscape/portrait/square or custom W:H).
|
||||
|
||||
**Q3 — Language** (only if source ≠ user language): Ask which language the text content should use.
|
||||
|
||||
### Step 5: Generate Prompt → `prompts/infographic.md`
|
||||
|
||||
**Backup rule**: If `prompts/infographic.md` exists, rename to `prompts/infographic-backup-YYYYMMDD-HHMMSS.md`
|
||||
|
||||
**Load references**: Read the selected layout from `references/layouts/<layout>.md` and style from `references/styles/<style>.md`.
|
||||
|
||||
Combine:
|
||||
1. Layout definition from `references/layouts/<layout>.md`
|
||||
2. Style definition from `references/styles/<style>.md`
|
||||
3. Base template from `references/base-prompt.md`
|
||||
4. Structured content from Step 2
|
||||
5. All text in confirmed language
|
||||
|
||||
**Aspect ratio resolution** for `{{ASPECT_RATIO}}`:
|
||||
- Named presets → ratio string: landscape→`16:9`, portrait→`9:16`, square→`1:1`
|
||||
- Custom W:H ratios → use as-is (e.g., `3:4`, `4:3`, `2.35:1`)
|
||||
|
||||
Save the assembled prompt to `prompts/infographic.md` using `write_file`.
|
||||
|
||||
### Step 6: Generate Image
|
||||
|
||||
Use the `image_generate` tool with the assembled prompt from Step 5.
|
||||
|
||||
- Map aspect ratio to image_generate's format: `16:9` → `landscape`, `9:16` → `portrait`, `1:1` → `square`
|
||||
- For custom ratios, pick the closest named aspect
|
||||
- On failure, auto-retry once
|
||||
- Save the resulting image URL/path to the output directory
|
||||
|
||||
### Step 7: Output Summary
|
||||
|
||||
Report: topic, layout, style, aspect, language, output path, files created.
|
||||
|
||||
## References
|
||||
|
||||
- `references/analysis-framework.md` — Analysis methodology
|
||||
- `references/structured-content-template.md` — Content format
|
||||
- `references/base-prompt.md` — Prompt template
|
||||
- `references/layouts/<layout>.md` — 21 layout definitions
|
||||
- `references/styles/<style>.md` — 21 style definitions
|
||||
|
||||
## Pitfalls
|
||||
|
||||
1. **Data integrity is paramount** — never summarize, paraphrase, or alter source statistics. "73% increase" must stay "73% increase", not "significant increase".
|
||||
2. **Strip secrets** — always scan source content for API keys, tokens, or credentials before including in any output file.
|
||||
3. **One message per section** — each infographic section should convey one clear concept. Overloading sections reduces readability.
|
||||
4. **Style consistency** — the style definition from the references file must be applied consistently across the entire infographic. Don't mix styles.
|
||||
5. **image_generate aspect ratios** — the tool only supports `landscape`, `portrait`, and `square`. Custom ratios like `3:4` should map to the nearest option (portrait in that case).
|
||||
@@ -0,0 +1,162 @@
|
||||
---
|
||||
title: "Ideation — Generate project ideas through creative constraints"
|
||||
sidebar_label: "Ideation"
|
||||
description: "Generate project ideas through creative constraints"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Ideation
|
||||
|
||||
Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/creative-ideation` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | SHL0MS |
|
||||
| License | MIT |
|
||||
| Tags | `Creative`, `Ideation`, `Projects`, `Brainstorming`, `Inspiration` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Creative Ideation
|
||||
|
||||
Generate project ideas through creative constraints. Constraint + direction = creativity.
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Pick a constraint** from the library below — random, or matched to the user's domain/mood
|
||||
2. **Interpret it broadly** — a coding prompt can become a hardware project, an art prompt can become a CLI tool
|
||||
3. **Generate 3 concrete project ideas** that satisfy the constraint
|
||||
4. **If they pick one, build it** — create the project, write the code, ship it
|
||||
|
||||
## The Rule
|
||||
|
||||
Every prompt is interpreted as broadly as possible. "Does this include X?" → Yes. The prompts provide direction and mild constraint. Without either, there is no creativity.
|
||||
|
||||
## Constraint Library
|
||||
|
||||
### For Developers
|
||||
|
||||
**Solve your own itch:**
|
||||
Build the tool you wished existed this week. Under 50 lines. Ship it today.
|
||||
|
||||
**Automate the annoying thing:**
|
||||
What's the most tedious part of your workflow? Script it away. Two hours to fix a problem that costs you five minutes a day.
|
||||
|
||||
**The CLI tool that should exist:**
|
||||
Think of a command you've wished you could type. `git undo-that-thing-i-just-did`. `docker why-is-this-broken`. `npm explain-yourself`. Now build it.
|
||||
|
||||
**Nothing new except glue:**
|
||||
Make something entirely from existing APIs, libraries, and datasets. The only original contribution is how you connect them.
|
||||
|
||||
**Frankenstein week:**
|
||||
Take something that does X and make it do Y. A git repo that plays music. A Dockerfile that generates poetry. A cron job that sends compliments.
|
||||
|
||||
**Subtract:**
|
||||
How much can you remove from a codebase before it breaks? Strip a tool to its minimum viable function. Delete until only the essence remains.
|
||||
|
||||
**High concept, low effort:**
|
||||
A deep idea, lazily executed. The concept should be brilliant. The implementation should take an afternoon. If it takes longer, you're overthinking it.
|
||||
|
||||
### For Makers & Artists
|
||||
|
||||
**Blatantly copy something:**
|
||||
Pick something you admire — a tool, an artwork, an interface. Recreate it from scratch. The learning is in the gap between your version and theirs.
|
||||
|
||||
**One million of something:**
|
||||
One million is both a lot and not that much. One million pixels is a 1MB photo. One million API calls is a Tuesday. One million of anything becomes interesting at scale.
|
||||
|
||||
**Make something that dies:**
|
||||
A website that loses a feature every day. A chatbot that forgets. A countdown to nothing. An exercise in rot, killing, or letting go.
|
||||
|
||||
**Do a lot of math:**
|
||||
Generative geometry, shader golf, mathematical art, computational origami. Time to re-learn what an arcsin is.
|
||||
|
||||
### For Anyone
|
||||
|
||||
**Text is the universal interface:**
|
||||
Build something where text is the only interface. No buttons, no graphics, just words in and words out. Text can go in and out of almost anything.
|
||||
|
||||
**Start at the punchline:**
|
||||
Think of something that would be a funny sentence. Work backwards to make it real. "I taught my thermostat to gaslight me" → now build it.
|
||||
|
||||
**Hostile UI:**
|
||||
Make something intentionally painful to use. A password field that requires 47 conditions. A form where every label lies. A CLI that judges your commands.
|
||||
|
||||
**Take two:**
|
||||
Remember an old project. Do it again from scratch. No looking at the original. See what changed about how you think.
|
||||
|
||||
See `references/full-prompt-library.md` for 30+ additional constraints across communication, scale, philosophy, transformation, and more.
|
||||
|
||||
## Matching Constraints to Users
|
||||
|
||||
| User says | Pick from |
|
||||
|-----------|-----------|
|
||||
| "I want to build something" (no direction) | Random — any constraint |
|
||||
| "I'm learning [language]" | Blatantly copy something, Automate the annoying thing |
|
||||
| "I want something weird" | Hostile UI, Frankenstein week, Start at the punchline |
|
||||
| "I want something useful" | Solve your own itch, The CLI that should exist, Automate the annoying thing |
|
||||
| "I want something beautiful" | Do a lot of math, One million of something |
|
||||
| "I'm burned out" | High concept low effort, Make something that dies |
|
||||
| "Weekend project" | Nothing new except glue, Start at the punchline |
|
||||
| "I want a challenge" | One million of something, Subtract, Take two |
|
||||
|
||||
## Output Format
|
||||
|
||||
```
|
||||
## Constraint: [Name]
|
||||
> [The constraint, one sentence]
|
||||
|
||||
### Ideas
|
||||
|
||||
1. **[One-line pitch]**
|
||||
[2-3 sentences: what you'd build and why it's interesting]
|
||||
⏱ [weekend / week / month] • 🔧 [stack]
|
||||
|
||||
2. **[One-line pitch]**
|
||||
[2-3 sentences]
|
||||
⏱ ... • 🔧 ...
|
||||
|
||||
3. **[One-line pitch]**
|
||||
[2-3 sentences]
|
||||
⏱ ... • 🔧 ...
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
```
|
||||
## Constraint: The CLI tool that should exist
|
||||
> Think of a command you've wished you could type. Now build it.
|
||||
|
||||
### Ideas
|
||||
|
||||
1. **`git whatsup` — show what happened while you were away**
|
||||
Compares your last active commit to HEAD and summarizes what changed,
|
||||
who committed, and what PRs merged. Like a morning standup from your repo.
|
||||
⏱ weekend • 🔧 Python, GitPython, click
|
||||
|
||||
2. **`explain 503` — HTTP status codes for humans**
|
||||
Pipe any status code or error message and get a plain-English explanation
|
||||
with common causes and fixes. Pulls from a curated database, not an LLM.
|
||||
⏱ weekend • 🔧 Rust or Go, static dataset
|
||||
|
||||
3. **`deps why <package>` — why is this in my dependency tree**
|
||||
Traces a transitive dependency back to the direct dependency that pulled
|
||||
it in. Answers "why do I have 47 copies of lodash" in one command.
|
||||
⏱ weekend • 🔧 Node.js, npm/yarn lockfile parsing
|
||||
```
|
||||
|
||||
After the user picks one, start building — create the project, write the code, iterate.
|
||||
|
||||
## Attribution
|
||||
|
||||
Constraint approach inspired by [wttdotm.com/prompts.html](https://wttdotm.com/prompts.html). Adapted and expanded for software development and general-purpose ideation.
|
||||
@@ -0,0 +1,214 @@
|
||||
---
|
||||
title: "Design Md — Author, validate, diff, and export DESIGN"
|
||||
sidebar_label: "Design Md"
|
||||
description: "Author, validate, diff, and export DESIGN"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Design Md
|
||||
|
||||
Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system, porting style rules between projects, generating UI with consistent brand, or auditing accessibility/contrast.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/design-md` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `design`, `design-system`, `tokens`, `ui`, `accessibility`, `wcag`, `tailwind`, `dtcg`, `google` |
|
||||
| Related skills | [`popular-web-designs`](/docs/user-guide/skills/bundled/creative/creative-popular-web-designs), [`excalidraw`](/docs/user-guide/skills/bundled/creative/creative-excalidraw), [`architecture-diagram`](/docs/user-guide/skills/bundled/creative/creative-architecture-diagram) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# DESIGN.md Skill
|
||||
|
||||
DESIGN.md is Google's open spec (Apache-2.0, `google-labs-code/design.md`) for
|
||||
describing a visual identity to coding agents. One file combines:
|
||||
|
||||
- **YAML front matter** — machine-readable design tokens (normative values)
|
||||
- **Markdown body** — human-readable rationale, organized into canonical sections
|
||||
|
||||
Tokens give exact values. Prose tells agents *why* those values exist and how to
|
||||
apply them. The CLI (`npx @google/design.md`) lints structure + WCAG contrast,
|
||||
diffs versions for regressions, and exports to Tailwind or W3C DTCG JSON.
|
||||
|
||||
## When to use this skill
|
||||
|
||||
- User asks for a DESIGN.md file, design tokens, or a design system spec
|
||||
- User wants consistent UI/brand across multiple projects or tools
|
||||
- User pastes an existing DESIGN.md and asks to lint, diff, export, or extend it
|
||||
- User asks to port a style guide into a format agents can consume
|
||||
- User wants contrast / WCAG accessibility validation on their color palette
|
||||
|
||||
For purely visual inspiration or layout examples, use `popular-web-designs`
|
||||
instead. This skill is for the *formal spec file* itself.
|
||||
|
||||
## File anatomy
|
||||
|
||||
```md
|
||||
---
|
||||
version: alpha
|
||||
name: Heritage
|
||||
description: Architectural minimalism meets journalistic gravitas.
|
||||
colors:
|
||||
primary: "#1A1C1E"
|
||||
secondary: "#6C7278"
|
||||
tertiary: "#B8422E"
|
||||
neutral: "#F7F5F2"
|
||||
typography:
|
||||
h1:
|
||||
fontFamily: Public Sans
|
||||
fontSize: 3rem
|
||||
fontWeight: 700
|
||||
lineHeight: 1.1
|
||||
letterSpacing: "-0.02em"
|
||||
body-md:
|
||||
fontFamily: Public Sans
|
||||
fontSize: 1rem
|
||||
rounded:
|
||||
sm: 4px
|
||||
md: 8px
|
||||
lg: 16px
|
||||
spacing:
|
||||
sm: 8px
|
||||
md: 16px
|
||||
lg: 24px
|
||||
components:
|
||||
button-primary:
|
||||
backgroundColor: "{colors.tertiary}"
|
||||
textColor: "#FFFFFF"
|
||||
rounded: "{rounded.sm}"
|
||||
padding: 12px
|
||||
button-primary-hover:
|
||||
backgroundColor: "{colors.primary}"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Architectural Minimalism meets Journalistic Gravitas...
|
||||
|
||||
## Colors
|
||||
|
||||
- **Primary (#1A1C1E):** Deep ink for headlines and core text.
|
||||
- **Tertiary (#B8422E):** "Boston Clay" — the sole driver for interaction.
|
||||
|
||||
## Typography
|
||||
|
||||
Public Sans for everything except small all-caps labels...
|
||||
|
||||
## Components
|
||||
|
||||
`button-primary` is the only high-emphasis action on a page...
|
||||
```
|
||||
|
||||
## Token types
|
||||
|
||||
| Type | Format | Example |
|
||||
|------|--------|---------|
|
||||
| Color | `#` + hex (sRGB) | `"#1A1C1E"` |
|
||||
| Dimension | number + unit (`px`, `em`, `rem`) | `48px`, `-0.02em` |
|
||||
| Token reference | `{path.to.token}` | `{colors.primary}` |
|
||||
| Typography | object with `fontFamily`, `fontSize`, `fontWeight`, `lineHeight`, `letterSpacing`, `fontFeature`, `fontVariation` | see above |
|
||||
|
||||
Component property whitelist: `backgroundColor`, `textColor`, `typography`,
|
||||
`rounded`, `padding`, `size`, `height`, `width`. Variants (hover, active,
|
||||
pressed) are **separate component entries** with related key names
|
||||
(`button-primary-hover`), not nested.
|
||||
|
||||
## Canonical section order
|
||||
|
||||
Sections are optional, but present ones MUST appear in this order. Duplicate
|
||||
headings reject the file.
|
||||
|
||||
1. Overview (alias: Brand & Style)
|
||||
2. Colors
|
||||
3. Typography
|
||||
4. Layout (alias: Layout & Spacing)
|
||||
5. Elevation & Depth (alias: Elevation)
|
||||
6. Shapes
|
||||
7. Components
|
||||
8. Do's and Don'ts
|
||||
|
||||
Unknown sections are preserved, not errored. Unknown token names are accepted
|
||||
if the value type is valid. Unknown component properties produce a warning.
|
||||
|
||||
## Workflow: authoring a new DESIGN.md
|
||||
|
||||
1. **Ask the user** (or infer) the brand tone, accent color, and typography
|
||||
direction. If they provided a site, image, or vibe, translate it to the
|
||||
token shape above.
|
||||
2. **Write `DESIGN.md`** in their project root using `write_file`. Always
|
||||
include `name:` and `colors:`; other sections optional but encouraged.
|
||||
3. **Use token references** (`{colors.primary}`) in the `components:` section
|
||||
instead of re-typing hex values. Keeps the palette single-source.
|
||||
4. **Lint it** (see below). Fix any broken references or WCAG failures
|
||||
before returning.
|
||||
5. **If the user has an existing project**, also write Tailwind or DTCG
|
||||
exports next to the file (`tailwind.theme.json`, `tokens.json`).
|
||||
|
||||
## Workflow: lint / diff / export
|
||||
|
||||
The CLI is `@google/design.md` (Node). Use `npx` — no global install needed.
|
||||
|
||||
```bash
|
||||
# Validate structure + token references + WCAG contrast
|
||||
npx -y @google/design.md lint DESIGN.md
|
||||
|
||||
# Compare two versions, fail on regression (exit 1 = regression)
|
||||
npx -y @google/design.md diff DESIGN.md DESIGN-v2.md
|
||||
|
||||
# Export to Tailwind theme JSON
|
||||
npx -y @google/design.md export --format tailwind DESIGN.md > tailwind.theme.json
|
||||
|
||||
# Export to W3C DTCG (Design Tokens Format Module) JSON
|
||||
npx -y @google/design.md export --format dtcg DESIGN.md > tokens.json
|
||||
|
||||
# Print the spec itself — useful when injecting into an agent prompt
|
||||
npx -y @google/design.md spec --rules-only --format json
|
||||
```
|
||||
|
||||
All commands accept `-` for stdin. `lint` returns exit 1 on errors. Use the
|
||||
`--format json` flag and parse the output if you need to report findings
|
||||
structurally.
|
||||
|
||||
### Lint rule reference (what the 7 rules catch)
|
||||
|
||||
- `broken-ref` (error) — `{colors.missing}` points at a non-existent token
|
||||
- `duplicate-section` (error) — same `## Heading` appears twice
|
||||
- `invalid-color`, `invalid-dimension`, `invalid-typography` (error)
|
||||
- `wcag-contrast` (warning/info) — component `textColor` vs `backgroundColor`
|
||||
ratio against WCAG AA (4.5:1) and AAA (7:1)
|
||||
- `unknown-component-property` (warning) — outside the whitelist above
|
||||
|
||||
When the user cares about accessibility, call this out explicitly in your
|
||||
summary — WCAG findings are the most load-bearing reason to use the CLI.
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Don't nest component variants.** `button-primary.hover` is wrong;
|
||||
`button-primary-hover` as a sibling key is right.
|
||||
- **Hex colors must be quoted strings.** YAML will otherwise choke on `#` or
|
||||
truncate values like `#1A1C1E` oddly.
|
||||
- **Negative dimensions need quotes too.** `letterSpacing: -0.02em` parses as
|
||||
a YAML flow — write `letterSpacing: "-0.02em"`.
|
||||
- **Section order is enforced.** If the user gives you prose in a random order,
|
||||
reorder it to match the canonical list before saving.
|
||||
- **`version: alpha` is the current spec version** (as of Apr 2026). The spec
|
||||
is marked alpha — watch for breaking changes.
|
||||
- **Token references resolve by dotted path.** `{colors.primary}` works;
|
||||
`{primary}` does not.
|
||||
|
||||
## Spec source of truth
|
||||
|
||||
- Repo: https://github.com/google-labs-code/design.md (Apache-2.0)
|
||||
- CLI: `@google/design.md` on npm
|
||||
- License of generated DESIGN.md files: whatever the user's project uses;
|
||||
the spec itself is Apache-2.0.
|
||||
@@ -0,0 +1,207 @@
|
||||
---
|
||||
title: "Excalidraw — Create hand-drawn style diagrams using Excalidraw JSON format"
|
||||
sidebar_label: "Excalidraw"
|
||||
description: "Create hand-drawn style diagrams using Excalidraw JSON format"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Excalidraw
|
||||
|
||||
Create hand-drawn style diagrams using Excalidraw JSON format. Generate .excalidraw files for architecture diagrams, flowcharts, sequence diagrams, concept maps, and more. Files can be opened at excalidraw.com or uploaded for shareable links.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/excalidraw` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `Excalidraw`, `Diagrams`, `Flowcharts`, `Architecture`, `Visualization`, `JSON` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Excalidraw Diagram Skill
|
||||
|
||||
Create diagrams by writing standard Excalidraw element JSON and saving as `.excalidraw` files. These files can be drag-and-dropped onto [excalidraw.com](https://excalidraw.com) for viewing and editing. No accounts, no API keys, no rendering libraries -- just JSON.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Load this skill** (you already did)
|
||||
2. **Write the elements JSON** -- an array of Excalidraw element objects
|
||||
3. **Save the file** using `write_file` to create a `.excalidraw` file
|
||||
4. **Optionally upload** for a shareable link using `scripts/upload.py` via `terminal`
|
||||
|
||||
### Saving a Diagram
|
||||
|
||||
Wrap your elements array in the standard `.excalidraw` envelope and save with `write_file`:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "excalidraw",
|
||||
"version": 2,
|
||||
"source": "hermes-agent",
|
||||
"elements": [ ...your elements array here... ],
|
||||
"appState": {
|
||||
"viewBackgroundColor": "#ffffff"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Save to any path, e.g. `~/diagrams/my_diagram.excalidraw`.
|
||||
|
||||
### Uploading for a Shareable Link
|
||||
|
||||
Run the upload script (located in this skill's `scripts/` directory) via terminal:
|
||||
|
||||
```bash
|
||||
python skills/diagramming/excalidraw/scripts/upload.py ~/diagrams/my_diagram.excalidraw
|
||||
```
|
||||
|
||||
This uploads to excalidraw.com (no account needed) and prints a shareable URL. Requires the `cryptography` pip package (`pip install cryptography`).
|
||||
|
||||
---
|
||||
|
||||
## Element Format Reference
|
||||
|
||||
### Required Fields (all elements)
|
||||
`type`, `id` (unique string), `x`, `y`, `width`, `height`
|
||||
|
||||
### Defaults (skip these -- they're applied automatically)
|
||||
- `strokeColor`: `"#1e1e1e"`
|
||||
- `backgroundColor`: `"transparent"`
|
||||
- `fillStyle`: `"solid"`
|
||||
- `strokeWidth`: `2`
|
||||
- `roughness`: `1` (hand-drawn look)
|
||||
- `opacity`: `100`
|
||||
|
||||
Canvas background is white.
|
||||
|
||||
### Element Types
|
||||
|
||||
**Rectangle**:
|
||||
```json
|
||||
{ "type": "rectangle", "id": "r1", "x": 100, "y": 100, "width": 200, "height": 100 }
|
||||
```
|
||||
- `roundness: { "type": 3 }` for rounded corners
|
||||
- `backgroundColor: "#a5d8ff"`, `fillStyle: "solid"` for filled
|
||||
|
||||
**Ellipse**:
|
||||
```json
|
||||
{ "type": "ellipse", "id": "e1", "x": 100, "y": 100, "width": 150, "height": 150 }
|
||||
```
|
||||
|
||||
**Diamond**:
|
||||
```json
|
||||
{ "type": "diamond", "id": "d1", "x": 100, "y": 100, "width": 150, "height": 150 }
|
||||
```
|
||||
|
||||
**Labeled shape (container binding)** -- create a text element bound to the shape:
|
||||
|
||||
> **WARNING:** Do NOT use `"label": { "text": "..." }` on shapes. This is NOT a valid
|
||||
> Excalidraw property and will be silently ignored, producing blank shapes. You MUST
|
||||
> use the container binding approach below.
|
||||
|
||||
The shape needs `boundElements` listing the text, and the text needs `containerId` pointing back:
|
||||
```json
|
||||
{ "type": "rectangle", "id": "r1", "x": 100, "y": 100, "width": 200, "height": 80,
|
||||
"roundness": { "type": 3 }, "backgroundColor": "#a5d8ff", "fillStyle": "solid",
|
||||
"boundElements": [{ "id": "t_r1", "type": "text" }] },
|
||||
{ "type": "text", "id": "t_r1", "x": 105, "y": 110, "width": 190, "height": 25,
|
||||
"text": "Hello", "fontSize": 20, "fontFamily": 1, "strokeColor": "#1e1e1e",
|
||||
"textAlign": "center", "verticalAlign": "middle",
|
||||
"containerId": "r1", "originalText": "Hello", "autoResize": true }
|
||||
```
|
||||
- Works on rectangle, ellipse, diamond
|
||||
- Text is auto-centered by Excalidraw when `containerId` is set
|
||||
- The text `x`/`y`/`width`/`height` are approximate -- Excalidraw recalculates them on load
|
||||
- `originalText` should match `text`
|
||||
- Always include `fontFamily: 1` (Virgil/hand-drawn font)
|
||||
|
||||
**Labeled arrow** -- same container binding approach:
|
||||
```json
|
||||
{ "type": "arrow", "id": "a1", "x": 300, "y": 150, "width": 200, "height": 0,
|
||||
"points": [[0,0],[200,0]], "endArrowhead": "arrow",
|
||||
"boundElements": [{ "id": "t_a1", "type": "text" }] },
|
||||
{ "type": "text", "id": "t_a1", "x": 370, "y": 130, "width": 60, "height": 20,
|
||||
"text": "connects", "fontSize": 16, "fontFamily": 1, "strokeColor": "#1e1e1e",
|
||||
"textAlign": "center", "verticalAlign": "middle",
|
||||
"containerId": "a1", "originalText": "connects", "autoResize": true }
|
||||
```
|
||||
|
||||
**Standalone text** (titles and annotations only -- no container):
|
||||
```json
|
||||
{ "type": "text", "id": "t1", "x": 150, "y": 138, "text": "Hello", "fontSize": 20,
|
||||
"fontFamily": 1, "strokeColor": "#1e1e1e", "originalText": "Hello", "autoResize": true }
|
||||
```
|
||||
- `x` is the LEFT edge. To center at position `cx`: `x = cx - (text.length * fontSize * 0.5) / 2`
|
||||
- Do NOT rely on `textAlign` or `width` for positioning
|
||||
|
||||
**Arrow**:
|
||||
```json
|
||||
{ "type": "arrow", "id": "a1", "x": 300, "y": 150, "width": 200, "height": 0,
|
||||
"points": [[0,0],[200,0]], "endArrowhead": "arrow" }
|
||||
```
|
||||
- `points`: `[dx, dy]` offsets from element `x`, `y`
|
||||
- `endArrowhead`: `null` | `"arrow"` | `"bar"` | `"dot"` | `"triangle"`
|
||||
- `strokeStyle`: `"solid"` (default) | `"dashed"` | `"dotted"`
|
||||
|
||||
### Arrow Bindings (connect arrows to shapes)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "arrow", "id": "a1", "x": 300, "y": 150, "width": 150, "height": 0,
|
||||
"points": [[0,0],[150,0]], "endArrowhead": "arrow",
|
||||
"startBinding": { "elementId": "r1", "fixedPoint": [1, 0.5] },
|
||||
"endBinding": { "elementId": "r2", "fixedPoint": [0, 0.5] }
|
||||
}
|
||||
```
|
||||
|
||||
`fixedPoint` coordinates: `top=[0.5,0]`, `bottom=[0.5,1]`, `left=[0,0.5]`, `right=[1,0.5]`
|
||||
|
||||
### Drawing Order (z-order)
|
||||
- Array order = z-order (first = back, last = front)
|
||||
- Emit progressively: background zones → shape → its bound text → its arrows → next shape
|
||||
- BAD: all rectangles, then all texts, then all arrows
|
||||
- GOOD: bg_zone → shape1 → text_for_shape1 → arrow1 → arrow_label_text → shape2 → text_for_shape2 → ...
|
||||
- Always place the bound text element immediately after its container shape
|
||||
|
||||
### Sizing Guidelines
|
||||
|
||||
**Font sizes:**
|
||||
- Minimum `fontSize`: **16** for body text, labels, descriptions
|
||||
- Minimum `fontSize`: **20** for titles and headings
|
||||
- Minimum `fontSize`: **14** for secondary annotations only (sparingly)
|
||||
- NEVER use `fontSize` below 14
|
||||
|
||||
**Element sizes:**
|
||||
- Minimum shape size: 120x60 for labeled rectangles/ellipses
|
||||
- Leave 20-30px gaps between elements minimum
|
||||
- Prefer fewer, larger elements over many tiny ones
|
||||
|
||||
### Color Palette
|
||||
|
||||
See `references/colors.md` for full color tables. Quick reference:
|
||||
|
||||
| Use | Fill Color | Hex |
|
||||
|-----|-----------|-----|
|
||||
| Primary / Input | Light Blue | `#a5d8ff` |
|
||||
| Success / Output | Light Green | `#b2f2bb` |
|
||||
| Warning / External | Light Orange | `#ffd8a8` |
|
||||
| Processing / Special | Light Purple | `#d0bfff` |
|
||||
| Error / Critical | Light Red | `#ffc9c9` |
|
||||
| Notes / Decisions | Light Yellow | `#fff3bf` |
|
||||
| Storage / Data | Light Teal | `#c3fae8` |
|
||||
|
||||
### Tips
|
||||
- Use the color palette consistently across the diagram
|
||||
- **Text contrast is CRITICAL** -- never use light gray on white backgrounds. Minimum text color on white: `#757575`
|
||||
- Do NOT use emoji in text -- they don't render in Excalidraw's font
|
||||
- For dark mode diagrams, see `references/dark-mode.md`
|
||||
- For larger examples, see `references/examples.md`
|
||||
@@ -0,0 +1,284 @@
|
||||
---
|
||||
title: "Manim Video — Production pipeline for mathematical and technical animations using Manim Community Edition"
|
||||
sidebar_label: "Manim Video"
|
||||
description: "Production pipeline for mathematical and technical animations using Manim Community Edition"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Manim Video
|
||||
|
||||
Production pipeline for mathematical and technical animations using Manim Community Edition. Creates 3Blue1Brown-style explainer videos, algorithm visualizations, equation derivations, architecture diagrams, and data stories. Use when users request: animated explanations, math animations, concept visualizations, algorithm walkthroughs, technical explainers, 3Blue1Brown style videos, or any programmatic animation with geometric/mathematical content.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/manim-video` |
|
||||
| Version | `1.0.0` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Manim Video Production Pipeline
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is educational cinema. Every frame teaches. Every animation reveals structure.
|
||||
|
||||
**Before writing a single line of code**, articulate the narrative arc. What misconception does this correct? What is the "aha moment"? What visual story takes the viewer from confusion to understanding? The user's prompt is a starting point — interpret it with pedagogical ambition.
|
||||
|
||||
**Geometry before algebra.** Show the shape first, the equation second. Visual memory encodes faster than symbolic memory. When the viewer sees the geometric pattern before the formula, the equation feels earned.
|
||||
|
||||
**First-render excellence is non-negotiable.** The output must be visually clear and aesthetically cohesive without revision rounds. If something looks cluttered, poorly timed, or like "AI-generated slides," it is wrong.
|
||||
|
||||
**Opacity layering directs attention.** Never show everything at full brightness. Primary elements at 1.0, contextual elements at 0.4, structural elements (axes, grids) at 0.15. The brain processes visual salience in layers.
|
||||
|
||||
**Breathing room.** Every animation needs `self.wait()` after it. The viewer needs time to absorb what just appeared. Never rush from one animation to the next. A 2-second pause after a key reveal is never wasted.
|
||||
|
||||
**Cohesive visual language.** All scenes share a color palette, consistent typography sizing, matching animation speeds. A technically correct video where every scene uses random different colors is an aesthetic failure.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Run `scripts/setup.sh` to verify all dependencies. Requires: Python 3.10+, Manim Community Edition v0.20+ (`pip install manim`), LaTeX (`texlive-full` on Linux, `mactex` on macOS), and ffmpeg. Reference docs tested against Manim CE v0.20.1.
|
||||
|
||||
## Modes
|
||||
|
||||
| Mode | Input | Output | Reference |
|
||||
|------|-------|--------|-----------|
|
||||
| **Concept explainer** | Topic/concept | Animated explanation with geometric intuition | `references/scene-planning.md` |
|
||||
| **Equation derivation** | Math expressions | Step-by-step animated proof | `references/equations.md` |
|
||||
| **Algorithm visualization** | Algorithm description | Step-by-step execution with data structures | `references/graphs-and-data.md` |
|
||||
| **Data story** | Data/metrics | Animated charts, comparisons, counters | `references/graphs-and-data.md` |
|
||||
| **Architecture diagram** | System description | Components building up with connections | `references/mobjects.md` |
|
||||
| **Paper explainer** | Research paper | Key findings and methods animated | `references/scene-planning.md` |
|
||||
| **3D visualization** | 3D concept | Rotating surfaces, parametric curves, spatial geometry | `references/camera-and-3d.md` |
|
||||
|
||||
## Stack
|
||||
|
||||
Single Python script per project. No browser, no Node.js, no GPU required.
|
||||
|
||||
| Layer | Tool | Purpose |
|
||||
|-------|------|---------|
|
||||
| Core | Manim Community Edition | Scene rendering, animation engine |
|
||||
| Math | LaTeX (texlive/MiKTeX) | Equation rendering via `MathTex` |
|
||||
| Video I/O | ffmpeg | Scene stitching, format conversion, audio muxing |
|
||||
| TTS | ElevenLabs / Qwen3-TTS (optional) | Narration voiceover |
|
||||
|
||||
## Pipeline
|
||||
|
||||
```
|
||||
PLAN --> CODE --> RENDER --> STITCH --> AUDIO (optional) --> REVIEW
|
||||
```
|
||||
|
||||
1. **PLAN** — Write `plan.md` with narrative arc, scene list, visual elements, color palette, voiceover script
|
||||
2. **CODE** — Write `script.py` with one class per scene, each independently renderable
|
||||
3. **RENDER** — `manim -ql script.py Scene1 Scene2 ...` for draft, `-qh` for production
|
||||
4. **STITCH** — ffmpeg concat of scene clips into `final.mp4`
|
||||
5. **AUDIO** (optional) — Add voiceover and/or background music via ffmpeg. See `references/rendering.md`
|
||||
6. **REVIEW** — Render preview stills, verify against plan, adjust
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
project-name/
|
||||
plan.md # Narrative arc, scene breakdown
|
||||
script.py # All scenes in one file
|
||||
concat.txt # ffmpeg scene list
|
||||
final.mp4 # Stitched output
|
||||
media/ # Auto-generated by Manim
|
||||
videos/script/480p15/
|
||||
```
|
||||
|
||||
## Creative Direction
|
||||
|
||||
### Color Palettes
|
||||
|
||||
| Palette | Background | Primary | Secondary | Accent | Use case |
|
||||
|---------|-----------|---------|-----------|--------|----------|
|
||||
| **Classic 3B1B** | `#1C1C1C` | `#58C4DD` (BLUE) | `#83C167` (GREEN) | `#FFFF00` (YELLOW) | General math/CS |
|
||||
| **Warm academic** | `#2D2B55` | `#FF6B6B` | `#FFD93D` | `#6BCB77` | Approachable |
|
||||
| **Neon tech** | `#0A0A0A` | `#00F5FF` | `#FF00FF` | `#39FF14` | Systems, architecture |
|
||||
| **Monochrome** | `#1A1A2E` | `#EAEAEA` | `#888888` | `#FFFFFF` | Minimalist |
|
||||
|
||||
### Animation Speed
|
||||
|
||||
| Context | run_time | self.wait() after |
|
||||
|---------|----------|-------------------|
|
||||
| Title/intro appear | 1.5s | 1.0s |
|
||||
| Key equation reveal | 2.0s | 2.0s |
|
||||
| Transform/morph | 1.5s | 1.5s |
|
||||
| Supporting label | 0.8s | 0.5s |
|
||||
| FadeOut cleanup | 0.5s | 0.3s |
|
||||
| "Aha moment" reveal | 2.5s | 3.0s |
|
||||
|
||||
### Typography Scale
|
||||
|
||||
| Role | Font size | Usage |
|
||||
|------|-----------|-------|
|
||||
| Title | 48 | Scene titles, opening text |
|
||||
| Heading | 36 | Section headers within a scene |
|
||||
| Body | 30 | Explanatory text |
|
||||
| Label | 24 | Annotations, axis labels |
|
||||
| Caption | 20 | Subtitles, fine print |
|
||||
|
||||
### Fonts
|
||||
|
||||
**Use monospace fonts for all text.** Manim's Pango renderer produces broken kerning with proportional fonts at all sizes. See `references/visual-design.md` for full recommendations.
|
||||
|
||||
```python
|
||||
MONO = "Menlo" # define once at top of file
|
||||
|
||||
Text("Fourier Series", font_size=48, font=MONO, weight=BOLD) # titles
|
||||
Text("n=1: sin(x)", font_size=20, font=MONO) # labels
|
||||
MathTex(r"\nabla L") # math (uses LaTeX)
|
||||
```
|
||||
|
||||
Minimum `font_size=18` for readability.
|
||||
|
||||
### Per-Scene Variation
|
||||
|
||||
Never use identical config for all scenes. For each scene:
|
||||
- **Different dominant color** from the palette
|
||||
- **Different layout** — don't always center everything
|
||||
- **Different animation entry** — vary between Write, FadeIn, GrowFromCenter, Create
|
||||
- **Different visual weight** — some scenes dense, others sparse
|
||||
|
||||
## Workflow
|
||||
|
||||
### Step 1: Plan (plan.md)
|
||||
|
||||
Before any code, write `plan.md`. See `references/scene-planning.md` for the comprehensive template.
|
||||
|
||||
### Step 2: Code (script.py)
|
||||
|
||||
One class per scene. Every scene is independently renderable.
|
||||
|
||||
```python
|
||||
from manim import *
|
||||
|
||||
BG = "#1C1C1C"
|
||||
PRIMARY = "#58C4DD"
|
||||
SECONDARY = "#83C167"
|
||||
ACCENT = "#FFFF00"
|
||||
MONO = "Menlo"
|
||||
|
||||
class Scene1_Introduction(Scene):
|
||||
def construct(self):
|
||||
self.camera.background_color = BG
|
||||
title = Text("Why Does This Work?", font_size=48, color=PRIMARY, weight=BOLD, font=MONO)
|
||||
self.add_subcaption("Why does this work?", duration=2)
|
||||
self.play(Write(title), run_time=1.5)
|
||||
self.wait(1.0)
|
||||
self.play(FadeOut(title), run_time=0.5)
|
||||
```
|
||||
|
||||
Key patterns:
|
||||
- **Subtitles** on every animation: `self.add_subcaption("text", duration=N)` or `subcaption="text"` on `self.play()`
|
||||
- **Shared color constants** at file top for cross-scene consistency
|
||||
- **`self.camera.background_color`** set in every scene
|
||||
- **Clean exits** — FadeOut all mobjects at scene end: `self.play(FadeOut(Group(*self.mobjects)))`
|
||||
|
||||
### Step 3: Render
|
||||
|
||||
```bash
|
||||
manim -ql script.py Scene1_Introduction Scene2_CoreConcept # draft
|
||||
manim -qh script.py Scene1_Introduction Scene2_CoreConcept # production
|
||||
```
|
||||
|
||||
### Step 4: Stitch
|
||||
|
||||
```bash
|
||||
cat > concat.txt << 'EOF'
|
||||
file 'media/videos/script/480p15/Scene1_Introduction.mp4'
|
||||
file 'media/videos/script/480p15/Scene2_CoreConcept.mp4'
|
||||
EOF
|
||||
ffmpeg -y -f concat -safe 0 -i concat.txt -c copy final.mp4
|
||||
```
|
||||
|
||||
### Step 5: Review
|
||||
|
||||
```bash
|
||||
manim -ql --format=png -s script.py Scene2_CoreConcept # preview still
|
||||
```
|
||||
|
||||
## Critical Implementation Notes
|
||||
|
||||
### Raw Strings for LaTeX
|
||||
```python
|
||||
# WRONG: MathTex("\frac{1}{2}")
|
||||
# RIGHT:
|
||||
MathTex(r"\frac{1}{2}")
|
||||
```
|
||||
|
||||
### buff >= 0.5 for Edge Text
|
||||
```python
|
||||
label.to_edge(DOWN, buff=0.5) # never < 0.5
|
||||
```
|
||||
|
||||
### FadeOut Before Replacing Text
|
||||
```python
|
||||
self.play(ReplacementTransform(note1, note2)) # not Write(note2) on top
|
||||
```
|
||||
|
||||
### Never Animate Non-Added Mobjects
|
||||
```python
|
||||
self.play(Create(circle)) # must add first
|
||||
self.play(circle.animate.set_color(RED)) # then animate
|
||||
```
|
||||
|
||||
## Performance Targets
|
||||
|
||||
| Quality | Resolution | FPS | Speed |
|
||||
|---------|-----------|-----|-------|
|
||||
| `-ql` (draft) | 854x480 | 15 | 5-15s/scene |
|
||||
| `-qm` (medium) | 1280x720 | 30 | 15-60s/scene |
|
||||
| `-qh` (production) | 1920x1080 | 60 | 30-120s/scene |
|
||||
|
||||
Always iterate at `-ql`. Only render `-qh` for final output.
|
||||
|
||||
## References
|
||||
|
||||
| File | Contents |
|
||||
|------|----------|
|
||||
| `references/animations.md` | Core animations, rate functions, composition, `.animate` syntax, timing patterns |
|
||||
| `references/mobjects.md` | Text, shapes, VGroup/Group, positioning, styling, custom mobjects |
|
||||
| `references/visual-design.md` | 12 design principles, opacity layering, layout templates, color palettes |
|
||||
| `references/equations.md` | LaTeX in Manim, TransformMatchingTex, derivation patterns |
|
||||
| `references/graphs-and-data.md` | Axes, plotting, BarChart, animated data, algorithm visualization |
|
||||
| `references/camera-and-3d.md` | MovingCameraScene, ThreeDScene, 3D surfaces, camera control |
|
||||
| `references/scene-planning.md` | Narrative arcs, layout templates, scene transitions, planning template |
|
||||
| `references/rendering.md` | CLI reference, quality presets, ffmpeg, voiceover workflow, GIF export |
|
||||
| `references/troubleshooting.md` | LaTeX errors, animation errors, common mistakes, debugging |
|
||||
| `references/animation-design-thinking.md` | When to animate vs show static, decomposition, pacing, narration sync |
|
||||
| `references/updaters-and-trackers.md` | ValueTracker, add_updater, always_redraw, time-based updaters, patterns |
|
||||
| `references/paper-explainer.md` | Turning research papers into animations — workflow, templates, domain patterns |
|
||||
| `references/decorations.md` | SurroundingRectangle, Brace, arrows, DashedLine, Angle, annotation lifecycle |
|
||||
| `references/production-quality.md` | Pre-code, pre-render, post-render checklists, spatial layout, color, tempo |
|
||||
|
||||
---
|
||||
|
||||
## Creative Divergence (use only when user requests experimental/creative/unique output)
|
||||
|
||||
If the user asks for creative, experimental, or unconventional explanatory approaches, select a strategy and reason through it BEFORE designing the animation.
|
||||
|
||||
- **SCAMPER** — when the user wants a fresh take on a standard explanation
|
||||
- **Assumption Reversal** — when the user wants to challenge how something is typically taught
|
||||
|
||||
### SCAMPER Transformation
|
||||
Take a standard mathematical/technical visualization and transform it:
|
||||
- **Substitute**: replace the standard visual metaphor (number line → winding path, matrix → city grid)
|
||||
- **Combine**: merge two explanation approaches (algebraic + geometric simultaneously)
|
||||
- **Reverse**: derive backward — start from the result and deconstruct to axioms
|
||||
- **Modify**: exaggerate a parameter to show why it matters (10x the learning rate, 1000x the sample size)
|
||||
- **Eliminate**: remove all notation — explain purely through animation and spatial relationships
|
||||
|
||||
### Assumption Reversal
|
||||
1. List what's "standard" about how this topic is visualized (left-to-right, 2D, discrete steps, formal notation)
|
||||
2. Pick the most fundamental assumption
|
||||
3. Reverse it (right-to-left derivation, 3D embedding of a 2D concept, continuous morphing instead of steps, zero notation)
|
||||
4. Explore what the reversal reveals that the standard approach hides
|
||||
@@ -0,0 +1,565 @@
|
||||
---
|
||||
title: "P5Js — Production pipeline for interactive and generative visual art using p5"
|
||||
sidebar_label: "P5Js"
|
||||
description: "Production pipeline for interactive and generative visual art using p5"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# P5Js
|
||||
|
||||
Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export. Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/p5js` |
|
||||
| Version | `1.0.0` |
|
||||
| Tags | `creative-coding`, `generative-art`, `p5js`, `canvas`, `interactive`, `visualization`, `webgl`, `shaders`, `animation` |
|
||||
| Related skills | [`ascii-video`](/docs/user-guide/skills/bundled/creative/creative-ascii-video), [`manim-video`](/docs/user-guide/skills/bundled/creative/creative-manim-video), [`excalidraw`](/docs/user-guide/skills/bundled/creative/creative-excalidraw) |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# p5.js Production Pipeline
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is visual art rendered in the browser. The canvas is the medium; the algorithm is the brush.
|
||||
|
||||
**Before writing a single line of code**, articulate the creative concept. What does this piece communicate? What makes the viewer stop scrolling? What separates this from a code tutorial example? The user's prompt is a starting point — interpret it with creative ambition.
|
||||
|
||||
**First-render excellence is non-negotiable.** The output must be visually striking on first load. If it looks like a p5.js tutorial exercise, a default configuration, or "AI-generated creative coding," it is wrong. Rethink before shipping.
|
||||
|
||||
**Go beyond the reference vocabulary.** The noise functions, particle systems, color palettes, and shader effects in the references are a starting vocabulary. For every project, combine, layer, and invent. The catalog is a palette of paints — you write the painting.
|
||||
|
||||
**Be proactively creative.** If the user asks for "a particle system," deliver a particle system with emergent flocking behavior, trailing ghost echoes, palette-shifted depth fog, and a background noise field that breathes. Include at least one visual detail the user didn't ask for but will appreciate.
|
||||
|
||||
**Dense, layered, considered.** Every frame should reward viewing. Never flat white backgrounds. Always compositional hierarchy. Always intentional color. Always micro-detail that only appears on close inspection.
|
||||
|
||||
**Cohesive aesthetic over feature count.** All elements must serve a unified visual language — shared color temperature, consistent stroke weight vocabulary, harmonious motion speeds. A sketch with ten unrelated effects is worse than one with three that belong together.
|
||||
|
||||
## Modes
|
||||
|
||||
| Mode | Input | Output | Reference |
|
||||
|------|-------|--------|-----------|
|
||||
| **Generative art** | Seed / parameters | Procedural visual composition (still or animated) | `references/visual-effects.md` |
|
||||
| **Data visualization** | Dataset / API | Interactive charts, graphs, custom data displays | `references/interaction.md` |
|
||||
| **Interactive experience** | None (user drives) | Mouse/keyboard/touch-driven sketch | `references/interaction.md` |
|
||||
| **Animation / motion graphics** | Timeline / storyboard | Timed sequences, kinetic typography, transitions | `references/animation.md` |
|
||||
| **3D scene** | Concept description | WebGL geometry, lighting, camera, materials | `references/webgl-and-3d.md` |
|
||||
| **Image processing** | Image file(s) | Pixel manipulation, filters, mosaic, pointillism | `references/visual-effects.md` § Pixel Manipulation |
|
||||
| **Audio-reactive** | Audio file / mic | Sound-driven generative visuals | `references/interaction.md` § Audio Input |
|
||||
|
||||
## Stack
|
||||
|
||||
Single self-contained HTML file per project. No build step required.
|
||||
|
||||
| Layer | Tool | Purpose |
|
||||
|-------|------|---------|
|
||||
| Core | p5.js 1.11.3 (CDN) | Canvas rendering, math, transforms, event handling |
|
||||
| 3D | p5.js WebGL mode | 3D geometry, camera, lighting, GLSL shaders |
|
||||
| Audio | p5.sound.js (CDN) | FFT analysis, amplitude, mic input, oscillators |
|
||||
| Export | Built-in `saveCanvas()` / `saveGif()` / `saveFrames()` | PNG, GIF, frame sequence output |
|
||||
| Capture | CCapture.js (optional) | Deterministic framerate video capture (WebM, GIF) |
|
||||
| Headless | Puppeteer + Node.js (optional) | Automated high-res rendering, MP4 via ffmpeg |
|
||||
| SVG | p5.js-svg 1.6.0 (optional) | Vector output for print — requires p5.js 1.x |
|
||||
| Natural media | p5.brush (optional) | Watercolor, charcoal, pen — requires p5.js 2.x + WEBGL |
|
||||
| Texture | p5.grain (optional) | Film grain, texture overlays |
|
||||
| Fonts | Google Fonts / `loadFont()` | Custom typography via OTF/TTF/WOFF2 |
|
||||
|
||||
### Version Note
|
||||
|
||||
**p5.js 1.x** (1.11.3) is the default — stable, well-documented, broadest library compatibility. Use this unless a project requires 2.x features.
|
||||
|
||||
**p5.js 2.x** (2.2+) adds: `async setup()` replacing `preload()`, OKLCH/OKLAB color modes, `splineVertex()`, shader `.modify()` API, variable fonts, `textToContours()`, pointer events. Required for p5.brush. See `references/core-api.md` § p5.js 2.0.
|
||||
|
||||
## Pipeline
|
||||
|
||||
Every project follows the same 6-stage path:
|
||||
|
||||
```
|
||||
CONCEPT → DESIGN → CODE → PREVIEW → EXPORT → VERIFY
|
||||
```
|
||||
|
||||
1. **CONCEPT** — Articulate the creative vision: mood, color world, motion vocabulary, what makes this unique
|
||||
2. **DESIGN** — Choose mode, canvas size, interaction model, color system, export format. Map concept to technical decisions
|
||||
3. **CODE** — Write single HTML file with inline p5.js. Structure: globals → `preload()` → `setup()` → `draw()` → helpers → classes → event handlers
|
||||
4. **PREVIEW** — Open in browser, verify visual quality. Test at target resolution. Check performance
|
||||
5. **EXPORT** — Capture output: `saveCanvas()` for PNG, `saveGif()` for GIF, `saveFrames()` + ffmpeg for MP4, Puppeteer for headless batch
|
||||
6. **VERIFY** — Does the output match the concept? Is it visually striking at the intended display size? Would you frame it?
|
||||
|
||||
## Creative Direction
|
||||
|
||||
### Aesthetic Dimensions
|
||||
|
||||
| Dimension | Options | Reference |
|
||||
|-----------|---------|-----------|
|
||||
| **Color system** | HSB/HSL, RGB, named palettes, procedural harmony, gradient interpolation | `references/color-systems.md` |
|
||||
| **Noise vocabulary** | Perlin noise, simplex, fractal (octaved), domain warping, curl noise | `references/visual-effects.md` § Noise |
|
||||
| **Particle systems** | Physics-based, flocking, trail-drawing, attractor-driven, flow-field following | `references/visual-effects.md` § Particles |
|
||||
| **Shape language** | Geometric primitives, custom vertices, bezier curves, SVG paths | `references/shapes-and-geometry.md` |
|
||||
| **Motion style** | Eased, spring-based, noise-driven, physics sim, lerped, stepped | `references/animation.md` |
|
||||
| **Typography** | System fonts, loaded OTF, `textToPoints()` particle text, kinetic | `references/typography.md` |
|
||||
| **Shader effects** | GLSL fragment/vertex, filter shaders, post-processing, feedback loops | `references/webgl-and-3d.md` § Shaders |
|
||||
| **Composition** | Grid, radial, golden ratio, rule of thirds, organic scatter, tiled | `references/core-api.md` § Composition |
|
||||
| **Interaction model** | Mouse follow, click spawn, drag, keyboard state, scroll-driven, mic input | `references/interaction.md` |
|
||||
| **Blend modes** | `BLEND`, `ADD`, `MULTIPLY`, `SCREEN`, `DIFFERENCE`, `EXCLUSION`, `OVERLAY` | `references/color-systems.md` § Blend Modes |
|
||||
| **Layering** | `createGraphics()` offscreen buffers, alpha compositing, masking | `references/core-api.md` § Offscreen Buffers |
|
||||
| **Texture** | Perlin surface, stippling, hatching, halftone, pixel sorting | `references/visual-effects.md` § Texture Generation |
|
||||
|
||||
### Per-Project Variation Rules
|
||||
|
||||
Never use default configurations. For every project:
|
||||
- **Custom color palette** — never raw `fill(255, 0, 0)`. Always a designed palette with 3-7 colors
|
||||
- **Custom stroke weight vocabulary** — thin accents (0.5), medium structure (1-2), bold emphasis (3-5)
|
||||
- **Background treatment** — never plain `background(0)` or `background(255)`. Always textured, gradient, or layered
|
||||
- **Motion variety** — different speeds for different elements. Primary at 1x, secondary at 0.3x, ambient at 0.1x
|
||||
- **At least one invented element** — a custom particle behavior, a novel noise application, a unique interaction response
|
||||
|
||||
### Project-Specific Invention
|
||||
|
||||
For every project, invent at least one of:
|
||||
- A custom color palette matching the mood (not a preset)
|
||||
- A novel noise field combination (e.g., curl noise + domain warp + feedback)
|
||||
- A unique particle behavior (custom forces, custom trails, custom spawning)
|
||||
- An interaction mechanic the user didn't request but that elevates the piece
|
||||
- A compositional technique that creates visual hierarchy
|
||||
|
||||
### Parameter Design Philosophy
|
||||
|
||||
Parameters should emerge from the algorithm, not from a generic menu. Ask: "What properties of *this* system should be tunable?"
|
||||
|
||||
**Good parameters** expose the algorithm's character:
|
||||
- **Quantities** — how many particles, branches, cells (controls density)
|
||||
- **Scales** — noise frequency, element size, spacing (controls texture)
|
||||
- **Rates** — speed, growth rate, decay (controls energy)
|
||||
- **Thresholds** — when does behavior change? (controls drama)
|
||||
- **Ratios** — proportions, balance between forces (controls harmony)
|
||||
|
||||
**Bad parameters** are generic controls unrelated to the algorithm:
|
||||
- "color1", "color2", "size" — meaningless without context
|
||||
- Toggle switches for unrelated effects
|
||||
- Parameters that only change cosmetics, not behavior
|
||||
|
||||
Every parameter should change how the algorithm *thinks*, not just how it *looks*. A "turbulence" parameter that changes noise octaves is good. A "particle size" slider that only changes `ellipse()` radius is shallow.
|
||||
|
||||
## Workflow
|
||||
|
||||
### Step 1: Creative Vision
|
||||
|
||||
Before any code, articulate:
|
||||
|
||||
- **Mood / atmosphere**: What should the viewer feel? Contemplative? Energized? Unsettled? Playful?
|
||||
- **Visual story**: What happens over time (or on interaction)? Build? Decay? Transform? Oscillate?
|
||||
- **Color world**: Warm/cool? Monochrome? Complementary? What's the dominant hue? The accent?
|
||||
- **Shape language**: Organic curves? Sharp geometry? Dots? Lines? Mixed?
|
||||
- **Motion vocabulary**: Slow drift? Explosive burst? Breathing pulse? Mechanical precision?
|
||||
- **What makes THIS different**: What is the one thing that makes this sketch unique?
|
||||
|
||||
Map the user's prompt to aesthetic choices. "Relaxing generative background" demands different everything from "glitch data visualization."
|
||||
|
||||
### Step 2: Technical Design
|
||||
|
||||
- **Mode** — which of the 7 modes from the table above
|
||||
- **Canvas size** — landscape 1920x1080, portrait 1080x1920, square 1080x1080, or responsive `windowWidth/windowHeight`
|
||||
- **Renderer** — `P2D` (default) or `WEBGL` (for 3D, shaders, advanced blend modes)
|
||||
- **Frame rate** — 60fps (interactive), 30fps (ambient animation), or `noLoop()` (static generative)
|
||||
- **Export target** — browser display, PNG still, GIF loop, MP4 video, SVG vector
|
||||
- **Interaction model** — passive (no input), mouse-driven, keyboard-driven, audio-reactive, scroll-driven
|
||||
- **Viewer UI** — for interactive generative art, start from `templates/viewer.html` which provides seed navigation, parameter sliders, and download. For simple sketches or video export, use bare HTML
|
||||
|
||||
### Step 3: Code the Sketch
|
||||
|
||||
For **interactive generative art** (seed exploration, parameter tuning): start from `templates/viewer.html`. Read the template first, keep the fixed sections (seed nav, actions), replace the algorithm and parameter controls. This gives the user seed prev/next/random/jump, parameter sliders with live update, and PNG download — all wired up.
|
||||
|
||||
For **animations, video export, or simple sketches**: use bare HTML:
|
||||
|
||||
Single HTML file. Structure:
|
||||
|
||||
```html
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Project Name</title>
|
||||
<script>p5.disableFriendlyErrors = true;</script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.11.3/p5.min.js"></script>
|
||||
<!-- <script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.11.3/addons/p5.sound.min.js"></script> -->
|
||||
<!-- <script src="https://unpkg.com/p5.js-svg@1.6.0"></script> --> <!-- SVG export -->
|
||||
<!-- <script src="https://cdn.jsdelivr.net/npm/ccapture.js-npmfixed/build/CCapture.all.min.js"></script> --> <!-- video capture -->
|
||||
<style>
|
||||
html, body { margin: 0; padding: 0; overflow: hidden; }
|
||||
canvas { display: block; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<script>
|
||||
// === Configuration ===
|
||||
const CONFIG = {
|
||||
seed: 42,
|
||||
// ... project-specific params
|
||||
};
|
||||
|
||||
// === Color Palette ===
|
||||
const PALETTE = {
|
||||
bg: '#0a0a0f',
|
||||
primary: '#e8d5b7',
|
||||
// ...
|
||||
};
|
||||
|
||||
// === Global State ===
|
||||
let particles = [];
|
||||
|
||||
// === Preload (fonts, images, data) ===
|
||||
function preload() {
|
||||
// font = loadFont('...');
|
||||
}
|
||||
|
||||
// === Setup ===
|
||||
function setup() {
|
||||
createCanvas(1920, 1080);
|
||||
randomSeed(CONFIG.seed);
|
||||
noiseSeed(CONFIG.seed);
|
||||
colorMode(HSB, 360, 100, 100, 100);
|
||||
// Initialize state...
|
||||
}
|
||||
|
||||
// === Draw Loop ===
|
||||
function draw() {
|
||||
// Render frame...
|
||||
}
|
||||
|
||||
// === Helper Functions ===
|
||||
// ...
|
||||
|
||||
// === Classes ===
|
||||
class Particle {
|
||||
// ...
|
||||
}
|
||||
|
||||
// === Event Handlers ===
|
||||
function mousePressed() { /* ... */ }
|
||||
function keyPressed() { /* ... */ }
|
||||
function windowResized() { resizeCanvas(windowWidth, windowHeight); }
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
```
|
||||
|
||||
Key implementation patterns:
|
||||
- **Seeded randomness**: Always `randomSeed()` + `noiseSeed()` for reproducibility
|
||||
- **Color mode**: Use `colorMode(HSB, 360, 100, 100, 100)` for intuitive color control
|
||||
- **State separation**: CONFIG for parameters, PALETTE for colors, globals for mutable state
|
||||
- **Class-based entities**: Particles, agents, shapes as classes with `update()` + `display()` methods
|
||||
- **Offscreen buffers**: `createGraphics()` for layered composition, trails, masks
|
||||
|
||||
### Step 4: Preview & Iterate
|
||||
|
||||
- Open HTML file directly in browser — no server needed for basic sketches
|
||||
- For `loadImage()`/`loadFont()` from local files: use `scripts/serve.sh` or `python3 -m http.server`
|
||||
- Chrome DevTools Performance tab to verify 60fps
|
||||
- Test at target export resolution, not just the window size
|
||||
- Adjust parameters until the visual matches the concept from Step 1
|
||||
|
||||
### Step 5: Export
|
||||
|
||||
| Format | Method | Command |
|
||||
|--------|--------|---------|
|
||||
| **PNG** | `saveCanvas('output', 'png')` in `keyPressed()` | Press 's' to save |
|
||||
| **High-res PNG** | Puppeteer headless capture | `node scripts/export-frames.js sketch.html --width 3840 --height 2160 --frames 1` |
|
||||
| **GIF** | `saveGif('output', 5)` — captures N seconds | Press 'g' to save |
|
||||
| **Frame sequence** | `saveFrames('frame', 'png', 10, 30)` — 10s at 30fps | Then `ffmpeg -i frame-%04d.png -c:v libx264 output.mp4` |
|
||||
| **MP4** | Puppeteer frame capture + ffmpeg | `bash scripts/render.sh sketch.html output.mp4 --duration 30 --fps 30` |
|
||||
| **SVG** | `createCanvas(w, h, SVG)` with p5.js-svg | `save('output.svg')` |
|
||||
|
||||
### Step 6: Quality Verification
|
||||
|
||||
- **Does it match the vision?** Compare output to the creative concept. If it looks generic, go back to Step 1
|
||||
- **Resolution check**: Is it sharp at the target display size? No aliasing artifacts?
|
||||
- **Performance check**: Does it hold 60fps in browser? (30fps minimum for animations)
|
||||
- **Color check**: Do the colors work together? Test on both light and dark monitors
|
||||
- **Edge cases**: What happens at canvas edges? On resize? After running for 10 minutes?
|
||||
|
||||
## Critical Implementation Notes
|
||||
|
||||
### Performance — Disable FES First
|
||||
|
||||
The Friendly Error System (FES) adds up to 10x overhead. Disable it in every production sketch:
|
||||
|
||||
```javascript
|
||||
p5.disableFriendlyErrors = true; // BEFORE setup()
|
||||
|
||||
function setup() {
|
||||
pixelDensity(1); // prevent 2x-4x overdraw on retina
|
||||
createCanvas(1920, 1080);
|
||||
}
|
||||
```
|
||||
|
||||
In hot loops (particles, pixel ops), use `Math.*` instead of p5 wrappers — measurably faster:
|
||||
|
||||
```javascript
|
||||
// In draw() or update() hot paths:
|
||||
let a = Math.sin(t); // not sin(t)
|
||||
let r = Math.sqrt(dx*dx+dy*dy); // not dist() — or better: skip sqrt, compare magSq
|
||||
let v = Math.random(); // not random() — when seed not needed
|
||||
let m = Math.min(a, b); // not min(a, b)
|
||||
```
|
||||
|
||||
Never `console.log()` inside `draw()`. Never manipulate DOM in `draw()`. See `references/troubleshooting.md` § Performance.
|
||||
|
||||
### Seeded Randomness — Always
|
||||
|
||||
Every generative sketch must be reproducible. Same seed, same output.
|
||||
|
||||
```javascript
|
||||
function setup() {
|
||||
randomSeed(CONFIG.seed);
|
||||
noiseSeed(CONFIG.seed);
|
||||
// All random() and noise() calls now deterministic
|
||||
}
|
||||
```
|
||||
|
||||
Never use `Math.random()` for generative content — only for performance-critical non-visual code. Always `random()` for visual elements. If you need a random seed: `CONFIG.seed = floor(random(99999))`.
|
||||
|
||||
### Generative Art Platform Support (fxhash / Art Blocks)
|
||||
|
||||
For generative art platforms, replace p5's PRNG with the platform's deterministic random:
|
||||
|
||||
```javascript
|
||||
// fxhash convention
|
||||
const SEED = $fx.hash; // unique per mint
|
||||
const rng = $fx.rand; // deterministic PRNG
|
||||
$fx.features({ palette: 'warm', complexity: 'high' });
|
||||
|
||||
// In setup():
|
||||
randomSeed(SEED); // for p5's noise()
|
||||
noiseSeed(SEED);
|
||||
|
||||
// Replace random() with rng() for platform determinism
|
||||
let x = rng() * width; // instead of random(width)
|
||||
```
|
||||
|
||||
See `references/export-pipeline.md` § Platform Export.
|
||||
|
||||
### Color Mode — Use HSB
|
||||
|
||||
HSB (Hue, Saturation, Brightness) is dramatically easier to work with than RGB for generative art:
|
||||
|
||||
```javascript
|
||||
colorMode(HSB, 360, 100, 100, 100);
|
||||
// Now: fill(hue, sat, bri, alpha)
|
||||
// Rotate hue: fill((baseHue + offset) % 360, 80, 90)
|
||||
// Desaturate: fill(hue, sat * 0.3, bri)
|
||||
// Darken: fill(hue, sat, bri * 0.5)
|
||||
```
|
||||
|
||||
Never hardcode raw RGB values. Define a palette object, derive variations procedurally. See `references/color-systems.md`.
|
||||
|
||||
### Noise — Multi-Octave, Not Raw
|
||||
|
||||
Raw `noise(x, y)` looks like smooth blobs. Layer octaves for natural texture:
|
||||
|
||||
```javascript
|
||||
function fbm(x, y, octaves = 4) {
|
||||
let val = 0, amp = 1, freq = 1, sum = 0;
|
||||
for (let i = 0; i < octaves; i++) {
|
||||
val += noise(x * freq, y * freq) * amp;
|
||||
sum += amp;
|
||||
amp *= 0.5;
|
||||
freq *= 2;
|
||||
}
|
||||
return val / sum;
|
||||
}
|
||||
```
|
||||
|
||||
For flowing organic forms, use **domain warping**: feed noise output back as noise input coordinates. See `references/visual-effects.md`.
|
||||
|
||||
### createGraphics() for Layers — Not Optional
|
||||
|
||||
Flat single-pass rendering looks flat. Use offscreen buffers for composition:
|
||||
|
||||
```javascript
|
||||
let bgLayer, fgLayer, trailLayer;
|
||||
function setup() {
|
||||
createCanvas(1920, 1080);
|
||||
bgLayer = createGraphics(width, height);
|
||||
fgLayer = createGraphics(width, height);
|
||||
trailLayer = createGraphics(width, height);
|
||||
}
|
||||
function draw() {
|
||||
renderBackground(bgLayer);
|
||||
renderTrails(trailLayer); // persistent, fading
|
||||
renderForeground(fgLayer); // cleared each frame
|
||||
image(bgLayer, 0, 0);
|
||||
image(trailLayer, 0, 0);
|
||||
image(fgLayer, 0, 0);
|
||||
}
|
||||
```
|
||||
|
||||
### Performance — Vectorize Where Possible
|
||||
|
||||
p5.js draw calls are expensive. For thousands of particles:
|
||||
|
||||
```javascript
|
||||
// SLOW: individual shapes
|
||||
for (let p of particles) {
|
||||
ellipse(p.x, p.y, p.size);
|
||||
}
|
||||
|
||||
// FAST: single shape with beginShape()
|
||||
beginShape(POINTS);
|
||||
for (let p of particles) {
|
||||
vertex(p.x, p.y);
|
||||
}
|
||||
endShape();
|
||||
|
||||
// FASTEST: pixel buffer for massive counts
|
||||
loadPixels();
|
||||
for (let p of particles) {
|
||||
let idx = 4 * (floor(p.y) * width + floor(p.x));
|
||||
pixels[idx] = r; pixels[idx+1] = g; pixels[idx+2] = b; pixels[idx+3] = 255;
|
||||
}
|
||||
updatePixels();
|
||||
```
|
||||
|
||||
See `references/troubleshooting.md` § Performance.
|
||||
|
||||
### Instance Mode for Multiple Sketches
|
||||
|
||||
Global mode pollutes `window`. For production, use instance mode:
|
||||
|
||||
```javascript
|
||||
const sketch = (p) => {
|
||||
p.setup = function() {
|
||||
p.createCanvas(800, 800);
|
||||
};
|
||||
p.draw = function() {
|
||||
p.background(0);
|
||||
p.ellipse(p.mouseX, p.mouseY, 50);
|
||||
};
|
||||
};
|
||||
new p5(sketch, 'canvas-container');
|
||||
```
|
||||
|
||||
Required when embedding multiple sketches on one page or integrating with frameworks.
|
||||
|
||||
### WebGL Mode Gotchas
|
||||
|
||||
- `createCanvas(w, h, WEBGL)` — origin is center, not top-left
|
||||
- Y-axis is inverted (positive Y goes up in WEBGL, down in P2D)
|
||||
- `translate(-width/2, -height/2)` to get P2D-like coordinates
|
||||
- `push()`/`pop()` around every transform — matrix stack overflows silently
|
||||
- `texture()` before `rect()`/`plane()` — not after
|
||||
- Custom shaders: `createShader(vert, frag)` — test on multiple browsers
|
||||
|
||||
### Export — Key Bindings Convention
|
||||
|
||||
Every sketch should include these in `keyPressed()`:
|
||||
|
||||
```javascript
|
||||
function keyPressed() {
|
||||
if (key === 's' || key === 'S') saveCanvas('output', 'png');
|
||||
if (key === 'g' || key === 'G') saveGif('output', 5);
|
||||
if (key === 'r' || key === 'R') { randomSeed(millis()); noiseSeed(millis()); }
|
||||
if (key === ' ') CONFIG.paused = !CONFIG.paused;
|
||||
}
|
||||
```
|
||||
|
||||
### Headless Video Export — Use noLoop()
|
||||
|
||||
For headless rendering via Puppeteer, the sketch **must** use `noLoop()` in setup. Without it, p5's draw loop runs freely while screenshots are slow — the sketch races ahead and you get skipped/duplicate frames.
|
||||
|
||||
```javascript
|
||||
function setup() {
|
||||
createCanvas(1920, 1080);
|
||||
pixelDensity(1);
|
||||
noLoop(); // capture script controls frame advance
|
||||
window._p5Ready = true; // signal readiness to capture script
|
||||
}
|
||||
```
|
||||
|
||||
The bundled `scripts/export-frames.js` detects `_p5Ready` and calls `redraw()` once per capture for exact 1:1 frame correspondence. See `references/export-pipeline.md` § Deterministic Capture.
|
||||
|
||||
For multi-scene videos, use the per-clip architecture: one HTML per scene, render independently, stitch with `ffmpeg -f concat`. See `references/export-pipeline.md` § Per-Clip Architecture.
|
||||
|
||||
### Agent Workflow
|
||||
|
||||
When building p5.js sketches:
|
||||
|
||||
1. **Write the HTML file** — single self-contained file, all code inline
|
||||
2. **Open in browser** — `open sketch.html` (macOS) or `xdg-open sketch.html` (Linux)
|
||||
3. **Local assets** (fonts, images) require a server: `python3 -m http.server 8080` in the project directory, then open `http://localhost:8080/sketch.html`
|
||||
4. **Export PNG/GIF** — add `keyPressed()` shortcuts as shown above, tell the user which key to press
|
||||
5. **Headless export** — `node scripts/export-frames.js sketch.html --frames 300` for automated frame capture (sketch must use `noLoop()` + `_p5Ready`)
|
||||
6. **MP4 rendering** — `bash scripts/render.sh sketch.html output.mp4 --duration 30`
|
||||
7. **Iterative refinement** — edit the HTML file, user refreshes browser to see changes
|
||||
8. **Load references on demand** — use `skill_view(name="p5js", file_path="references/...")` to load specific reference files as needed during implementation
|
||||
|
||||
## Performance Targets
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Frame rate (interactive) | 60fps sustained |
|
||||
| Frame rate (animated export) | 30fps minimum |
|
||||
| Particle count (P2D shapes) | 5,000-10,000 at 60fps |
|
||||
| Particle count (pixel buffer) | 50,000-100,000 at 60fps |
|
||||
| Canvas resolution | Up to 3840x2160 (export), 1920x1080 (interactive) |
|
||||
| File size (HTML) | < 100KB (excluding CDN libraries) |
|
||||
| Load time | < 2s to first frame |
|
||||
|
||||
## References
|
||||
|
||||
| File | Contents |
|
||||
|------|----------|
|
||||
| `references/core-api.md` | Canvas setup, coordinate system, draw loop, `push()`/`pop()`, offscreen buffers, composition patterns, `pixelDensity()`, responsive design |
|
||||
| `references/shapes-and-geometry.md` | 2D primitives, `beginShape()`/`endShape()`, Bezier/Catmull-Rom curves, `vertex()` systems, custom shapes, `p5.Vector`, signed distance fields, SVG path conversion |
|
||||
| `references/visual-effects.md` | Noise (Perlin, fractal, domain warp, curl), flow fields, particle systems (physics, flocking, trails), pixel manipulation, texture generation (stipple, hatch, halftone), feedback loops, reaction-diffusion |
|
||||
| `references/animation.md` | Frame-based animation, easing functions, `lerp()`/`map()`, spring physics, state machines, timeline sequencing, `millis()`-based timing, transition patterns |
|
||||
| `references/typography.md` | `text()`, `loadFont()`, `textToPoints()`, kinetic typography, text masks, font metrics, responsive text sizing |
|
||||
| `references/color-systems.md` | `colorMode()`, HSB/HSL/RGB, `lerpColor()`, `paletteLerp()`, procedural palettes, color harmony, `blendMode()`, gradient rendering, curated palette library |
|
||||
| `references/webgl-and-3d.md` | WEBGL renderer, 3D primitives, camera, lighting, materials, custom geometry, GLSL shaders (`createShader()`, `createFilterShader()`), framebuffers, post-processing |
|
||||
| `references/interaction.md` | Mouse events, keyboard state, touch input, DOM elements, `createSlider()`/`createButton()`, audio input (p5.sound FFT/amplitude), scroll-driven animation, responsive events |
|
||||
| `references/export-pipeline.md` | `saveCanvas()`, `saveGif()`, `saveFrames()`, deterministic headless capture, ffmpeg frame-to-video, CCapture.js, SVG export, per-clip architecture, platform export (fxhash), video gotchas |
|
||||
| `references/troubleshooting.md` | Performance profiling, per-pixel budgets, common mistakes, browser compatibility, WebGL debugging, font loading issues, pixel density traps, memory leaks, CORS |
|
||||
| `templates/viewer.html` | Interactive viewer template: seed navigation (prev/next/random/jump), parameter sliders, download PNG, responsive canvas. Start from this for explorable generative art |
|
||||
|
||||
---
|
||||
|
||||
## Creative Divergence (use only when user requests experimental/creative/unique output)
|
||||
|
||||
If the user asks for creative, experimental, surprising, or unconventional output, select the strategy that best fits and reason through its steps BEFORE generating code.
|
||||
|
||||
- **Conceptual Blending** — when the user names two things to combine or wants hybrid aesthetics
|
||||
- **SCAMPER** — when the user wants a twist on a known generative art pattern
|
||||
- **Distance Association** — when the user gives a single concept and wants exploration ("make something about time")
|
||||
|
||||
### Conceptual Blending
|
||||
1. Name two distinct visual systems (e.g., particle physics + handwriting)
|
||||
2. Map correspondences (particles = ink drops, forces = pen pressure, fields = letterforms)
|
||||
3. Blend selectively — keep mappings that produce interesting emergent visuals
|
||||
4. Code the blend as a unified system, not two systems side-by-side
|
||||
|
||||
### SCAMPER Transformation
|
||||
Take a known generative pattern (flow field, particle system, L-system, cellular automata) and systematically transform it:
|
||||
- **Substitute**: replace circles with text characters, lines with gradients
|
||||
- **Combine**: merge two patterns (flow field + voronoi)
|
||||
- **Adapt**: apply a 2D pattern to a 3D projection
|
||||
- **Modify**: exaggerate scale, warp the coordinate space
|
||||
- **Purpose**: use a physics sim for typography, a sorting algorithm for color
|
||||
- **Eliminate**: remove the grid, remove color, remove symmetry
|
||||
- **Reverse**: run the simulation backward, invert the parameter space
|
||||
|
||||
### Distance Association
|
||||
1. Anchor on the user's concept (e.g., "loneliness")
|
||||
2. Generate associations at three distances:
|
||||
- Close (obvious): empty room, single figure, silence
|
||||
- Medium (interesting): one fish in a school swimming the wrong way, a phone with no notifications, the gap between subway cars
|
||||
- Far (abstract): prime numbers, asymptotic curves, the color of 3am
|
||||
3. Develop the medium-distance associations — they're specific enough to visualize but unexpected enough to be interesting
|
||||
@@ -0,0 +1,232 @@
|
||||
---
|
||||
title: "Pixel Art — Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc"
|
||||
sidebar_label: "Pixel Art"
|
||||
description: "Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Pixel Art
|
||||
|
||||
Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc.), and animate them into short videos. Presets cover arcade, SNES, and 10+ era-correct looks. Use `clarify` to let the user pick a style before generating.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/pixel-art` |
|
||||
| Version | `2.0.0` |
|
||||
| Author | dodo-reach |
|
||||
| License | MIT |
|
||||
| Tags | `creative`, `pixel-art`, `arcade`, `snes`, `nes`, `gameboy`, `retro`, `image`, `video` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Pixel Art
|
||||
|
||||
Convert any image into retro pixel art, then optionally animate it into a short
|
||||
MP4 or GIF with era-appropriate effects (rain, fireflies, snow, embers).
|
||||
|
||||
Two scripts ship with this skill:
|
||||
|
||||
- `scripts/pixel_art.py` — photo → pixel-art PNG (Floyd-Steinberg dithering)
|
||||
- `scripts/pixel_art_video.py` — pixel-art PNG → animated MP4 (+ optional GIF)
|
||||
|
||||
Each is importable or runnable directly. Presets snap to hardware palettes
|
||||
when you want era-accurate colors (NES, Game Boy, PICO-8, etc.), or use
|
||||
adaptive N-color quantization for arcade/SNES-style looks.
|
||||
|
||||
## When to Use
|
||||
|
||||
- User wants retro pixel art from a source image
|
||||
- User asks for NES / Game Boy / PICO-8 / C64 / arcade / SNES styling
|
||||
- User wants a short looping animation (rain scene, night sky, snow, etc.)
|
||||
- Posters, album covers, social posts, sprites, characters, avatars
|
||||
|
||||
## Workflow
|
||||
|
||||
Before generating, confirm the style with the user. Different presets produce
|
||||
very different outputs and regenerating is costly.
|
||||
|
||||
### Step 1 — Offer a style
|
||||
|
||||
Call `clarify` with 4 representative presets. Pick the set based on what the
|
||||
user asked for — don't just dump all 14.
|
||||
|
||||
Default menu when the user's intent is unclear:
|
||||
|
||||
```python
|
||||
clarify(
|
||||
question="Which pixel-art style do you want?",
|
||||
choices=[
|
||||
"arcade — bold, chunky 80s cabinet feel (16 colors, 8px)",
|
||||
"nes — Nintendo 8-bit hardware palette (54 colors, 8px)",
|
||||
"gameboy — 4-shade green Game Boy DMG",
|
||||
"snes — cleaner 16-bit look (32 colors, 4px)",
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
When the user already named an era (e.g. "80s arcade", "Gameboy"), skip
|
||||
`clarify` and use the matching preset directly.
|
||||
|
||||
### Step 2 — Offer animation (optional)
|
||||
|
||||
If the user asked for a video/GIF, or the output might benefit from motion,
|
||||
ask which scene:
|
||||
|
||||
```python
|
||||
clarify(
|
||||
question="Want to animate it? Pick a scene or skip.",
|
||||
choices=[
|
||||
"night — stars + fireflies + leaves",
|
||||
"urban — rain + neon pulse",
|
||||
"snow — falling snowflakes",
|
||||
"skip — just the image",
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
Do NOT call `clarify` more than twice in a row. One for style, one for scene if
|
||||
animation is on the table. If the user explicitly asked for a specific style
|
||||
and scene in their message, skip `clarify` entirely.
|
||||
|
||||
### Step 3 — Generate
|
||||
|
||||
Run `pixel_art()` first; if animation was requested, chain into
|
||||
`pixel_art_video()` on the result.
|
||||
|
||||
## Preset Catalog
|
||||
|
||||
| Preset | Era | Palette | Block | Best for |
|
||||
|--------|-----|---------|-------|----------|
|
||||
| `arcade` | 80s arcade | adaptive 16 | 8px | Bold posters, hero art |
|
||||
| `snes` | 16-bit | adaptive 32 | 4px | Characters, detailed scenes |
|
||||
| `nes` | 8-bit | NES (54) | 8px | True NES look |
|
||||
| `gameboy` | DMG handheld | 4 green shades | 8px | Monochrome Game Boy |
|
||||
| `gameboy_pocket` | Pocket handheld | 4 grey shades | 8px | Mono GB Pocket |
|
||||
| `pico8` | PICO-8 | 16 fixed | 6px | Fantasy-console look |
|
||||
| `c64` | Commodore 64 | 16 fixed | 8px | 8-bit home computer |
|
||||
| `apple2` | Apple II hi-res | 6 fixed | 10px | Extreme retro, 6 colors |
|
||||
| `teletext` | BBC Teletext | 8 pure | 10px | Chunky primary colors |
|
||||
| `mspaint` | Windows MS Paint | 24 fixed | 8px | Nostalgic desktop |
|
||||
| `mono_green` | CRT phosphor | 2 green | 6px | Terminal/CRT aesthetic |
|
||||
| `mono_amber` | CRT amber | 2 amber | 6px | Amber monitor look |
|
||||
| `neon` | Cyberpunk | 10 neons | 6px | Vaporwave/cyber |
|
||||
| `pastel` | Soft pastel | 10 pastels | 6px | Kawaii / gentle |
|
||||
|
||||
Named palettes live in `scripts/palettes.py` (see `references/palettes.md` for
|
||||
the complete list — 28 named palettes total). Any preset can be overridden:
|
||||
|
||||
```python
|
||||
pixel_art("in.png", "out.png", preset="snes", palette="PICO_8", block=6)
|
||||
```
|
||||
|
||||
## Scene Catalog (for video)
|
||||
|
||||
| Scene | Effects |
|
||||
|-------|---------|
|
||||
| `night` | Twinkling stars + fireflies + drifting leaves |
|
||||
| `dusk` | Fireflies + sparkles |
|
||||
| `tavern` | Dust motes + warm sparkles |
|
||||
| `indoor` | Dust motes |
|
||||
| `urban` | Rain + neon pulse |
|
||||
| `nature` | Leaves + fireflies |
|
||||
| `magic` | Sparkles + fireflies |
|
||||
| `storm` | Rain + lightning |
|
||||
| `underwater` | Bubbles + light sparkles |
|
||||
| `fire` | Embers + sparkles |
|
||||
| `snow` | Snowflakes + sparkles |
|
||||
| `desert` | Heat shimmer + dust |
|
||||
|
||||
## Invocation Patterns
|
||||
|
||||
### Python (import)
|
||||
|
||||
```python
|
||||
import sys
|
||||
sys.path.insert(0, "/home/teknium/.hermes/skills/creative/pixel-art/scripts")
|
||||
from pixel_art import pixel_art
|
||||
from pixel_art_video import pixel_art_video
|
||||
|
||||
# 1. Convert to pixel art
|
||||
pixel_art("/path/to/photo.jpg", "/tmp/pixel.png", preset="nes")
|
||||
|
||||
# 2. Animate (optional)
|
||||
pixel_art_video(
|
||||
"/tmp/pixel.png",
|
||||
"/tmp/pixel.mp4",
|
||||
scene="night",
|
||||
duration=6,
|
||||
fps=15,
|
||||
seed=42,
|
||||
export_gif=True,
|
||||
)
|
||||
```
|
||||
|
||||
### CLI
|
||||
|
||||
```bash
|
||||
cd /home/teknium/.hermes/skills/creative/pixel-art/scripts
|
||||
|
||||
python pixel_art.py in.jpg out.png --preset gameboy
|
||||
python pixel_art.py in.jpg out.png --preset snes --palette PICO_8 --block 6
|
||||
|
||||
python pixel_art_video.py out.png out.mp4 --scene night --duration 6 --gif
|
||||
```
|
||||
|
||||
## Pipeline Rationale
|
||||
|
||||
**Pixel conversion:**
|
||||
1. Boost contrast/color/sharpness (stronger for smaller palettes)
|
||||
2. Posterize to simplify tonal regions before quantization
|
||||
3. Downscale by `block` with `Image.NEAREST` (hard pixels, no interpolation)
|
||||
4. Quantize with Floyd-Steinberg dithering — against either an adaptive
|
||||
N-color palette OR a named hardware palette
|
||||
5. Upscale back with `Image.NEAREST`
|
||||
|
||||
Quantizing AFTER downscale keeps dithering aligned with the final pixel grid.
|
||||
Quantizing before would waste error-diffusion on detail that disappears.
|
||||
|
||||
**Video overlay:**
|
||||
- Copies the base frame each tick (static background)
|
||||
- Overlays stateless-per-frame particle draws (one function per effect)
|
||||
- Encodes via ffmpeg `libx264 -pix_fmt yuv420p -crf 18`
|
||||
- Optional GIF via `palettegen` + `paletteuse`
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Python 3.9+
|
||||
- Pillow (`pip install Pillow`)
|
||||
- ffmpeg on PATH (only needed for video — Hermes installs package this)
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- Pallet keys are case-sensitive (`"NES"`, `"PICO_8"`, `"GAMEBOY_ORIGINAL"`).
|
||||
- Very small sources (<100px wide) collapse under 8-10px blocks. Upscale the
|
||||
source first if it's tiny.
|
||||
- Fractional `block` or `palette` will break quantization — keep them positive ints.
|
||||
- Animation particle counts are tuned for ~640x480 canvases. On very large
|
||||
images you may want a second pass with a different seed for density.
|
||||
- `mono_green` / `mono_amber` force `color=0.0` (desaturate). If you override
|
||||
and keep chroma, the 2-color palette can produce stripes on smooth regions.
|
||||
- `clarify` loop: call it at most twice per turn (style, then scene). Don't
|
||||
pepper the user with more picks.
|
||||
|
||||
## Verification
|
||||
|
||||
- PNG is created at the output path
|
||||
- Clear square pixel blocks visible at the preset's block size
|
||||
- Color count matches preset (eyeball the image or run `Image.open(p).getcolors()`)
|
||||
- Video is a valid MP4 (`ffprobe` can open it) with non-zero size
|
||||
|
||||
## Attribution
|
||||
|
||||
Named hardware palettes and the procedural animation loops in `pixel_art_video.py`
|
||||
are ported from [pixel-art-studio](https://github.com/Synero/pixel-art-studio)
|
||||
(MIT). See `ATTRIBUTION.md` in this skill directory for details.
|
||||
@@ -0,0 +1,212 @@
|
||||
---
|
||||
title: "Popular Web Designs — 54 production-quality design systems extracted from real websites"
|
||||
sidebar_label: "Popular Web Designs"
|
||||
description: "54 production-quality design systems extracted from real websites"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Popular Web Designs
|
||||
|
||||
54 production-quality design systems extracted from real websites. Load a template to generate HTML/CSS that matches the visual identity of sites like Stripe, Linear, Vercel, Notion, Airbnb, and more. Each template includes colors, typography, components, layout rules, and ready-to-use CSS values.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/popular-web-designs` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent + Teknium (design systems sourced from VoltAgent/awesome-design-md) |
|
||||
| License | MIT |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Popular Web Designs
|
||||
|
||||
54 real-world design systems ready for use when generating HTML/CSS. Each template captures a
|
||||
site's complete visual language: color palette, typography hierarchy, component styles, spacing
|
||||
system, shadows, responsive behavior, and practical agent prompts with exact CSS values.
|
||||
|
||||
## How to Use
|
||||
|
||||
1. Pick a design from the catalog below
|
||||
2. Load it: `skill_view(name="popular-web-designs", file_path="templates/<site>.md")`
|
||||
3. Use the design tokens and component specs when generating HTML
|
||||
4. Pair with the `generative-widgets` skill to serve the result via cloudflared tunnel
|
||||
|
||||
Each template includes a **Hermes Implementation Notes** block at the top with:
|
||||
- CDN font substitute and Google Fonts `<link>` tag (ready to paste)
|
||||
- CSS font-family stacks for primary and monospace
|
||||
- Reminders to use `write_file` for HTML creation and `browser_vision` for verification
|
||||
|
||||
## HTML Generation Pattern
|
||||
|
||||
```html
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Page Title</title>
|
||||
<!-- Paste the Google Fonts <link> from the template's Hermes notes -->
|
||||
<link href="https://fonts.googleapis.com/css2?family=..." rel="stylesheet">
|
||||
<style>
|
||||
/* Apply the template's color palette as CSS custom properties */
|
||||
:root {
|
||||
--color-bg: #ffffff;
|
||||
--color-text: #171717;
|
||||
--color-accent: #533afd;
|
||||
/* ... more from template Section 2 */
|
||||
}
|
||||
/* Apply typography from template Section 3 */
|
||||
body {
|
||||
font-family: 'Inter', system-ui, sans-serif;
|
||||
color: var(--color-text);
|
||||
background: var(--color-bg);
|
||||
}
|
||||
/* Apply component styles from template Section 4 */
|
||||
/* Apply layout from template Section 5 */
|
||||
/* Apply shadows from template Section 6 */
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<!-- Build using component specs from the template -->
|
||||
</body>
|
||||
</html>
|
||||
```
|
||||
|
||||
Write the file with `write_file`, serve with the `generative-widgets` workflow (cloudflared tunnel),
|
||||
and verify the result with `browser_vision` to confirm visual accuracy.
|
||||
|
||||
## Font Substitution Reference
|
||||
|
||||
Most sites use proprietary fonts unavailable via CDN. Each template maps to a Google Fonts
|
||||
substitute that preserves the design's character. Common mappings:
|
||||
|
||||
| Proprietary Font | CDN Substitute | Character |
|
||||
|---|---|---|
|
||||
| Geist / Geist Sans | Geist (on Google Fonts) | Geometric, compressed tracking |
|
||||
| Geist Mono | Geist Mono (on Google Fonts) | Clean monospace, ligatures |
|
||||
| sohne-var (Stripe) | Source Sans 3 | Light weight elegance |
|
||||
| Berkeley Mono | JetBrains Mono | Technical monospace |
|
||||
| Airbnb Cereal VF | DM Sans | Rounded, friendly geometric |
|
||||
| Circular (Spotify) | DM Sans | Geometric, warm |
|
||||
| figmaSans | Inter | Clean humanist |
|
||||
| Pin Sans (Pinterest) | DM Sans | Friendly, rounded |
|
||||
| NVIDIA-EMEA | Inter (or Arial system) | Industrial, clean |
|
||||
| CoinbaseDisplay/Sans | DM Sans | Geometric, trustworthy |
|
||||
| UberMove | DM Sans | Bold, tight |
|
||||
| HashiCorp Sans | Inter | Enterprise, neutral |
|
||||
| waldenburgNormal (Sanity) | Space Grotesk | Geometric, slightly condensed |
|
||||
| IBM Plex Sans/Mono | IBM Plex Sans/Mono | Available on Google Fonts |
|
||||
| Rubik (Sentry) | Rubik | Available on Google Fonts |
|
||||
|
||||
When a template's CDN font matches the original (Inter, IBM Plex, Rubik, Geist), no
|
||||
substitution loss occurs. When a substitute is used (DM Sans for Circular, Source Sans 3
|
||||
for sohne-var), follow the template's weight, size, and letter-spacing values closely —
|
||||
those carry more visual identity than the specific font face.
|
||||
|
||||
## Design Catalog
|
||||
|
||||
### AI & Machine Learning
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `claude.md` | Anthropic Claude | Warm terracotta accent, clean editorial layout |
|
||||
| `cohere.md` | Cohere | Vibrant gradients, data-rich dashboard aesthetic |
|
||||
| `elevenlabs.md` | ElevenLabs | Dark cinematic UI, audio-waveform aesthetics |
|
||||
| `minimax.md` | Minimax | Bold dark interface with neon accents |
|
||||
| `mistral.ai.md` | Mistral AI | French-engineered minimalism, purple-toned |
|
||||
| `ollama.md` | Ollama | Terminal-first, monochrome simplicity |
|
||||
| `opencode.ai.md` | OpenCode AI | Developer-centric dark theme, full monospace |
|
||||
| `replicate.md` | Replicate | Clean white canvas, code-forward |
|
||||
| `runwayml.md` | RunwayML | Cinematic dark UI, media-rich layout |
|
||||
| `together.ai.md` | Together AI | Technical, blueprint-style design |
|
||||
| `voltagent.md` | VoltAgent | Void-black canvas, emerald accent, terminal-native |
|
||||
| `x.ai.md` | xAI | Stark monochrome, futuristic minimalism, full monospace |
|
||||
|
||||
### Developer Tools & Platforms
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `cursor.md` | Cursor | Sleek dark interface, gradient accents |
|
||||
| `expo.md` | Expo | Dark theme, tight letter-spacing, code-centric |
|
||||
| `linear.app.md` | Linear | Ultra-minimal dark-mode, precise, purple accent |
|
||||
| `lovable.md` | Lovable | Playful gradients, friendly dev aesthetic |
|
||||
| `mintlify.md` | Mintlify | Clean, green-accented, reading-optimized |
|
||||
| `posthog.md` | PostHog | Playful branding, developer-friendly dark UI |
|
||||
| `raycast.md` | Raycast | Sleek dark chrome, vibrant gradient accents |
|
||||
| `resend.md` | Resend | Minimal dark theme, monospace accents |
|
||||
| `sentry.md` | Sentry | Dark dashboard, data-dense, pink-purple accent |
|
||||
| `supabase.md` | Supabase | Dark emerald theme, code-first developer tool |
|
||||
| `superhuman.md` | Superhuman | Premium dark UI, keyboard-first, purple glow |
|
||||
| `vercel.md` | Vercel | Black and white precision, Geist font system |
|
||||
| `warp.md` | Warp | Dark IDE-like interface, block-based command UI |
|
||||
| `zapier.md` | Zapier | Warm orange, friendly illustration-driven |
|
||||
|
||||
### Infrastructure & Cloud
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `clickhouse.md` | ClickHouse | Yellow-accented, technical documentation style |
|
||||
| `composio.md` | Composio | Modern dark with colorful integration icons |
|
||||
| `hashicorp.md` | HashiCorp | Enterprise-clean, black and white |
|
||||
| `mongodb.md` | MongoDB | Green leaf branding, developer documentation focus |
|
||||
| `sanity.md` | Sanity | Red accent, content-first editorial layout |
|
||||
| `stripe.md` | Stripe | Signature purple gradients, weight-300 elegance |
|
||||
|
||||
### Design & Productivity
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `airtable.md` | Airtable | Colorful, friendly, structured data aesthetic |
|
||||
| `cal.md` | Cal.com | Clean neutral UI, developer-oriented simplicity |
|
||||
| `clay.md` | Clay | Organic shapes, soft gradients, art-directed layout |
|
||||
| `figma.md` | Figma | Vibrant multi-color, playful yet professional |
|
||||
| `framer.md` | Framer | Bold black and blue, motion-first, design-forward |
|
||||
| `intercom.md` | Intercom | Friendly blue palette, conversational UI patterns |
|
||||
| `miro.md` | Miro | Bright yellow accent, infinite canvas aesthetic |
|
||||
| `notion.md` | Notion | Warm minimalism, serif headings, soft surfaces |
|
||||
| `pinterest.md` | Pinterest | Red accent, masonry grid, image-first layout |
|
||||
| `webflow.md` | Webflow | Blue-accented, polished marketing site aesthetic |
|
||||
|
||||
### Fintech & Crypto
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `coinbase.md` | Coinbase | Clean blue identity, trust-focused, institutional feel |
|
||||
| `kraken.md` | Kraken | Purple-accented dark UI, data-dense dashboards |
|
||||
| `revolut.md` | Revolut | Sleek dark interface, gradient cards, fintech precision |
|
||||
| `wise.md` | Wise | Bright green accent, friendly and clear |
|
||||
|
||||
### Enterprise & Consumer
|
||||
|
||||
| Template | Site | Style |
|
||||
|---|---|---|
|
||||
| `airbnb.md` | Airbnb | Warm coral accent, photography-driven, rounded UI |
|
||||
| `apple.md` | Apple | Premium white space, SF Pro, cinematic imagery |
|
||||
| `bmw.md` | BMW | Dark premium surfaces, precise engineering aesthetic |
|
||||
| `ibm.md` | IBM | Carbon design system, structured blue palette |
|
||||
| `nvidia.md` | NVIDIA | Green-black energy, technical power aesthetic |
|
||||
| `spacex.md` | SpaceX | Stark black and white, full-bleed imagery, futuristic |
|
||||
| `spotify.md` | Spotify | Vibrant green on dark, bold type, album-art-driven |
|
||||
| `uber.md` | Uber | Bold black and white, tight type, urban energy |
|
||||
|
||||
## Choosing a Design
|
||||
|
||||
Match the design to the content:
|
||||
|
||||
- **Developer tools / dashboards:** Linear, Vercel, Supabase, Raycast, Sentry
|
||||
- **Documentation / content sites:** Mintlify, Notion, Sanity, MongoDB
|
||||
- **Marketing / landing pages:** Stripe, Framer, Apple, SpaceX
|
||||
- **Dark mode UIs:** Linear, Cursor, ElevenLabs, Warp, Superhuman
|
||||
- **Light / clean UIs:** Vercel, Stripe, Notion, Cal.com, Replicate
|
||||
- **Playful / friendly:** PostHog, Figma, Lovable, Zapier, Miro
|
||||
- **Premium / luxury:** Apple, BMW, Stripe, Superhuman, Revolut
|
||||
- **Data-dense / dashboards:** Sentry, Kraken, Cohere, ClickHouse
|
||||
- **Monospace / terminal aesthetic:** Ollama, OpenCode, x.ai, VoltAgent
|
||||
@@ -0,0 +1,297 @@
|
||||
---
|
||||
title: "Songwriting And Ai Music"
|
||||
sidebar_label: "Songwriting And Ai Music"
|
||||
description: "Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation techniques, phonetic tricks, and lessons learned"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Songwriting And Ai Music
|
||||
|
||||
Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation techniques, phonetic tricks, and lessons learned. These are tools and ideas, not rules. Break any of them when the art calls for it.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/creative/songwriting-and-ai-music` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Songwriting & AI Music Generation
|
||||
|
||||
Everything here is a GUIDELINE, not a rule. Art breaks rules on purpose.
|
||||
Use what serves the song. Ignore what doesn't.
|
||||
|
||||
---
|
||||
|
||||
## 1. Song Structure (Pick One or Invent Your Own)
|
||||
|
||||
Common skeletons — mix, modify, or throw out as needed:
|
||||
|
||||
```
|
||||
ABABCB Verse/Chorus/Verse/Chorus/Bridge/Chorus (most pop/rock)
|
||||
AABA Verse/Verse/Bridge/Verse (refrain-based) (jazz standards, ballads)
|
||||
ABAB Verse/Chorus alternating (simple, direct)
|
||||
AAA Verse/Verse/Verse (strophic, no chorus) (folk, storytelling)
|
||||
```
|
||||
|
||||
The six building blocks:
|
||||
- Intro — set the mood, pull the listener in
|
||||
- Verse — the story, the details, the world-building
|
||||
- Pre-Chorus — optional tension ramp before the payoff
|
||||
- Chorus — the emotional core, the part people remember
|
||||
- Bridge — a detour, a shift in perspective or key
|
||||
- Outro — the farewell, can echo or subvert the rest
|
||||
|
||||
You don't need all of these. Some great songs are just one section
|
||||
that evolves. Structure serves the emotion, not the other way around.
|
||||
|
||||
---
|
||||
|
||||
## 2. Rhyme, Meter, and Sound
|
||||
|
||||
RHYME TYPES (from tight to loose):
|
||||
- Perfect: lean/mean
|
||||
- Family: crate/braid
|
||||
- Assonance: had/glass (same vowels, different endings)
|
||||
- Consonance: scene/when (different vowels, similar endings)
|
||||
- Near/slant: enough to suggest connection without locking it down
|
||||
|
||||
Mix them. All perfect rhymes can sound like a nursery rhyme.
|
||||
All slant rhymes can sound lazy. The blend is where it lives.
|
||||
|
||||
INTERNAL RHYME: Rhyming within a line, not just at the ends.
|
||||
"We pruned the lies from bleeding trees / Distilled the storm
|
||||
from entropy" — "lies/flies," "trees/entropy" create internal echoes.
|
||||
|
||||
METER: The rhythm of stressed vs unstressed syllables.
|
||||
- Matching syllable counts between parallel lines helps singability
|
||||
- The STRESSED syllables matter more than total count
|
||||
- Say it out loud. If you stumble, the meter needs work.
|
||||
- Intentionally breaking meter can create emphasis or surprise
|
||||
|
||||
---
|
||||
|
||||
## 3. Emotional Arc and Dynamics
|
||||
|
||||
Think of a song as a journey, not a flat road.
|
||||
|
||||
ENERGY MAPPING (rough idea, not prescription):
|
||||
Intro: 2-3 | Verse: 5-6 | Pre-Chorus: 7
|
||||
Chorus: 8-9 | Bridge: varies | Final Chorus: 9-10
|
||||
|
||||
The most powerful dynamic trick: CONTRAST.
|
||||
- Whisper before a scream hits harder than just screaming
|
||||
- Sparse before dense. Slow before fast. Low before high.
|
||||
- The drop only works because of the buildup
|
||||
- Silence is an instrument
|
||||
|
||||
"Whisper to roar to whisper" — start intimate, build to full power,
|
||||
strip back to vulnerability. Works for ballads, epics, anthems.
|
||||
|
||||
---
|
||||
|
||||
## 4. Writing Lyrics That Work
|
||||
|
||||
SHOW, DON'T TELL (usually):
|
||||
- "I was sad" = flat
|
||||
- "Your hoodie's still on the hook by the door" = alive
|
||||
- But sometimes "I give my life" said plainly IS the power
|
||||
|
||||
THE HOOK:
|
||||
- The line people remember, hum, repeat
|
||||
- Usually the title or core phrase
|
||||
- Works best when melody + lyric + emotion all align
|
||||
- Place it where it lands hardest (often first/last line of chorus)
|
||||
|
||||
PROSODY — lyrics and music supporting each other:
|
||||
- Stable feelings (resolution, peace) pair with settled melodies,
|
||||
perfect rhymes, resolved chords
|
||||
- Unstable feelings (longing, doubt) pair with wandering melodies,
|
||||
near-rhymes, unresolved chords
|
||||
- Verse melody typically sits lower, chorus goes higher
|
||||
- But flip this if it serves the song
|
||||
|
||||
AVOID (unless you're doing it on purpose):
|
||||
- Cliches on autopilot ("heart of gold" without earning it)
|
||||
- Forcing word order to hit a rhyme ("Yoda-speak")
|
||||
- Same energy in every section (flat dynamics)
|
||||
- Treating your first draft as sacred — revision is creation
|
||||
|
||||
---
|
||||
|
||||
## 5. Parody and Adaptation
|
||||
|
||||
When rewriting an existing song with new lyrics:
|
||||
|
||||
THE SKELETON: Map the original's structure first.
|
||||
- Count syllables per line
|
||||
- Mark the rhyme scheme (ABAB, AABB, etc.)
|
||||
- Identify which syllables are STRESSED
|
||||
- Note where held/sustained notes fall
|
||||
|
||||
FITTING NEW WORDS:
|
||||
- Match stressed syllables to the same beats as the original
|
||||
- Total syllable count can flex by 1-2 unstressed syllables
|
||||
- On long held notes, try to match the VOWEL SOUND of the original
|
||||
(if original holds "LOOOVE" with an "oo" vowel, "FOOOD" fits
|
||||
better than "LIFE")
|
||||
- Monosyllabic swaps in key spots keep rhythm intact
|
||||
(Crime -> Code, Snake -> Noose)
|
||||
- Sing your new words over the original — if you stumble, revise
|
||||
|
||||
CONCEPT:
|
||||
- Pick a concept strong enough to sustain the whole song
|
||||
- Start from the title/hook and build outward
|
||||
- Generate lots of raw material (puns, phrases, images) FIRST,
|
||||
then fit the best ones into the structure
|
||||
- If you need a specific line somewhere, reverse-engineer the
|
||||
rhyme scheme backward to set it up
|
||||
|
||||
KEEP SOME ORIGINALS: Leaving a few original lines or structures
|
||||
intact adds recognizability and lets the audience feel the connection.
|
||||
|
||||
---
|
||||
|
||||
## 6. Suno AI Prompt Engineering
|
||||
|
||||
### Style/Genre Description Field
|
||||
|
||||
FORMULA (adapt as needed):
|
||||
Genre + Mood + Era + Instruments + Vocal Style + Production + Dynamics
|
||||
|
||||
```
|
||||
BAD: "sad rock song"
|
||||
GOOD: "Cinematic orchestral spy thriller, 1960s Cold War era, smoky
|
||||
sultry female vocalist, big band jazz, brass section with
|
||||
trumpets and french horns, sweeping strings, minor key,
|
||||
vintage analog warmth"
|
||||
```
|
||||
|
||||
DESCRIBE THE JOURNEY, not just the genre:
|
||||
```
|
||||
"Begins as a haunting whisper over sparse piano. Gradually layers
|
||||
in muted brass. Builds through the chorus with full orchestra.
|
||||
Second verse erupts with raw belting intensity. Outro strips back
|
||||
to a lone piano and a fragile whisper fading to silence."
|
||||
```
|
||||
|
||||
TIPS:
|
||||
- V4.5+ supports up to 1,000 chars in Style field — use them
|
||||
- NO artist names or trademarks. Describe the sound instead.
|
||||
"1960s Cold War spy thriller brass" not "James Bond style"
|
||||
"90s grunge" not "Nirvana-style"
|
||||
- Specify BPM and key when you have a preference
|
||||
- Use Exclude Styles field for what you DON'T want
|
||||
- Unexpected genre combos can be gold: "bossa nova trap",
|
||||
"Appalachian gothic", "chiptune jazz"
|
||||
- Build a vocal PERSONA, not just a gender:
|
||||
"A weathered torch singer with a smoky alto, slight rasp,
|
||||
who starts vulnerable and builds to devastating power"
|
||||
|
||||
### Metatags (place in [brackets] inside lyrics field)
|
||||
|
||||
STRUCTURE:
|
||||
[Intro] [Verse] [Verse 1] [Pre-Chorus] [Chorus]
|
||||
[Post-Chorus] [Hook] [Bridge] [Interlude]
|
||||
[Instrumental] [Instrumental Break] [Guitar Solo]
|
||||
[Breakdown] [Build-up] [Outro] [Silence] [End]
|
||||
|
||||
VOCAL PERFORMANCE:
|
||||
[Whispered] [Spoken Word] [Belted] [Falsetto] [Powerful]
|
||||
[Soulful] [Raspy] [Breathy] [Smooth] [Gritty]
|
||||
[Staccato] [Legato] [Vibrato] [Melismatic]
|
||||
[Harmonies] [Choir] [Harmonized Chorus]
|
||||
|
||||
DYNAMICS:
|
||||
[High Energy] [Low Energy] [Building Energy] [Explosive]
|
||||
[Emotional Climax] [Gradual swell] [Orchestral swell]
|
||||
[Quiet arrangement] [Falling tension] [Slow Down]
|
||||
|
||||
GENDER:
|
||||
[Female Vocals] [Male Vocals]
|
||||
|
||||
ATMOSPHERE:
|
||||
[Melancholic] [Euphoric] [Nostalgic] [Aggressive]
|
||||
[Dreamy] [Intimate] [Dark Atmosphere]
|
||||
|
||||
SFX:
|
||||
[Vinyl Crackle] [Rain] [Applause] [Static] [Thunder]
|
||||
|
||||
Put tags in BOTH style field AND lyrics for reinforcement.
|
||||
Keep to 5-8 tags per section max — too many confuses the AI.
|
||||
Don't contradict yourself ([Calm] + [Aggressive] in same section).
|
||||
|
||||
### Custom Mode
|
||||
- Always use Custom Mode for serious work (separate Style + Lyrics)
|
||||
- Lyrics field limit: ~3,000 chars (~40-60 lines)
|
||||
- Always add structural tags — without them Suno defaults to
|
||||
flat verse/chorus/verse with no emotional arc
|
||||
|
||||
---
|
||||
|
||||
## 7. Phonetic Tricks for AI Singers
|
||||
|
||||
AI vocalists don't read — they pronounce. Help them:
|
||||
|
||||
PHONETIC RESPELLING:
|
||||
- Spell words as they SOUND: "through" -> "thru"
|
||||
- Proper nouns are highest failure rate — test early
|
||||
- "Nous" -> "Noose" (forces correct pronunciation)
|
||||
- Hyphenate to guide syllables: "Re-search", "bio-engineering"
|
||||
|
||||
DELIVERY CONTROL:
|
||||
- ALL CAPS = louder, more intense
|
||||
- Vowel extension: "lo-o-o-ove" = sustained/melisma
|
||||
- Ellipses: "I... need... you" = dramatic pauses
|
||||
- Hyphenated stretch: "ne-e-ed" = emotional stretch
|
||||
|
||||
ALWAYS:
|
||||
- Spell out numbers: "24/7" -> "twenty four seven"
|
||||
- Space acronyms: "AI" -> "A I" or "A-I"
|
||||
- Test proper nouns/unusual words in a short 30-second clip first
|
||||
- Once generated, pronunciation is baked in — fix in lyrics BEFORE
|
||||
|
||||
---
|
||||
|
||||
## 8. Workflow
|
||||
|
||||
1. Write the concept/hook first — what's the emotional core?
|
||||
2. If adapting, map the original structure (syllables, rhyme, stress)
|
||||
3. Generate raw material — brainstorm freely before structuring
|
||||
4. Draft lyrics into the structure
|
||||
5. Read/sing aloud — catch stumbles, fix meter
|
||||
6. Build the Suno style description — paint the dynamic journey
|
||||
7. Add metatags to lyrics for performance direction
|
||||
8. Generate 3-5 variations minimum — treat them like recording takes
|
||||
9. Pick the best, use Extend/Continue to build on promising sections
|
||||
10. If something great happens by accident, keep it
|
||||
|
||||
EXPECT: ~3-5 generations per 1 good result. Revision is normal.
|
||||
Style can drift in extensions — restate genre/mood when extending.
|
||||
|
||||
---
|
||||
|
||||
## 9. Lessons Learned
|
||||
|
||||
- Describing the dynamic ARC in the style field matters way more
|
||||
than just listing genres. "Whisper to roar to whisper" gives
|
||||
Suno a performance map.
|
||||
- Keeping some original lines intact in a parody adds recognizability
|
||||
and emotional weight — the audience feels the ghost of the original.
|
||||
- The bridge slot in a song is where you can transform imagery.
|
||||
Swap the original's specific references for your theme's metaphors
|
||||
while keeping the emotional function (reflection, shift, revelation).
|
||||
- Monosyllabic word swaps in hooks/tags are the cleanest way to
|
||||
maintain rhythm while changing meaning.
|
||||
- A strong vocal persona description in the style field makes a
|
||||
bigger difference than any single metatag.
|
||||
- Don't be precious about rules. If a line breaks meter but hits
|
||||
harder, keep it. The feeling is what matters. Craft serves art,
|
||||
not the other way around.
|
||||
+183
@@ -0,0 +1,183 @@
|
||||
---
|
||||
title: "Jupyter Live Kernel — Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb"
|
||||
sidebar_label: "Jupyter Live Kernel"
|
||||
description: "Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Jupyter Live Kernel
|
||||
|
||||
Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb. Load this skill when the task involves exploration, iteration, or inspecting intermediate results — data science, ML experimentation, API exploration, or building up complex code step-by-step. Uses terminal to run CLI commands against a live Jupyter kernel. No new tools required.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/data-science/jupyter-live-kernel` |
|
||||
| Version | `1.0.0` |
|
||||
| Author | Hermes Agent |
|
||||
| License | MIT |
|
||||
| Tags | `jupyter`, `notebook`, `repl`, `data-science`, `exploration`, `iterative` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Jupyter Live Kernel (hamelnb)
|
||||
|
||||
Gives you a **stateful Python REPL** via a live Jupyter kernel. Variables persist
|
||||
across executions. Use this instead of `execute_code` when you need to build up
|
||||
state incrementally, explore APIs, inspect DataFrames, or iterate on complex code.
|
||||
|
||||
## When to Use This vs Other Tools
|
||||
|
||||
| Tool | Use When |
|
||||
|------|----------|
|
||||
| **This skill** | Iterative exploration, state across steps, data science, ML, "let me try this and check" |
|
||||
| `execute_code` | One-shot scripts needing hermes tool access (web_search, file ops). Stateless. |
|
||||
| `terminal` | Shell commands, builds, installs, git, process management |
|
||||
|
||||
**Rule of thumb:** If you'd want a Jupyter notebook for the task, use this skill.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **uv** must be installed (check: `which uv`)
|
||||
2. **JupyterLab** must be installed: `uv tool install jupyterlab`
|
||||
3. A Jupyter server must be running (see Setup below)
|
||||
|
||||
## Setup
|
||||
|
||||
The hamelnb script location:
|
||||
```
|
||||
SCRIPT="$HOME/.agent-skills/hamelnb/skills/jupyter-live-kernel/scripts/jupyter_live_kernel.py"
|
||||
```
|
||||
|
||||
If not cloned yet:
|
||||
```
|
||||
git clone https://github.com/hamelsmu/hamelnb.git ~/.agent-skills/hamelnb
|
||||
```
|
||||
|
||||
### Starting JupyterLab
|
||||
|
||||
Check if a server is already running:
|
||||
```
|
||||
uv run "$SCRIPT" servers
|
||||
```
|
||||
|
||||
If no servers found, start one:
|
||||
```
|
||||
jupyter-lab --no-browser --port=8888 --notebook-dir=$HOME/notebooks \
|
||||
--IdentityProvider.token='' --ServerApp.password='' > /tmp/jupyter.log 2>&1 &
|
||||
sleep 3
|
||||
```
|
||||
|
||||
Note: Token/password disabled for local agent access. The server runs headless.
|
||||
|
||||
### Creating a Notebook for REPL Use
|
||||
|
||||
If you just need a REPL (no existing notebook), create a minimal notebook file:
|
||||
```
|
||||
mkdir -p ~/notebooks
|
||||
```
|
||||
Write a minimal .ipynb JSON file with one empty code cell, then start a kernel
|
||||
session via the Jupyter REST API:
|
||||
```
|
||||
curl -s -X POST http://127.0.0.1:8888/api/sessions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"path":"scratch.ipynb","type":"notebook","name":"scratch.ipynb","kernel":{"name":"python3"}}'
|
||||
```
|
||||
|
||||
## Core Workflow
|
||||
|
||||
All commands return structured JSON. Always use `--compact` to save tokens.
|
||||
|
||||
### 1. Discover servers and notebooks
|
||||
|
||||
```
|
||||
uv run "$SCRIPT" servers --compact
|
||||
uv run "$SCRIPT" notebooks --compact
|
||||
```
|
||||
|
||||
### 2. Execute code (primary operation)
|
||||
|
||||
```
|
||||
uv run "$SCRIPT" execute --path <notebook.ipynb> --code '<python code>' --compact
|
||||
```
|
||||
|
||||
State persists across execute calls. Variables, imports, objects all survive.
|
||||
|
||||
Multi-line code works with $'...' quoting:
|
||||
```
|
||||
uv run "$SCRIPT" execute --path scratch.ipynb --code $'import os\nfiles = os.listdir(".")\nprint(f"Found {len(files)} files")' --compact
|
||||
```
|
||||
|
||||
### 3. Inspect live variables
|
||||
|
||||
```
|
||||
uv run "$SCRIPT" variables --path <notebook.ipynb> list --compact
|
||||
uv run "$SCRIPT" variables --path <notebook.ipynb> preview --name <varname> --compact
|
||||
```
|
||||
|
||||
### 4. Edit notebook cells
|
||||
|
||||
```
|
||||
# View current cells
|
||||
uv run "$SCRIPT" contents --path <notebook.ipynb> --compact
|
||||
|
||||
# Insert a new cell
|
||||
uv run "$SCRIPT" edit --path <notebook.ipynb> insert \
|
||||
--at-index <N> --cell-type code --source '<code>' --compact
|
||||
|
||||
# Replace cell source (use cell-id from contents output)
|
||||
uv run "$SCRIPT" edit --path <notebook.ipynb> replace-source \
|
||||
--cell-id <id> --source '<new code>' --compact
|
||||
|
||||
# Delete a cell
|
||||
uv run "$SCRIPT" edit --path <notebook.ipynb> delete --cell-id <id> --compact
|
||||
```
|
||||
|
||||
### 5. Verification (restart + run all)
|
||||
|
||||
Only use when the user asks for a clean verification or you need to confirm
|
||||
the notebook runs top-to-bottom:
|
||||
|
||||
```
|
||||
uv run "$SCRIPT" restart-run-all --path <notebook.ipynb> --save-outputs --compact
|
||||
```
|
||||
|
||||
## Practical Tips from Experience
|
||||
|
||||
1. **First execution after server start may timeout** — the kernel needs a moment
|
||||
to initialize. If you get a timeout, just retry.
|
||||
|
||||
2. **The kernel Python is JupyterLab's Python** — packages must be installed in
|
||||
that environment. If you need additional packages, install them into the
|
||||
JupyterLab tool environment first.
|
||||
|
||||
3. **--compact flag saves significant tokens** — always use it. JSON output can
|
||||
be very verbose without it.
|
||||
|
||||
4. **For pure REPL use**, create a scratch.ipynb and don't bother with cell editing.
|
||||
Just use `execute` repeatedly.
|
||||
|
||||
5. **Argument order matters** — subcommand flags like `--path` go BEFORE the
|
||||
sub-subcommand. E.g.: `variables --path nb.ipynb list` not `variables list --path nb.ipynb`.
|
||||
|
||||
6. **If a session doesn't exist yet**, you need to start one via the REST API
|
||||
(see Setup section). The tool can't execute without a live kernel session.
|
||||
|
||||
7. **Errors are returned as JSON** with traceback — read the `ename` and `evalue`
|
||||
fields to understand what went wrong.
|
||||
|
||||
8. **Occasional websocket timeouts** — some operations may timeout on first try,
|
||||
especially after a kernel restart. Retry once before escalating.
|
||||
|
||||
## Timeout Defaults
|
||||
|
||||
The script has a 30-second default timeout per execution. For long-running
|
||||
operations, pass `--timeout 120`. Use generous timeouts (60+) for initial
|
||||
setup or heavy computation.
|
||||
@@ -0,0 +1,221 @@
|
||||
---
|
||||
title: "Webhook Subscriptions"
|
||||
sidebar_label: "Webhook Subscriptions"
|
||||
description: "Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost)"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Webhook Subscriptions
|
||||
|
||||
Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost). Use when the user wants external services to trigger agent runs OR push notifications to chats.
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/devops/webhook-subscriptions` |
|
||||
| Version | `1.1.0` |
|
||||
| Tags | `webhook`, `events`, `automation`, `integrations`, `notifications`, `push` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Webhook Subscriptions
|
||||
|
||||
Create dynamic webhook subscriptions so external services (GitHub, GitLab, Stripe, CI/CD, IoT sensors, monitoring tools) can trigger Hermes agent runs by POSTing events to a URL.
|
||||
|
||||
## Setup (Required First)
|
||||
|
||||
The webhook platform must be enabled before subscriptions can be created. Check with:
|
||||
```bash
|
||||
hermes webhook list
|
||||
```
|
||||
|
||||
If it says "Webhook platform is not enabled", set it up:
|
||||
|
||||
### Option 1: Setup wizard
|
||||
```bash
|
||||
hermes gateway setup
|
||||
```
|
||||
Follow the prompts to enable webhooks, set the port, and set a global HMAC secret.
|
||||
|
||||
### Option 2: Manual config
|
||||
Add to `~/.hermes/config.yaml`:
|
||||
```yaml
|
||||
platforms:
|
||||
webhook:
|
||||
enabled: true
|
||||
extra:
|
||||
host: "0.0.0.0"
|
||||
port: 8644
|
||||
secret: "generate-a-strong-secret-here"
|
||||
```
|
||||
|
||||
### Option 3: Environment variables
|
||||
Add to `~/.hermes/.env`:
|
||||
```bash
|
||||
WEBHOOK_ENABLED=true
|
||||
WEBHOOK_PORT=8644
|
||||
WEBHOOK_SECRET=generate-a-strong-secret-here
|
||||
```
|
||||
|
||||
After configuration, start (or restart) the gateway:
|
||||
```bash
|
||||
hermes gateway run
|
||||
# Or if using systemd:
|
||||
systemctl --user restart hermes-gateway
|
||||
```
|
||||
|
||||
Verify it's running:
|
||||
```bash
|
||||
curl http://localhost:8644/health
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
All management is via the `hermes webhook` CLI command:
|
||||
|
||||
### Create a subscription
|
||||
```bash
|
||||
hermes webhook subscribe <name> \
|
||||
--prompt "Prompt template with {payload.fields}" \
|
||||
--events "event1,event2" \
|
||||
--description "What this does" \
|
||||
--skills "skill1,skill2" \
|
||||
--deliver telegram \
|
||||
--deliver-chat-id "12345" \
|
||||
--secret "optional-custom-secret"
|
||||
```
|
||||
|
||||
Returns the webhook URL and HMAC secret. The user configures their service to POST to that URL.
|
||||
|
||||
### List subscriptions
|
||||
```bash
|
||||
hermes webhook list
|
||||
```
|
||||
|
||||
### Remove a subscription
|
||||
```bash
|
||||
hermes webhook remove <name>
|
||||
```
|
||||
|
||||
### Test a subscription
|
||||
```bash
|
||||
hermes webhook test <name>
|
||||
hermes webhook test <name> --payload '{"key": "value"}'
|
||||
```
|
||||
|
||||
## Prompt Templates
|
||||
|
||||
Prompts support `{dot.notation}` for accessing nested payload fields:
|
||||
|
||||
- `{issue.title}` — GitHub issue title
|
||||
- `{pull_request.user.login}` — PR author
|
||||
- `{data.object.amount}` — Stripe payment amount
|
||||
- `{sensor.temperature}` — IoT sensor reading
|
||||
|
||||
If no prompt is specified, the full JSON payload is dumped into the agent prompt.
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### GitHub: new issues
|
||||
```bash
|
||||
hermes webhook subscribe github-issues \
|
||||
--events "issues" \
|
||||
--prompt "New GitHub issue #{issue.number}: {issue.title}\n\nAction: {action}\nAuthor: {issue.user.login}\nBody:\n{issue.body}\n\nPlease triage this issue." \
|
||||
--deliver telegram \
|
||||
--deliver-chat-id "-100123456789"
|
||||
```
|
||||
|
||||
Then in GitHub repo Settings → Webhooks → Add webhook:
|
||||
- Payload URL: the returned webhook_url
|
||||
- Content type: application/json
|
||||
- Secret: the returned secret
|
||||
- Events: "Issues"
|
||||
|
||||
### GitHub: PR reviews
|
||||
```bash
|
||||
hermes webhook subscribe github-prs \
|
||||
--events "pull_request" \
|
||||
--prompt "PR #{pull_request.number} {action}: {pull_request.title}\nBy: {pull_request.user.login}\nBranch: {pull_request.head.ref}\n\n{pull_request.body}" \
|
||||
--skills "github-code-review" \
|
||||
--deliver github_comment
|
||||
```
|
||||
|
||||
### Stripe: payment events
|
||||
```bash
|
||||
hermes webhook subscribe stripe-payments \
|
||||
--events "payment_intent.succeeded,payment_intent.payment_failed" \
|
||||
--prompt "Payment {data.object.status}: {data.object.amount} cents from {data.object.receipt_email}" \
|
||||
--deliver telegram \
|
||||
--deliver-chat-id "-100123456789"
|
||||
```
|
||||
|
||||
### CI/CD: build notifications
|
||||
```bash
|
||||
hermes webhook subscribe ci-builds \
|
||||
--events "pipeline" \
|
||||
--prompt "Build {object_attributes.status} on {project.name} branch {object_attributes.ref}\nCommit: {commit.message}" \
|
||||
--deliver discord \
|
||||
--deliver-chat-id "1234567890"
|
||||
```
|
||||
|
||||
### Generic monitoring alert
|
||||
```bash
|
||||
hermes webhook subscribe alerts \
|
||||
--prompt "Alert: {alert.name}\nSeverity: {alert.severity}\nMessage: {alert.message}\n\nPlease investigate and suggest remediation." \
|
||||
--deliver origin
|
||||
```
|
||||
|
||||
### Direct delivery (no agent, zero LLM cost)
|
||||
|
||||
For use cases where you just want to push a notification through to a user's chat — no reasoning, no agent loop — add `--deliver-only`. The rendered `--prompt` template becomes the literal message body and is dispatched directly to the target adapter.
|
||||
|
||||
Use this for:
|
||||
- External service push notifications (Supabase/Firebase webhooks → Telegram)
|
||||
- Monitoring alerts that should forward verbatim
|
||||
- Inter-agent pings where one agent is telling another agent's user something
|
||||
- Any webhook where an LLM round trip would be wasted effort
|
||||
|
||||
```bash
|
||||
hermes webhook subscribe antenna-matches \
|
||||
--deliver telegram \
|
||||
--deliver-chat-id "123456789" \
|
||||
--deliver-only \
|
||||
--prompt "🎉 New match: {match.user_name} matched with you!" \
|
||||
--description "Antenna match notifications"
|
||||
```
|
||||
|
||||
The POST returns `200 OK` on successful delivery, `502` on target failure — so upstream services can retry intelligently. HMAC auth, rate limits, and idempotency still apply.
|
||||
|
||||
Requires `--deliver` to be a real target (telegram, discord, slack, github_comment, etc.) — `--deliver log` is rejected because log-only direct delivery is pointless.
|
||||
|
||||
## Security
|
||||
|
||||
- Each subscription gets an auto-generated HMAC-SHA256 secret (or provide your own with `--secret`)
|
||||
- The webhook adapter validates signatures on every incoming POST
|
||||
- Static routes from config.yaml cannot be overwritten by dynamic subscriptions
|
||||
- Subscriptions persist to `~/.hermes/webhook_subscriptions.json`
|
||||
|
||||
## How It Works
|
||||
|
||||
1. `hermes webhook subscribe` writes to `~/.hermes/webhook_subscriptions.json`
|
||||
2. The webhook adapter hot-reloads this file on each incoming request (mtime-gated, negligible overhead)
|
||||
3. When a POST arrives matching a route, the adapter formats the prompt and triggers an agent run
|
||||
4. The agent's response is delivered to the configured target (Telegram, Discord, GitHub comment, etc.)
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If webhooks aren't working:
|
||||
|
||||
1. **Is the gateway running?** Check with `systemctl --user status hermes-gateway` or `ps aux | grep gateway`
|
||||
2. **Is the webhook server listening?** `curl http://localhost:8644/health` should return `{"status": "ok"}`
|
||||
3. **Check gateway logs:** `grep webhook ~/.hermes/logs/gateway.log | tail -20`
|
||||
4. **Signature mismatch?** Verify the secret in your service matches the one from `hermes webhook list`. GitHub sends `X-Hub-Signature-256`, GitLab sends `X-Gitlab-Token`.
|
||||
5. **Firewall/NAT?** The webhook URL must be reachable from the service. For local development, use a tunnel (ngrok, cloudflared).
|
||||
6. **Wrong event type?** Check `--events` filter matches what the service sends. Use `hermes webhook test <name>` to verify the route works.
|
||||
@@ -0,0 +1,178 @@
|
||||
---
|
||||
title: "Dogfood"
|
||||
sidebar_label: "Dogfood"
|
||||
description: "Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports"
|
||||
---
|
||||
|
||||
{/* This page is auto-generated from the skill's SKILL.md by website/scripts/generate-skill-docs.py. Edit the source SKILL.md, not this page. */}
|
||||
|
||||
# Dogfood
|
||||
|
||||
Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports
|
||||
|
||||
## Skill metadata
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Source | Bundled (installed by default) |
|
||||
| Path | `skills/dogfood` |
|
||||
| Version | `1.0.0` |
|
||||
| Tags | `qa`, `testing`, `browser`, `web`, `dogfood` |
|
||||
|
||||
## Reference: full SKILL.md
|
||||
|
||||
:::info
|
||||
The following is the complete skill definition that Hermes loads when this skill is triggered. This is what the agent sees as instructions when the skill is active.
|
||||
:::
|
||||
|
||||
# Dogfood: Systematic Web Application QA Testing
|
||||
|
||||
## Overview
|
||||
|
||||
This skill guides you through systematic exploratory QA testing of web applications using the browser toolset. You will navigate the application, interact with elements, capture evidence of issues, and produce a structured bug report.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Browser toolset must be available (`browser_navigate`, `browser_snapshot`, `browser_click`, `browser_type`, `browser_vision`, `browser_console`, `browser_scroll`, `browser_back`, `browser_press`)
|
||||
- A target URL and testing scope from the user
|
||||
|
||||
## Inputs
|
||||
|
||||
The user provides:
|
||||
1. **Target URL** — the entry point for testing
|
||||
2. **Scope** — what areas/features to focus on (or "full site" for comprehensive testing)
|
||||
3. **Output directory** (optional) — where to save screenshots and the report (default: `./dogfood-output`)
|
||||
|
||||
## Workflow
|
||||
|
||||
Follow this 5-phase systematic workflow:
|
||||
|
||||
### Phase 1: Plan
|
||||
|
||||
1. Create the output directory structure:
|
||||
```
|
||||
{output_dir}/
|
||||
├── screenshots/ # Evidence screenshots
|
||||
└── report.md # Final report (generated in Phase 5)
|
||||
```
|
||||
2. Identify the testing scope based on user input.
|
||||
3. Build a rough sitemap by planning which pages and features to test:
|
||||
- Landing/home page
|
||||
- Navigation links (header, footer, sidebar)
|
||||
- Key user flows (sign up, login, search, checkout, etc.)
|
||||
- Forms and interactive elements
|
||||
- Edge cases (empty states, error pages, 404s)
|
||||
|
||||
### Phase 2: Explore
|
||||
|
||||
For each page or feature in your plan:
|
||||
|
||||
1. **Navigate** to the page:
|
||||
```
|
||||
browser_navigate(url="https://example.com/page")
|
||||
```
|
||||
|
||||
2. **Take a snapshot** to understand the DOM structure:
|
||||
```
|
||||
browser_snapshot()
|
||||
```
|
||||
|
||||
3. **Check the console** for JavaScript errors:
|
||||
```
|
||||
browser_console(clear=true)
|
||||
```
|
||||
Do this after every navigation and after every significant interaction. Silent JS errors are high-value findings.
|
||||
|
||||
4. **Take an annotated screenshot** to visually assess the page and identify interactive elements:
|
||||
```
|
||||
browser_vision(question="Describe the page layout, identify any visual issues, broken elements, or accessibility concerns", annotate=true)
|
||||
```
|
||||
The `annotate=true` flag overlays numbered `[N]` labels on interactive elements. Each `[N]` maps to ref `@eN` for subsequent browser commands.
|
||||
|
||||
5. **Test interactive elements** systematically:
|
||||
- Click buttons and links: `browser_click(ref="@eN")`
|
||||
- Fill forms: `browser_type(ref="@eN", text="test input")`
|
||||
- Test keyboard navigation: `browser_press(key="Tab")`, `browser_press(key="Enter")`
|
||||
- Scroll through content: `browser_scroll(direction="down")`
|
||||
- Test form validation with invalid inputs
|
||||
- Test empty submissions
|
||||
|
||||
6. **After each interaction**, check for:
|
||||
- Console errors: `browser_console()`
|
||||
- Visual changes: `browser_vision(question="What changed after the interaction?")`
|
||||
- Expected vs actual behavior
|
||||
|
||||
### Phase 3: Collect Evidence
|
||||
|
||||
For every issue found:
|
||||
|
||||
1. **Take a screenshot** showing the issue:
|
||||
```
|
||||
browser_vision(question="Capture and describe the issue visible on this page", annotate=false)
|
||||
```
|
||||
Save the `screenshot_path` from the response — you will reference it in the report.
|
||||
|
||||
2. **Record the details**:
|
||||
- URL where the issue occurs
|
||||
- Steps to reproduce
|
||||
- Expected behavior
|
||||
- Actual behavior
|
||||
- Console errors (if any)
|
||||
- Screenshot path
|
||||
|
||||
3. **Classify the issue** using the issue taxonomy (see `references/issue-taxonomy.md`):
|
||||
- Severity: Critical / High / Medium / Low
|
||||
- Category: Functional / Visual / Accessibility / Console / UX / Content
|
||||
|
||||
### Phase 4: Categorize
|
||||
|
||||
1. Review all collected issues.
|
||||
2. De-duplicate — merge issues that are the same bug manifesting in different places.
|
||||
3. Assign final severity and category to each issue.
|
||||
4. Sort by severity (Critical first, then High, Medium, Low).
|
||||
5. Count issues by severity and category for the executive summary.
|
||||
|
||||
### Phase 5: Report
|
||||
|
||||
Generate the final report using the template at `templates/dogfood-report-template.md`.
|
||||
|
||||
The report must include:
|
||||
1. **Executive summary** with total issue count, breakdown by severity, and testing scope
|
||||
2. **Per-issue sections** with:
|
||||
- Issue number and title
|
||||
- Severity and category badges
|
||||
- URL where observed
|
||||
- Description of the issue
|
||||
- Steps to reproduce
|
||||
- Expected vs actual behavior
|
||||
- Screenshot references (use `MEDIA:<screenshot_path>` for inline images)
|
||||
- Console errors if relevant
|
||||
3. **Summary table** of all issues
|
||||
4. **Testing notes** — what was tested, what was not, any blockers
|
||||
|
||||
Save the report to `{output_dir}/report.md`.
|
||||
|
||||
## Tools Reference
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| `browser_navigate` | Go to a URL |
|
||||
| `browser_snapshot` | Get DOM text snapshot (accessibility tree) |
|
||||
| `browser_click` | Click an element by ref (`@eN`) or text |
|
||||
| `browser_type` | Type into an input field |
|
||||
| `browser_scroll` | Scroll up/down on the page |
|
||||
| `browser_back` | Go back in browser history |
|
||||
| `browser_press` | Press a keyboard key |
|
||||
| `browser_vision` | Screenshot + AI analysis; use `annotate=true` for element labels |
|
||||
| `browser_console` | Get JS console output and errors |
|
||||
|
||||
## Tips
|
||||
|
||||
- **Always check `browser_console()` after navigating and after significant interactions.** Silent JS errors are among the most valuable findings.
|
||||
- **Use `annotate=true` with `browser_vision`** when you need to reason about interactive element positions or when the snapshot refs are unclear.
|
||||
- **Test with both valid and invalid inputs** — form validation bugs are common.
|
||||
- **Scroll through long pages** — content below the fold may have rendering issues.
|
||||
- **Test navigation flows** — click through multi-step processes end-to-end.
|
||||
- **Check responsive behavior** by noting any layout issues visible in screenshots.
|
||||
- **Don't forget edge cases**: empty states, very long text, special characters, rapid clicking.
|
||||
- When reporting screenshots to the user, include `MEDIA:<screenshot_path>` so they can see the evidence inline.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user