Compare commits
70 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 18e6cd9938 | |||
| d5a89283b7 | |||
| 633f74504f | |||
| 27936ee02d | |||
| 3aa86717b6 | |||
| 492c4c6573 | |||
| 3824b03237 | |||
| 42b917c92c | |||
| 7ccfb97fee | |||
| 7a6128cc4f | |||
| 4b28140912 | |||
| 653b5ec128 | |||
| 164e33aa46 | |||
| cdfbd89ea5 | |||
| 730347e38f | |||
| 628ca99d9b | |||
| 460a8ce5d9 | |||
| aa53fb661a | |||
| 8402ba150e | |||
| 512c610058 | |||
| b479205396 | |||
| 60f2415a4a | |||
| 082acc75b0 | |||
| 4424a0e0f7 | |||
| 98d75dea5a | |||
| 9b55365f6f | |||
| a0b62e0c5a | |||
| ac0325c257 | |||
| 817633bc5d | |||
| 9692ce2072 | |||
| 008860a23f | |||
| 0046d170dc | |||
| 8ad29a938a | |||
| a59a98b180 | |||
| 500774e30e | |||
| c4ad2c33f4 | |||
| 75b460bc94 | |||
| a9033c9220 | |||
| ea3c5a14c3 | |||
| ec671c4154 | |||
| df3c9593f8 | |||
| 8ed599dc05 | |||
| 920ebd8303 | |||
| bb00b783fb | |||
| 5e92b67807 | |||
| ee1a07f9e9 | |||
| 65f648ee84 | |||
| 64a497bfa9 | |||
| 90a3e73daf | |||
| 2e6699b319 | |||
| 21f503c23c | |||
| a32d07529c | |||
| 3ff3dfb5ac | |||
| 8258f4dcb7 | |||
| 9f1b1977bc | |||
| e3921e7ca4 | |||
| 7d586ddb42 | |||
| a131c134bc | |||
| 55be532369 | |||
| 8c5d3a99d6 | |||
| af3d5150c1 | |||
| 4a2ee6c162 | |||
| bda2dbc29e | |||
| 943465235e | |||
| cfc8befe65 | |||
| 3e68809fe0 | |||
| a0fe73bada | |||
| 7c63c24613 | |||
| c5781d50c7 | |||
| 235bfb192b |
+6
-2
@@ -30,18 +30,22 @@ WORKDIR /opt/hermes
|
||||
# unless the lockfiles themselves change.
|
||||
COPY package.json package-lock.json ./
|
||||
COPY web/package.json web/package-lock.json web/
|
||||
COPY ui-tui/package.json ui-tui/package-lock.json ui-tui/
|
||||
COPY ui-tui/packages/hermes-ink/package.json ui-tui/packages/hermes-ink/package-lock.json ui-tui/packages/hermes-ink/
|
||||
|
||||
RUN npm install --prefer-offline --no-audit && \
|
||||
npx playwright install --with-deps chromium --only-shell && \
|
||||
(cd web && npm install --prefer-offline --no-audit) && \
|
||||
(cd ui-tui && npm install --prefer-offline --no-audit) && \
|
||||
npm cache clean --force
|
||||
|
||||
# ---------- Source code ----------
|
||||
# .dockerignore excludes node_modules, so the installs above survive.
|
||||
COPY --chown=hermes:hermes . .
|
||||
|
||||
# Build web dashboard (Vite outputs to hermes_cli/web_dist/)
|
||||
RUN cd web && npm run build
|
||||
# Build browser dashboard and terminal UI assets.
|
||||
RUN cd web && npm run build && \
|
||||
cd ../ui-tui && npm run build
|
||||
|
||||
# ---------- Permissions ----------
|
||||
# Make install dir world-readable so any HERMES_UID can read it at runtime.
|
||||
|
||||
+65
-31
@@ -1617,8 +1617,14 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
|
||||
# below — never look up auth env vars ad-hoc.
|
||||
|
||||
|
||||
def _to_async_client(sync_client, model: str):
|
||||
"""Convert a sync client to its async counterpart, preserving Codex routing."""
|
||||
def _to_async_client(sync_client, model: str, is_vision: bool = False):
|
||||
"""Convert a sync client to its async counterpart, preserving Codex routing.
|
||||
|
||||
When ``is_vision=True`` and the underlying base URL is Copilot, the
|
||||
resulting async client carries the ``Copilot-Vision-Request: true``
|
||||
header so the request is routed to Copilot's vision-capable
|
||||
infrastructure (otherwise vision payloads silently time out).
|
||||
"""
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
if isinstance(sync_client, CodexAuxiliaryClient):
|
||||
@@ -1647,9 +1653,11 @@ def _to_async_client(sync_client, model: str):
|
||||
if base_url_host_matches(sync_base_url, "openrouter.ai"):
|
||||
async_kwargs["default_headers"] = dict(_OR_HEADERS)
|
||||
elif base_url_host_matches(sync_base_url, "api.githubcopilot.com"):
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
from hermes_cli.copilot_auth import copilot_request_headers
|
||||
|
||||
async_kwargs["default_headers"] = copilot_default_headers()
|
||||
async_kwargs["default_headers"] = copilot_request_headers(
|
||||
is_agent_turn=True, is_vision=is_vision
|
||||
)
|
||||
elif base_url_host_matches(sync_base_url, "api.kimi.com"):
|
||||
async_kwargs["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
|
||||
return AsyncOpenAI(**async_kwargs), model
|
||||
@@ -1676,6 +1684,7 @@ def resolve_provider_client(
|
||||
explicit_api_key: str = None,
|
||||
api_mode: str = None,
|
||||
main_runtime: Optional[Dict[str, Any]] = None,
|
||||
is_vision: bool = False,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Central router: given a provider name and optional model, return a
|
||||
configured client with the correct auth, base URL, and API format.
|
||||
@@ -1759,7 +1768,7 @@ def resolve_provider_client(
|
||||
"auxiliary provider (using %r instead)", model, resolved)
|
||||
model = None
|
||||
final_model = model or resolved
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── OpenRouter ───────────────────────────────────────────────────
|
||||
@@ -1772,7 +1781,7 @@ def resolve_provider_client(
|
||||
)
|
||||
return None, None
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── Nous Portal (OAuth) ──────────────────────────────────────────
|
||||
@@ -1789,7 +1798,7 @@ def resolve_provider_client(
|
||||
"but Nous Portal not configured (run: hermes auth)")
|
||||
return None, None
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── OpenAI Codex (OAuth → Responses API) ─────────────────────────
|
||||
@@ -1816,7 +1825,7 @@ def resolve_provider_client(
|
||||
"but no Codex OAuth token found (run: hermes model)")
|
||||
return None, None
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
|
||||
@@ -1845,11 +1854,13 @@ def resolve_provider_client(
|
||||
if base_url_host_matches(custom_base, "api.kimi.com"):
|
||||
extra["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
|
||||
elif base_url_host_matches(custom_base, "api.githubcopilot.com"):
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
from hermes_cli.copilot_auth import copilot_request_headers
|
||||
extra["default_headers"] = copilot_request_headers(
|
||||
is_agent_turn=True, is_vision=is_vision
|
||||
)
|
||||
client = OpenAI(api_key=custom_key, base_url=_clean_base, **extra)
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
# Try custom first, then codex, then API-key providers
|
||||
for try_fn in (_try_custom_endpoint, _try_codex,
|
||||
@@ -1859,7 +1870,7 @@ def resolve_provider_client(
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
_cbase = str(getattr(client, "base_url", "") or "")
|
||||
client = _wrap_if_needed(client, final_model, _cbase)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
logger.warning("resolve_provider_client: custom/main requested "
|
||||
"but no endpoint credentials found")
|
||||
@@ -1904,7 +1915,7 @@ def resolve_provider_client(
|
||||
provider,
|
||||
)
|
||||
client = OpenAI(api_key=custom_key, base_url=_clean_base2, **_extra2)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
sync_anthropic = AnthropicAuxiliaryClient(
|
||||
real_client, final_model, custom_key, custom_base, is_oauth=False,
|
||||
@@ -1923,7 +1934,7 @@ def resolve_provider_client(
|
||||
client = CodexAuxiliaryClient(client, final_model)
|
||||
else:
|
||||
client = _wrap_if_needed(client, final_model, custom_base)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
logger.warning(
|
||||
"resolve_provider_client: named custom provider %r has no base_url",
|
||||
@@ -1955,7 +1966,7 @@ def resolve_provider_client(
|
||||
logger.warning("resolve_provider_client: anthropic requested but no Anthropic credentials found")
|
||||
return None, None
|
||||
final_model = _normalize_resolved_model(model or default_model, provider)
|
||||
return (_to_async_client(client, final_model) if async_mode else (client, final_model))
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode else (client, final_model))
|
||||
|
||||
creds = resolve_api_key_provider_credentials(provider)
|
||||
api_key = str(creds.get("api_key", "")).strip()
|
||||
@@ -1981,7 +1992,7 @@ def resolve_provider_client(
|
||||
if is_native_gemini_base_url(base_url):
|
||||
client = GeminiNativeClient(api_key=api_key, base_url=base_url)
|
||||
logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# Provider-specific headers
|
||||
@@ -1989,9 +2000,11 @@ def resolve_provider_client(
|
||||
if base_url_host_matches(base_url, "api.kimi.com"):
|
||||
headers["User-Agent"] = "claude-code/0.1.0"
|
||||
elif base_url_host_matches(base_url, "api.githubcopilot.com"):
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
from hermes_cli.copilot_auth import copilot_request_headers
|
||||
|
||||
headers.update(copilot_default_headers())
|
||||
headers.update(copilot_request_headers(
|
||||
is_agent_turn=True, is_vision=is_vision
|
||||
))
|
||||
client = OpenAI(api_key=api_key, base_url=base_url,
|
||||
**({"default_headers": headers} if headers else {}))
|
||||
|
||||
@@ -2017,7 +2030,7 @@ def resolve_provider_client(
|
||||
client = _wrap_if_needed(client, final_model, base_url)
|
||||
|
||||
logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
if pconfig.auth_type == "external_process":
|
||||
@@ -2049,7 +2062,7 @@ def resolve_provider_client(
|
||||
args=args,
|
||||
)
|
||||
logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
logger.warning("resolve_provider_client: external-process provider %s not "
|
||||
"directly supported", provider)
|
||||
@@ -2085,7 +2098,7 @@ def resolve_provider_client(
|
||||
base_url=f"https://bedrock-runtime.{region}.amazonaws.com",
|
||||
)
|
||||
logger.debug("resolve_provider_client: bedrock (%s, %s)", final_model, region)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
|
||||
@@ -2160,8 +2173,13 @@ def _normalize_vision_provider(provider: Optional[str]) -> str:
|
||||
return _normalize_aux_provider(provider)
|
||||
|
||||
|
||||
def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Optional[str]]:
|
||||
def _resolve_strict_vision_backend(
|
||||
provider: str,
|
||||
model: Optional[str] = None,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
provider = _normalize_vision_provider(provider)
|
||||
if provider == "copilot":
|
||||
return resolve_provider_client("copilot", model, is_vision=True)
|
||||
if provider == "openrouter":
|
||||
return _try_openrouter()
|
||||
if provider == "nous":
|
||||
@@ -2229,7 +2247,7 @@ def resolve_vision_provider_client(
|
||||
return resolved_provider, None, None
|
||||
final_model = resolved_model or default_model
|
||||
if async_mode:
|
||||
async_client, async_model = _to_async_client(sync_client, final_model)
|
||||
async_client, async_model = _to_async_client(sync_client, final_model, is_vision=True)
|
||||
return resolved_provider, async_client, async_model
|
||||
return resolved_provider, sync_client, final_model
|
||||
|
||||
@@ -2261,8 +2279,11 @@ def resolve_vision_provider_client(
|
||||
main_provider = _read_main_provider()
|
||||
main_model = _read_main_model()
|
||||
if main_provider and main_provider not in ("auto", ""):
|
||||
vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
|
||||
if main_provider == "nous":
|
||||
sync_client, default_model = _resolve_strict_vision_backend(main_provider)
|
||||
sync_client, default_model = _resolve_strict_vision_backend(
|
||||
main_provider, vision_model
|
||||
)
|
||||
if sync_client is not None:
|
||||
logger.info(
|
||||
"Vision auto-detect: using main provider %s (%s)",
|
||||
@@ -2270,10 +2291,10 @@ def resolve_vision_provider_client(
|
||||
)
|
||||
return _finalize(main_provider, sync_client, default_model)
|
||||
else:
|
||||
vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
|
||||
rpc_client, rpc_model = resolve_provider_client(
|
||||
main_provider, vision_model,
|
||||
api_mode=resolved_api_mode)
|
||||
api_mode=resolved_api_mode,
|
||||
is_vision=True)
|
||||
if rpc_client is not None:
|
||||
logger.info(
|
||||
"Vision auto-detect: using main provider %s (%s)",
|
||||
@@ -2295,11 +2316,14 @@ def resolve_vision_provider_client(
|
||||
return None, None, None
|
||||
|
||||
if requested in _VISION_AUTO_PROVIDER_ORDER:
|
||||
sync_client, default_model = _resolve_strict_vision_backend(requested)
|
||||
sync_client, default_model = _resolve_strict_vision_backend(
|
||||
requested, resolved_model
|
||||
)
|
||||
return _finalize(requested, sync_client, default_model)
|
||||
|
||||
client, final_model = _get_cached_client(requested, resolved_model, async_mode,
|
||||
api_mode=resolved_api_mode)
|
||||
api_mode=resolved_api_mode,
|
||||
is_vision=True)
|
||||
if client is None:
|
||||
return requested, None, None
|
||||
return requested, client, final_model
|
||||
@@ -2363,10 +2387,11 @@ def _client_cache_key(
|
||||
api_key: Optional[str] = None,
|
||||
api_mode: Optional[str] = None,
|
||||
main_runtime: Optional[Dict[str, Any]] = None,
|
||||
is_vision: bool = False,
|
||||
) -> tuple:
|
||||
runtime = _normalize_main_runtime(main_runtime)
|
||||
runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else ()
|
||||
return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key)
|
||||
return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key, is_vision)
|
||||
|
||||
|
||||
def _store_cached_client(cache_key: tuple, client: Any, default_model: Optional[str], *, bound_loop: Any = None) -> None:
|
||||
@@ -2392,6 +2417,7 @@ def _refresh_nous_auxiliary_client(
|
||||
api_key: Optional[str] = None,
|
||||
api_mode: Optional[str] = None,
|
||||
main_runtime: Optional[Dict[str, Any]] = None,
|
||||
is_vision: bool = False,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Refresh Nous runtime creds, rebuild the client, and replace the cache entry."""
|
||||
runtime = _resolve_nous_runtime_api(force_refresh=True)
|
||||
@@ -2409,7 +2435,7 @@ def _refresh_nous_auxiliary_client(
|
||||
current_loop = _aio.get_event_loop()
|
||||
except RuntimeError:
|
||||
pass
|
||||
client, final_model = _to_async_client(sync_client, final_model or "")
|
||||
client, final_model = _to_async_client(sync_client, final_model or "", is_vision=is_vision)
|
||||
else:
|
||||
client = sync_client
|
||||
|
||||
@@ -2420,6 +2446,7 @@ def _refresh_nous_auxiliary_client(
|
||||
api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
main_runtime=main_runtime,
|
||||
is_vision=is_vision,
|
||||
)
|
||||
_store_cached_client(cache_key, client, final_model, bound_loop=current_loop)
|
||||
return client, final_model
|
||||
@@ -2549,6 +2576,7 @@ def _get_cached_client(
|
||||
api_key: str = None,
|
||||
api_mode: str = None,
|
||||
main_runtime: Optional[Dict[str, Any]] = None,
|
||||
is_vision: bool = False,
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Get or create a cached client for the given provider.
|
||||
|
||||
@@ -2585,6 +2613,7 @@ def _get_cached_client(
|
||||
api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
main_runtime=main_runtime,
|
||||
is_vision=is_vision,
|
||||
)
|
||||
with _client_cache_lock:
|
||||
if cache_key in _client_cache:
|
||||
@@ -2616,6 +2645,7 @@ def _get_cached_client(
|
||||
explicit_api_key=api_key,
|
||||
api_mode=api_mode,
|
||||
main_runtime=runtime,
|
||||
is_vision=is_vision,
|
||||
)
|
||||
if client is not None:
|
||||
# For async clients, remember which loop they were created on so we
|
||||
@@ -3079,6 +3109,7 @@ def call_llm(
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
main_runtime=main_runtime,
|
||||
is_vision=(task == "vision"),
|
||||
)
|
||||
if refreshed_client is not None:
|
||||
logger.info("Auxiliary %s: refreshed Nous runtime credentials after 401, retrying",
|
||||
@@ -3369,6 +3400,7 @@ async def async_call_llm(
|
||||
base_url=resolved_base_url,
|
||||
api_key=resolved_api_key,
|
||||
api_mode=resolved_api_mode,
|
||||
is_vision=(task == "vision"),
|
||||
)
|
||||
if refreshed_client is not None:
|
||||
logger.info("Auxiliary %s (async): refreshed Nous runtime credentials after 401, retrying",
|
||||
@@ -3437,7 +3469,9 @@ async def async_call_llm(
|
||||
extra_body=effective_extra_body,
|
||||
base_url=str(getattr(fb_client, "base_url", "") or ""))
|
||||
# Convert sync fallback client to async
|
||||
async_fb, async_fb_model = _to_async_client(fb_client, fb_model or "")
|
||||
async_fb, async_fb_model = _to_async_client(
|
||||
fb_client, fb_model or "", is_vision=(task == "vision")
|
||||
)
|
||||
if async_fb_model and async_fb_model != fb_kwargs.get("model"):
|
||||
fb_kwargs["model"] = async_fb_model
|
||||
return _validate_llm_response(
|
||||
|
||||
@@ -61,9 +61,52 @@ _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||
|
||||
# Chars per token rough estimate
|
||||
_CHARS_PER_TOKEN = 4
|
||||
# Flat token cost per attached image part. Real cost varies by provider and
|
||||
# dimensions (Anthropic ≈ width×height/750, GPT-4o up to ~1700 for
|
||||
# high-detail 2048×2048, Gemini 258/tile), but 1600 is a realistic ceiling
|
||||
# that keeps compression budgeting honest for multi-image conversations.
|
||||
# Matches Claude Code's IMAGE_TOKEN_ESTIMATE constant.
|
||||
_IMAGE_TOKEN_ESTIMATE = 1600
|
||||
# Same figure expressed in the char-budget currency the rest of the
|
||||
# compressor speaks in. Used when accumulating message "content length"
|
||||
# for tail-cut decisions.
|
||||
_IMAGE_CHAR_EQUIVALENT = _IMAGE_TOKEN_ESTIMATE * _CHARS_PER_TOKEN
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS = 600
|
||||
|
||||
|
||||
def _content_length_for_budget(raw_content: Any) -> int:
|
||||
"""Return the effective char-length of a message's content for token budgeting.
|
||||
|
||||
Plain strings: ``len(content)``. Multimodal lists: sum of text-part
|
||||
``len(text)`` plus a flat ``_IMAGE_CHAR_EQUIVALENT`` per image part
|
||||
(``image_url`` / ``input_image`` / Anthropic-style ``image``). This
|
||||
keeps the compressor from treating a turn with 5 attached images as
|
||||
near-zero tokens just because the text part is empty.
|
||||
"""
|
||||
if isinstance(raw_content, str):
|
||||
return len(raw_content)
|
||||
if not isinstance(raw_content, list):
|
||||
return len(str(raw_content or ""))
|
||||
|
||||
total = 0
|
||||
for p in raw_content:
|
||||
if isinstance(p, str):
|
||||
total += len(p)
|
||||
continue
|
||||
if not isinstance(p, dict):
|
||||
total += len(str(p))
|
||||
continue
|
||||
ptype = p.get("type")
|
||||
if ptype in {"image_url", "input_image", "image"}:
|
||||
total += _IMAGE_CHAR_EQUIVALENT
|
||||
else:
|
||||
# text / input_text / tool_result-with-text / anything else with
|
||||
# a text field. Ignore the raw base64 payload inside image_url
|
||||
# dicts — dimensions don't matter, only whether it's an image.
|
||||
total += len(p.get("text", "") or "")
|
||||
return total
|
||||
|
||||
|
||||
def _content_text_for_contains(content: Any) -> str:
|
||||
"""Return a best-effort text view of message content.
|
||||
|
||||
@@ -484,7 +527,7 @@ class ContextCompressor(ContextEngine):
|
||||
for i in range(len(result) - 1, -1, -1):
|
||||
msg = result[i]
|
||||
raw_content = msg.get("content") or ""
|
||||
content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content)
|
||||
content_len = _content_length_for_budget(raw_content)
|
||||
msg_tokens = content_len // _CHARS_PER_TOKEN + 10
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
if isinstance(tc, dict):
|
||||
@@ -1082,8 +1125,9 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
|
||||
for i in range(n - 1, head_end - 1, -1):
|
||||
msg = messages[i]
|
||||
content = msg.get("content") or ""
|
||||
msg_tokens = len(content) // _CHARS_PER_TOKEN + 10 # +10 for role/metadata
|
||||
raw_content = msg.get("content") or ""
|
||||
content_len = _content_length_for_budget(raw_content)
|
||||
msg_tokens = content_len // _CHARS_PER_TOKEN + 10 # +10 for role/metadata
|
||||
# Include tool call arguments in estimate
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
if isinstance(tc, dict):
|
||||
|
||||
@@ -42,6 +42,7 @@ class FailoverReason(enum.Enum):
|
||||
# Context / payload
|
||||
context_overflow = "context_overflow" # Context too large — compress, not failover
|
||||
payload_too_large = "payload_too_large" # 413 — compress payload
|
||||
image_too_large = "image_too_large" # Native image part exceeds provider's per-image limit — shrink and retry
|
||||
|
||||
# Model
|
||||
model_not_found = "model_not_found" # 404 or invalid model — fallback to different model
|
||||
@@ -147,6 +148,20 @@ _PAYLOAD_TOO_LARGE_PATTERNS = [
|
||||
"error code: 413",
|
||||
]
|
||||
|
||||
# Image-size patterns. Matched against 400 bodies (not 413) because most
|
||||
# providers return a 400 with a specific image-too-big message before the
|
||||
# whole request hits the 413 size limit. Anthropic's wording is the most
|
||||
# important here (hard 5 MB per image, returned as
|
||||
# "messages.N.content.K.image.source.base64: image exceeds 5 MB maximum").
|
||||
_IMAGE_TOO_LARGE_PATTERNS = [
|
||||
"image exceeds", # Anthropic: "image exceeds 5 MB maximum"
|
||||
"image too large", # generic
|
||||
"image_too_large", # error_code variant
|
||||
"image size exceeds", # variant
|
||||
# "request_too_large" on a request known to contain an image → image is
|
||||
# the likely culprit; we still try the shrink path before giving up.
|
||||
]
|
||||
|
||||
# Context overflow patterns
|
||||
_CONTEXT_OVERFLOW_PATTERNS = [
|
||||
"context length",
|
||||
@@ -671,6 +686,15 @@ def _classify_400(
|
||||
) -> ClassifiedError:
|
||||
"""Classify 400 Bad Request — context overflow, format error, or generic."""
|
||||
|
||||
# Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
|
||||
# Must be checked BEFORE context_overflow because messages can trip both
|
||||
# patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
|
||||
if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.image_too_large,
|
||||
retryable=True,
|
||||
)
|
||||
|
||||
# Context overflow from 400
|
||||
if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS):
|
||||
return result_fn(
|
||||
@@ -798,6 +822,13 @@ def _classify_by_message(
|
||||
should_compress=True,
|
||||
)
|
||||
|
||||
# Image-too-large patterns (from message text when no status_code)
|
||||
if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.image_too_large,
|
||||
retryable=True,
|
||||
)
|
||||
|
||||
# Usage-limit patterns need the same disambiguation as 402: some providers
|
||||
# surface "usage limit" errors without an HTTP status code. A transient
|
||||
# signal ("try again", "resets at", …) means it's a periodic quota, not
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
"""Routing helpers for inbound user-attached images.
|
||||
|
||||
Two modes:
|
||||
|
||||
native — attach images as OpenAI-style ``image_url`` content parts on the
|
||||
user turn. Provider adapters (Anthropic, Gemini, Bedrock, Codex,
|
||||
OpenAI chat.completions) already translate these into their
|
||||
vendor-specific multimodal formats.
|
||||
|
||||
text — run ``vision_analyze`` on each image up-front and prepend the
|
||||
description to the user's text. The model never sees the pixels;
|
||||
it only sees a lossy text summary. This is the pre-existing
|
||||
behaviour and still the right choice for non-vision models.
|
||||
|
||||
The decision is made once per message turn by :func:`decide_image_input_mode`.
|
||||
It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
|
||||
| ``text``, default ``auto``) and the active model's capability metadata.
|
||||
|
||||
In ``auto`` mode:
|
||||
- If the user has explicitly configured ``auxiliary.vision.provider``
|
||||
(i.e. not ``auto`` and not empty), we assume they want the text pipeline
|
||||
regardless of the main model — they've opted in to a specific vision
|
||||
backend for a reason (cost, quality, local-only, etc.).
|
||||
- Otherwise, if the active model reports ``supports_vision=True`` in its
|
||||
models.dev metadata, we attach natively.
|
||||
- Otherwise (non-vision model, no explicit override), we fall back to text.
|
||||
|
||||
This keeps ``vision_analyze`` surfaced as a tool in every session — skills
|
||||
and agent flows that chain it (browser screenshots, deeper inspection of
|
||||
URL-referenced images, style-gating loops) keep working. The routing only
|
||||
affects *how user-attached images on the current turn* are presented to the
|
||||
main model.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_VALID_MODES = frozenset({"auto", "native", "text"})
|
||||
|
||||
|
||||
def _coerce_mode(raw: Any) -> str:
|
||||
"""Normalize a config value into one of the valid modes."""
|
||||
if not isinstance(raw, str):
|
||||
return "auto"
|
||||
val = raw.strip().lower()
|
||||
if val in _VALID_MODES:
|
||||
return val
|
||||
return "auto"
|
||||
|
||||
|
||||
def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
|
||||
"""True when the user configured a specific auxiliary vision backend.
|
||||
|
||||
An explicit override means the user *wants* the text pipeline (they're
|
||||
paying for a dedicated vision model), so we don't silently bypass it.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
return False
|
||||
aux = cfg.get("auxiliary") or {}
|
||||
if not isinstance(aux, dict):
|
||||
return False
|
||||
vision = aux.get("vision") or {}
|
||||
if not isinstance(vision, dict):
|
||||
return False
|
||||
|
||||
provider = str(vision.get("provider") or "").strip().lower()
|
||||
model = str(vision.get("model") or "").strip()
|
||||
base_url = str(vision.get("base_url") or "").strip()
|
||||
|
||||
# "auto" / "" / blank = not explicit
|
||||
if provider in ("", "auto") and not model and not base_url:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
|
||||
"""Return True/False if we can resolve caps, None if unknown."""
|
||||
if not provider or not model:
|
||||
return None
|
||||
try:
|
||||
from agent.models_dev import get_model_capabilities
|
||||
caps = get_model_capabilities(provider, model)
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
|
||||
return None
|
||||
if caps is None:
|
||||
return None
|
||||
return bool(caps.supports_vision)
|
||||
|
||||
|
||||
def decide_image_input_mode(
|
||||
provider: str,
|
||||
model: str,
|
||||
cfg: Optional[Dict[str, Any]],
|
||||
) -> str:
|
||||
"""Return ``"native"`` or ``"text"`` for the given turn.
|
||||
|
||||
Args:
|
||||
provider: active inference provider ID (e.g. ``"anthropic"``, ``"openrouter"``).
|
||||
model: active model slug as it would be sent to the provider.
|
||||
cfg: loaded config.yaml dict, or None. When None, behaves as auto.
|
||||
"""
|
||||
mode_cfg = "auto"
|
||||
if isinstance(cfg, dict):
|
||||
agent_cfg = cfg.get("agent") or {}
|
||||
if isinstance(agent_cfg, dict):
|
||||
mode_cfg = _coerce_mode(agent_cfg.get("image_input_mode"))
|
||||
|
||||
if mode_cfg == "native":
|
||||
return "native"
|
||||
if mode_cfg == "text":
|
||||
return "text"
|
||||
|
||||
# auto
|
||||
if _explicit_aux_vision_override(cfg):
|
||||
return "text"
|
||||
|
||||
supports = _lookup_supports_vision(provider, model)
|
||||
if supports is True:
|
||||
return "native"
|
||||
return "text"
|
||||
|
||||
|
||||
# Image size handling is REACTIVE rather than proactive: we attempt native
|
||||
# attachment at full size regardless of provider, and rely on
|
||||
# ``run_agent._try_shrink_image_parts_in_messages`` to shrink + retry if
|
||||
# the provider rejects the request (e.g. Anthropic's hard 5 MB per-image
|
||||
# ceiling returned as HTTP 400 "image exceeds 5 MB maximum").
|
||||
#
|
||||
# Why reactive: our knowledge of provider ceilings is partial and evolving
|
||||
# (OpenAI accepts 49 MB+, Anthropic 5 MB, Gemini 100 MB, others unknown).
|
||||
# A proactive per-provider table would be stale the moment a provider raises
|
||||
# or lowers its limit, and silently degrading quality for users on providers
|
||||
# that would have accepted the full image is the worse failure mode.
|
||||
# The shrink-on-reject path loses 1 API call + maybe 1s of Pillow work when
|
||||
# it fires, which is cheaper than permanent quality loss.
|
||||
|
||||
|
||||
def _guess_mime(path: Path) -> str:
|
||||
mime, _ = mimetypes.guess_type(str(path))
|
||||
if mime and mime.startswith("image/"):
|
||||
return mime
|
||||
# mimetypes on some Linux distros mis-maps .jpg; default to jpeg when
|
||||
# the suffix looks imagey.
|
||||
suffix = path.suffix.lower()
|
||||
return {
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".webp": "image/webp",
|
||||
".bmp": "image/bmp",
|
||||
}.get(suffix, "image/jpeg")
|
||||
|
||||
|
||||
def _file_to_data_url(path: Path) -> Optional[str]:
|
||||
"""Encode a local image as a base64 data URL at its native size.
|
||||
|
||||
Size limits are NOT enforced here — the agent retry loop
|
||||
(``run_agent._try_shrink_image_parts_in_messages``) shrinks on the
|
||||
provider's first rejection. Keeping this simple means providers that
|
||||
accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
|
||||
quality tax just because one other provider is stricter.
|
||||
|
||||
Returns None only if the file can't be read (missing, permission
|
||||
denied, etc.); the caller reports those paths in ``skipped``.
|
||||
"""
|
||||
try:
|
||||
raw = path.read_bytes()
|
||||
except Exception as exc:
|
||||
logger.warning("image_routing: failed to read %s — %s", path, exc)
|
||||
return None
|
||||
mime = _guess_mime(path)
|
||||
b64 = base64.b64encode(raw).decode("ascii")
|
||||
return f"data:{mime};base64,{b64}"
|
||||
|
||||
|
||||
def build_native_content_parts(
|
||||
user_text: str,
|
||||
image_paths: List[str],
|
||||
) -> Tuple[List[Dict[str, Any]], List[str]]:
|
||||
"""Build an OpenAI-style ``content`` list for a user turn.
|
||||
|
||||
Shape:
|
||||
[{"type": "text", "text": "..."},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
|
||||
...]
|
||||
|
||||
Images are attached at their native size. If a provider rejects the
|
||||
request because an image is too large (e.g. Anthropic's 5 MB per-image
|
||||
ceiling), the agent's retry loop transparently shrinks and retries
|
||||
once — see ``run_agent._try_shrink_image_parts_in_messages``.
|
||||
|
||||
Returns (content_parts, skipped_paths). Skipped paths are files that
|
||||
couldn't be read from disk.
|
||||
"""
|
||||
parts: List[Dict[str, Any]] = []
|
||||
skipped: List[str] = []
|
||||
|
||||
text = (user_text or "").strip()
|
||||
if text:
|
||||
parts.append({"type": "text", "text": text})
|
||||
|
||||
for raw_path in image_paths:
|
||||
p = Path(raw_path)
|
||||
if not p.exists() or not p.is_file():
|
||||
skipped.append(str(raw_path))
|
||||
continue
|
||||
data_url = _file_to_data_url(p)
|
||||
if not data_url:
|
||||
skipped.append(str(raw_path))
|
||||
continue
|
||||
parts.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": data_url},
|
||||
})
|
||||
|
||||
# If the text was empty, add a neutral prompt so the turn isn't just images.
|
||||
if not text and any(p.get("type") == "image_url" for p in parts):
|
||||
parts.insert(0, {"type": "text", "text": "What do you see in this image?"})
|
||||
|
||||
return parts, skipped
|
||||
|
||||
|
||||
__all__ = [
|
||||
"decide_image_input_mode",
|
||||
"build_native_content_parts",
|
||||
]
|
||||
@@ -141,6 +141,12 @@ DEFAULT_AGENT_IDENTITY = (
|
||||
"Be targeted and efficient in your exploration and investigations."
|
||||
)
|
||||
|
||||
HERMES_AGENT_HELP_GUIDANCE = (
|
||||
"If the user asks about configuring, setting up, or using Hermes Agent "
|
||||
"itself, load the `hermes-agent` skill with skill_view(name='hermes-agent') "
|
||||
"before answering. Docs: https://hermes-agent.nousresearch.com/docs"
|
||||
)
|
||||
|
||||
MEMORY_GUIDANCE = (
|
||||
"You have persistent memory across sessions. Save durable facts using the memory "
|
||||
"tool: user preferences, environment details, tool quirks, and stable conventions. "
|
||||
|
||||
@@ -6,12 +6,18 @@ adds latency to the user-facing reply.
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
from agent.auxiliary_client import call_llm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Callback signature: (task_name, exception) -> None. Used to surface
|
||||
# auxiliary failures to the user through AIAgent._emit_auxiliary_failure
|
||||
# so silent-drops (e.g. OpenRouter 402 exhausting the fallback chain)
|
||||
# become visible instead of piling up as NULL session titles.
|
||||
FailureCallback = Callable[[str, BaseException], None]
|
||||
|
||||
_TITLE_PROMPT = (
|
||||
"Generate a short, descriptive title (3-7 words) for a conversation that starts with the "
|
||||
"following exchange. The title should capture the main topic or intent. "
|
||||
@@ -19,11 +25,21 @@ _TITLE_PROMPT = (
|
||||
)
|
||||
|
||||
|
||||
def generate_title(user_message: str, assistant_response: str, timeout: float = 30.0) -> Optional[str]:
|
||||
def generate_title(
|
||||
user_message: str,
|
||||
assistant_response: str,
|
||||
timeout: float = 30.0,
|
||||
failure_callback: Optional[FailureCallback] = None,
|
||||
) -> Optional[str]:
|
||||
"""Generate a session title from the first exchange.
|
||||
|
||||
Uses the auxiliary LLM client (cheapest/fastest available model).
|
||||
Returns the title string or None on failure.
|
||||
|
||||
``failure_callback`` is invoked with ``(task, exception)`` when the
|
||||
auxiliary call raises — the caller typically wires this to
|
||||
``AIAgent._emit_auxiliary_failure`` so the user sees a warning instead
|
||||
of silently accumulating untitled sessions.
|
||||
"""
|
||||
# Truncate long messages to keep the request small
|
||||
user_snippet = user_message[:500] if user_message else ""
|
||||
@@ -52,7 +68,15 @@ def generate_title(user_message: str, assistant_response: str, timeout: float =
|
||||
title = title[:77] + "..."
|
||||
return title if title else None
|
||||
except Exception as e:
|
||||
logger.debug("Title generation failed: %s", e)
|
||||
# Log at WARNING so this shows up in agent.log without debug mode.
|
||||
# Full detail at debug level for operators who need the stack.
|
||||
logger.warning("Title generation failed: %s", e)
|
||||
logger.debug("Title generation traceback", exc_info=True)
|
||||
if failure_callback is not None:
|
||||
try:
|
||||
failure_callback("title generation", e)
|
||||
except Exception:
|
||||
logger.debug("Title generation failure_callback raised", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
@@ -61,6 +85,7 @@ def auto_title_session(
|
||||
session_id: str,
|
||||
user_message: str,
|
||||
assistant_response: str,
|
||||
failure_callback: Optional[FailureCallback] = None,
|
||||
) -> None:
|
||||
"""Generate and set a session title if one doesn't already exist.
|
||||
|
||||
@@ -81,7 +106,9 @@ def auto_title_session(
|
||||
except Exception:
|
||||
return
|
||||
|
||||
title = generate_title(user_message, assistant_response)
|
||||
title = generate_title(
|
||||
user_message, assistant_response, failure_callback=failure_callback
|
||||
)
|
||||
if not title:
|
||||
return
|
||||
|
||||
@@ -98,6 +125,7 @@ def maybe_auto_title(
|
||||
user_message: str,
|
||||
assistant_response: str,
|
||||
conversation_history: list,
|
||||
failure_callback: Optional[FailureCallback] = None,
|
||||
) -> None:
|
||||
"""Fire-and-forget title generation after the first exchange.
|
||||
|
||||
@@ -119,6 +147,7 @@ def maybe_auto_title(
|
||||
thread = threading.Thread(
|
||||
target=auto_title_session,
|
||||
args=(session_db, session_id, user_message, assistant_response),
|
||||
kwargs={"failure_callback": failure_callback},
|
||||
daemon=True,
|
||||
name="auto-title",
|
||||
)
|
||||
|
||||
@@ -15,6 +15,7 @@ Usage:
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import json
|
||||
@@ -758,9 +759,17 @@ def _run_cleanup():
|
||||
pass
|
||||
try:
|
||||
if _active_agent_ref and hasattr(_active_agent_ref, 'shutdown_memory_provider'):
|
||||
_active_agent_ref.shutdown_memory_provider(
|
||||
getattr(_active_agent_ref, 'conversation_history', None) or []
|
||||
)
|
||||
# Forward the agent's own transcript so memory providers'
|
||||
# ``on_session_end`` hooks see the real conversation instead of
|
||||
# an empty list (#15165). ``_session_messages`` is set on
|
||||
# ``AIAgent.__init__`` and refreshed every turn via
|
||||
# ``_persist_session``. Fall back to no-arg on test stubs /
|
||||
# partially-initialised agents where the attribute is missing.
|
||||
_session_msgs = getattr(_active_agent_ref, '_session_messages', None)
|
||||
if isinstance(_session_msgs, list):
|
||||
_active_agent_ref.shutdown_memory_provider(_session_msgs)
|
||||
else:
|
||||
_active_agent_ref.shutdown_memory_provider()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -1547,6 +1556,60 @@ def _should_auto_attach_clipboard_image_on_paste(pasted_text: str) -> bool:
|
||||
return not pasted_text.strip()
|
||||
|
||||
|
||||
def _strip_leaked_bracketed_paste_wrappers(text: str) -> str:
|
||||
"""Strip leaked bracketed-paste wrapper markers from user-visible text.
|
||||
|
||||
Defensive normalization for cases where terminal/prompt_toolkit parsing
|
||||
fails and bracketed-paste markers end up in the buffer as literal text.
|
||||
|
||||
We strip canonical wrappers unconditionally and also handle degraded
|
||||
visible forms like ``[200~`` / ``[201~`` and ``00~`` / ``01~`` when they
|
||||
look like wrapper boundaries, not arbitrary user content.
|
||||
"""
|
||||
if not text:
|
||||
return text
|
||||
|
||||
text = (
|
||||
text.replace("\x1b[200~", "")
|
||||
.replace("\x1b[201~", "")
|
||||
.replace("^[[200~", "")
|
||||
.replace("^[[201~", "")
|
||||
)
|
||||
text = re.sub(r"(^|[\s\n>:\]\)])\[200~", r"\1", text)
|
||||
text = re.sub(r"\[201~(?=$|[\s\n<\[\(\):;.,!?])", "", text)
|
||||
text = re.sub(r"(^|[\s\n>:\]\)])00~", r"\1", text)
|
||||
text = re.sub(r"01~(?=$|[\s\n<\[\(\):;.,!?])", "", text)
|
||||
return text
|
||||
|
||||
|
||||
# Cursor Position Report (CPR / DSR) response, format ``ESC[<row>;<col>R``.
|
||||
# prompt_toolkit's _on_resize() + renderer send ``ESC[6n`` queries to the
|
||||
# terminal; under resize storms or tab switches the terminal's reply can
|
||||
# race past the input parser and end up in the input buffer as literal
|
||||
# text (see issue #14692). Also matches the visible-form ``^[[<row>;<col>R``
|
||||
# that appears when the ESC byte was stripped by a prior filter.
|
||||
_DSR_CPR_ESC_RE = re.compile(r"\x1b\[\d+;\d+R")
|
||||
_DSR_CPR_VISIBLE_RE = re.compile(r"\^\[\[\d+;\d+R")
|
||||
|
||||
|
||||
def _strip_leaked_terminal_responses(text: str) -> str:
|
||||
"""Strip leaked terminal control-response sequences from user input.
|
||||
|
||||
Covers Cursor Position Report (CPR / DSR) responses — ``ESC[<row>;<col>R``
|
||||
and the visible ``^[[<row>;<col>R`` form. These are replies the terminal
|
||||
sends back to queries prompt_toolkit makes during ``_on_resize`` /
|
||||
``_request_absolute_cursor_position``. When the input parser drops one
|
||||
(resize storms, multiplexer focus changes, slow PTYs) the response
|
||||
lands in the input buffer as literal text and corrupts what the user
|
||||
typed.
|
||||
"""
|
||||
if not text:
|
||||
return text
|
||||
text = _DSR_CPR_ESC_RE.sub("", text)
|
||||
text = _DSR_CPR_VISIBLE_RE.sub("", text)
|
||||
return text
|
||||
|
||||
|
||||
def _collect_query_images(query: str | None, image_arg: str | None = None) -> tuple[str, list[Path]]:
|
||||
"""Collect local image attachments for single-query CLI flows."""
|
||||
message = query or ""
|
||||
@@ -2155,6 +2218,42 @@ class HermesCLI:
|
||||
self._last_invalidate = now
|
||||
self._app.invalidate()
|
||||
|
||||
def _force_full_redraw(self) -> None:
|
||||
"""Force a clean full-screen repaint of the prompt_toolkit UI.
|
||||
|
||||
Used to recover from terminal buffer drift caused by external
|
||||
redraws we can't detect — e.g. macOS cmux / tmux tab switches,
|
||||
``clear`` issued from a subshell, or SSH window restores. These
|
||||
wipe or repaint the terminal without firing SIGWINCH, so
|
||||
prompt_toolkit's tracked ``_cursor_pos`` no longer matches reality
|
||||
and the next incremental redraw stacks on top of stale content
|
||||
(ghost status bars, duplicated prompts).
|
||||
|
||||
Bound to Ctrl+L and exposed as the ``/redraw`` slash command,
|
||||
matching the standard terminal-UX convention (bash, zsh, fish,
|
||||
vim, htop).
|
||||
"""
|
||||
app = getattr(self, "_app", None)
|
||||
if not app:
|
||||
return
|
||||
try:
|
||||
renderer = app.renderer
|
||||
out = renderer.output
|
||||
out.reset_attributes()
|
||||
out.erase_screen()
|
||||
out.cursor_goto(0, 0)
|
||||
out.flush()
|
||||
# Drop prompt_toolkit's cached screen + cursor state so the
|
||||
# next _redraw() starts from a known (0, 0) origin and
|
||||
# re-renders every cell rather than diffing against stale.
|
||||
renderer.reset(leave_alternate_screen=False)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
app.invalidate()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _status_bar_context_style(self, percent_used: Optional[int]) -> str:
|
||||
if percent_used is None:
|
||||
return "class:status-bar-dim"
|
||||
@@ -5971,6 +6070,12 @@ class HermesCLI:
|
||||
self.show_toolsets()
|
||||
elif canonical == "config":
|
||||
self.show_config()
|
||||
elif canonical == "redraw":
|
||||
# Manual recovery for terminal buffer drift from multiplexer
|
||||
# tab switches, subshell ``clear``, SSH window restores, etc.
|
||||
# See issue #8688 (cmux). Ctrl+L is bound to the same helper.
|
||||
self._force_full_redraw()
|
||||
_cprint(f" {_DIM}✓ UI redrawn{_RST}")
|
||||
elif canonical == "clear":
|
||||
self.new_session(silent=True)
|
||||
# Clear terminal screen. Inside the TUI, Rich's console.clear()
|
||||
@@ -8336,13 +8441,62 @@ class HermesCLI:
|
||||
):
|
||||
return None
|
||||
|
||||
# Pre-process images through the vision tool (Gemini Flash) so the
|
||||
# main model receives text descriptions instead of raw base64 image
|
||||
# content — works with any model, not just vision-capable ones.
|
||||
# Route image attachments based on the active model's vision capability.
|
||||
# "native" → pass pixels as OpenAI-style content parts (adapters
|
||||
# translate for Anthropic/Gemini/Bedrock).
|
||||
# "text" → pre-analyze each image with vision_analyze and prepend the
|
||||
# description as text — works with non-vision models.
|
||||
# See agent/image_routing.py for the decision table.
|
||||
if images:
|
||||
message = self._preprocess_images_with_vision(
|
||||
message if isinstance(message, str) else "", images
|
||||
)
|
||||
try:
|
||||
from agent.image_routing import (
|
||||
build_native_content_parts,
|
||||
decide_image_input_mode,
|
||||
)
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
_img_mode = decide_image_input_mode(
|
||||
(self.provider or "").strip(),
|
||||
(self.model or "").strip(),
|
||||
load_config(),
|
||||
)
|
||||
except Exception as _img_exc:
|
||||
logging.debug("image_routing decision failed, defaulting to text: %s", _img_exc)
|
||||
_img_mode = "text"
|
||||
|
||||
if _img_mode == "native":
|
||||
try:
|
||||
_text_for_parts = message if isinstance(message, str) else ""
|
||||
_img_str_paths = [str(p) for p in images]
|
||||
_parts, _skipped = build_native_content_parts(
|
||||
_text_for_parts,
|
||||
_img_str_paths,
|
||||
)
|
||||
if _skipped:
|
||||
_cprint(
|
||||
f" {_DIM}⚠ skipped {len(_skipped)} unreadable image path(s){_RST}"
|
||||
)
|
||||
if any(p.get("type") == "image_url" for p in _parts):
|
||||
_img_names = ", ".join(Path(p).name for p in _img_str_paths)
|
||||
_cprint(
|
||||
f" {_DIM}📎 attaching {len(images)} image(s) natively "
|
||||
f"(model supports vision): {_img_names}{_RST}"
|
||||
)
|
||||
message = _parts
|
||||
else:
|
||||
# All images unreadable — fall back to text enrichment.
|
||||
message = self._preprocess_images_with_vision(
|
||||
message if isinstance(message, str) else "", images
|
||||
)
|
||||
except Exception as _img_exc:
|
||||
logging.warning("native image attach failed, falling back to text: %s", _img_exc)
|
||||
message = self._preprocess_images_with_vision(
|
||||
message if isinstance(message, str) else "", images
|
||||
)
|
||||
else:
|
||||
message = self._preprocess_images_with_vision(
|
||||
message if isinstance(message, str) else "", images
|
||||
)
|
||||
|
||||
# Expand @ context references (e.g. @file:main.py, @diff, @folder:src/)
|
||||
if isinstance(message, str) and "@" in message:
|
||||
@@ -8645,12 +8799,20 @@ class HermesCLI:
|
||||
if response and result and not result.get("failed") and not result.get("partial"):
|
||||
try:
|
||||
from agent.title_generator import maybe_auto_title
|
||||
# Route title-generation failures through the agent's
|
||||
# user-visible warning channel so a depleted auxiliary
|
||||
# provider doesn't silently leave sessions untitled
|
||||
# (issue #15775).
|
||||
_title_failure_cb = getattr(
|
||||
self.agent, "_emit_auxiliary_failure", None
|
||||
) if self.agent else None
|
||||
maybe_auto_title(
|
||||
self._session_db,
|
||||
self.session_id,
|
||||
message,
|
||||
response,
|
||||
self.conversation_history,
|
||||
failure_callback=_title_failure_cb,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
@@ -9528,6 +9690,17 @@ class HermesCLI:
|
||||
"""Down arrow: browse history when on last line, else move cursor down."""
|
||||
event.app.current_buffer.auto_down(count=event.arg)
|
||||
|
||||
@kb.add('c-l')
|
||||
def handle_ctrl_l(event):
|
||||
"""Ctrl+L: force a clean full-screen repaint.
|
||||
|
||||
Recovers the UI after external terminal buffer drift — tmux /
|
||||
cmux tab switches, ``clear`` from a subshell, SSH window
|
||||
restores, etc. — that prompt_toolkit can't detect on its own.
|
||||
Matches the universal bash/zsh/fish/vim/htop convention.
|
||||
"""
|
||||
self._force_full_redraw()
|
||||
|
||||
@kb.add('c-c')
|
||||
def handle_ctrl_c(event):
|
||||
"""Handle Ctrl+C - cancel interactive prompts, interrupt agent, or exit.
|
||||
@@ -9755,10 +9928,18 @@ class HermesCLI:
|
||||
placeholder while preserving any existing user text in the
|
||||
buffer.
|
||||
"""
|
||||
# Diagnostic canary: measure how long the paste handler blocks
|
||||
# the prompt_toolkit event loop. If this exceeds ~500ms we log
|
||||
# it so recurring "CLI freezes on paste" reports (issue #16263,
|
||||
# macOS Tahoe 26 + iTerm2/Ghostty) arrive with data attached.
|
||||
_paste_handler_start = time.perf_counter()
|
||||
_paste_raw_size = len(event.data or "")
|
||||
pasted_text = event.data or ""
|
||||
# Normalise line endings — Windows \r\n and old Mac \r both become \n
|
||||
# so the 5-line collapse threshold and display are consistent.
|
||||
pasted_text = pasted_text.replace('\r\n', '\n').replace('\r', '\n')
|
||||
pasted_text = _strip_leaked_bracketed_paste_wrappers(pasted_text)
|
||||
pasted_text = _strip_leaked_terminal_responses(pasted_text)
|
||||
if _should_auto_attach_clipboard_image_on_paste(pasted_text) and self._try_attach_clipboard_image():
|
||||
event.app.invalidate()
|
||||
if pasted_text:
|
||||
@@ -9781,6 +9962,17 @@ class HermesCLI:
|
||||
buf.insert_text(prefix + placeholder)
|
||||
else:
|
||||
buf.insert_text(pasted_text)
|
||||
_paste_handler_elapsed_ms = (time.perf_counter() - _paste_handler_start) * 1000.0
|
||||
if _paste_handler_elapsed_ms > 500.0:
|
||||
logger.warning(
|
||||
"Slow bracketed-paste handler: %.1fms to process %d bytes "
|
||||
"(%d lines) on %s. If the input becomes unresponsive after "
|
||||
"this, attach this log line to the bug report.",
|
||||
_paste_handler_elapsed_ms,
|
||||
_paste_raw_size,
|
||||
pasted_text.count('\n') + 1 if pasted_text else 0,
|
||||
sys.platform,
|
||||
)
|
||||
|
||||
@kb.add('c-v')
|
||||
def handle_ctrl_v(event):
|
||||
@@ -9900,7 +10092,16 @@ class HermesCLI:
|
||||
still batch newlines. Alt+Enter only adds 1 newline per
|
||||
event so it never triggers this.
|
||||
"""
|
||||
text = buf.text
|
||||
text = _strip_leaked_bracketed_paste_wrappers(buf.text)
|
||||
text = _strip_leaked_terminal_responses(text)
|
||||
if text != buf.text:
|
||||
cursor = min(buf.cursor_position, len(text))
|
||||
_paste_just_collapsed[0] = True
|
||||
buf.text = text
|
||||
buf.cursor_position = cursor
|
||||
_prev_text_len[0] = len(text)
|
||||
_prev_newline_count[0] = text.count('\n')
|
||||
return
|
||||
chars_added = len(text) - _prev_text_len[0]
|
||||
_prev_text_len[0] = len(text)
|
||||
if _paste_just_collapsed[0] or self._skip_paste_collapse:
|
||||
@@ -10557,36 +10758,30 @@ class HermesCLI:
|
||||
# only cursor_up()s by the stored layout height, missing the extra
|
||||
# rows created by reflow — leaving ghost duplicates visible.
|
||||
#
|
||||
# Fix: before the standard erase, inflate _cursor_pos.y so the
|
||||
# cursor moves up far enough to cover the reflowed ghost content.
|
||||
# It's not just column-shrink: widening, row-shrinking, and
|
||||
# multiplexer-driven SIGWINCH-less redraws (cmux / tmux tab switch)
|
||||
# all produce the same class of drift, where the renderer's tracked
|
||||
# _cursor_pos.y no longer matches terminal reality. The only reliable
|
||||
# recovery is a full screen-clear (\x1b[2J\x1b[H) before the next
|
||||
# redraw, so we force one on every resize rather than trying to
|
||||
# compute the exact drift.
|
||||
_original_on_resize = app._on_resize
|
||||
|
||||
def _resize_clear_ghosts():
|
||||
from prompt_toolkit.data_structures import Point as _Pt
|
||||
renderer = app.renderer
|
||||
try:
|
||||
old_size = renderer._last_size
|
||||
new_size = renderer.output.get_size()
|
||||
if (
|
||||
old_size
|
||||
and new_size.columns < old_size.columns
|
||||
and new_size.columns > 0
|
||||
):
|
||||
reflow_factor = (
|
||||
(old_size.columns + new_size.columns - 1)
|
||||
// new_size.columns
|
||||
)
|
||||
last_h = (
|
||||
renderer._last_screen.height
|
||||
if renderer._last_screen
|
||||
else 0
|
||||
)
|
||||
extra = last_h * (reflow_factor - 1)
|
||||
if extra > 0:
|
||||
renderer._cursor_pos = _Pt(
|
||||
x=renderer._cursor_pos.x,
|
||||
y=renderer._cursor_pos.y + extra,
|
||||
)
|
||||
out = renderer.output
|
||||
# Reset attributes, erase the entire screen, and home the
|
||||
# cursor. This overwrites any reflowed status-bar rows or
|
||||
# stale content the terminal kept from the prior layout.
|
||||
out.reset_attributes()
|
||||
out.erase_screen()
|
||||
out.cursor_goto(0, 0)
|
||||
out.flush()
|
||||
# Tell the renderer its tracked position is fresh so its
|
||||
# own erase() inside _on_resize doesn't cursor_up() past
|
||||
# the top of the screen.
|
||||
renderer.reset(leave_alternate_screen=False)
|
||||
except Exception:
|
||||
pass # never break resize handling
|
||||
_original_on_resize()
|
||||
@@ -10594,7 +10789,6 @@ class HermesCLI:
|
||||
app._on_resize = _resize_clear_ghosts
|
||||
|
||||
def spinner_loop():
|
||||
last_idle_refresh = 0.0
|
||||
while not self._should_exit:
|
||||
if not self._app:
|
||||
time.sleep(0.1)
|
||||
@@ -10603,10 +10797,11 @@ class HermesCLI:
|
||||
self._invalidate(min_interval=0.1)
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
now = time.monotonic()
|
||||
if now - last_idle_refresh >= 1.0:
|
||||
last_idle_refresh = now
|
||||
self._invalidate(min_interval=1.0)
|
||||
# Do not repaint the idle prompt every second. In non-full-screen
|
||||
# prompt_toolkit mode, background redraws can fight tmux/Ghostty/cmux
|
||||
# viewport restoration after focus changes and visually move the
|
||||
# command input area. Keep idle stable; input/agent events still
|
||||
# invalidate explicitly when the UI actually changes.
|
||||
time.sleep(0.2)
|
||||
|
||||
spinner_thread = threading.Thread(target=spinner_loop, daemon=True)
|
||||
@@ -10648,6 +10843,10 @@ class HermesCLI:
|
||||
submit_images = []
|
||||
if isinstance(user_input, tuple):
|
||||
user_input, submit_images = user_input
|
||||
|
||||
if isinstance(user_input, str):
|
||||
user_input = _strip_leaked_bracketed_paste_wrappers(user_input)
|
||||
user_input = _strip_leaked_terminal_responses(user_input)
|
||||
|
||||
# Check for commands — but detect dragged/pasted file paths first.
|
||||
# See _detect_file_drop() for details.
|
||||
|
||||
+31
-3
@@ -311,6 +311,12 @@ def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None
|
||||
|
||||
elif schedule["kind"] == "cron":
|
||||
if not HAS_CRONITER:
|
||||
logger.warning(
|
||||
"Cannot compute next run for cron schedule %r: 'croniter' "
|
||||
"is not installed. Install the 'cron' extra (pip install "
|
||||
"'hermes-agent[cron]') to re-enable recurring cron jobs.",
|
||||
schedule.get("expr"),
|
||||
)
|
||||
return None
|
||||
cron = croniter(schedule["expr"], now)
|
||||
next_run = cron.get_next(datetime)
|
||||
@@ -698,10 +704,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
|
||||
# Compute next run
|
||||
job["next_run_at"] = compute_next_run(job["schedule"], now)
|
||||
|
||||
# If no next run (one-shot completed), disable
|
||||
# If no next run, decide whether this is terminal completion
|
||||
# (one-shot) or a transient failure (recurring schedule couldn't
|
||||
# compute — e.g. 'croniter' missing from the runtime env).
|
||||
# Recurring jobs must NEVER be silently disabled: that turns a
|
||||
# missing runtime dep into "job completed" and the user's
|
||||
# schedule quietly goes off. See issue #16265.
|
||||
if job["next_run_at"] is None:
|
||||
job["enabled"] = False
|
||||
job["state"] = "completed"
|
||||
kind = job.get("schedule", {}).get("kind")
|
||||
if kind in ("cron", "interval"):
|
||||
job["state"] = "error"
|
||||
if not job.get("last_error"):
|
||||
job["last_error"] = (
|
||||
"Failed to compute next run for recurring "
|
||||
"schedule (is the 'croniter' package "
|
||||
"installed in the gateway's Python env?)"
|
||||
)
|
||||
logger.error(
|
||||
"Job '%s' (%s) could not compute next_run_at; "
|
||||
"leaving enabled and marking state=error so the "
|
||||
"job is not silently disabled.",
|
||||
job.get("name", job["id"]),
|
||||
kind,
|
||||
)
|
||||
else:
|
||||
job["enabled"] = False
|
||||
job["state"] = "completed"
|
||||
elif job.get("state") != "paused":
|
||||
job["state"] = "scheduled"
|
||||
|
||||
|
||||
@@ -822,6 +822,8 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
logger.info("Running job '%s' (ID: %s)", job_name, job_id)
|
||||
logger.info("Prompt: %s", prompt[:100])
|
||||
|
||||
agent = None
|
||||
|
||||
# Mark this as a cron session so the approval system can apply cron_mode.
|
||||
# This env var is process-wide and persists for the lifetime of the
|
||||
# scheduler process — every job this process runs is a cron job.
|
||||
@@ -1170,6 +1172,24 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
_session_db.close()
|
||||
except (Exception, KeyboardInterrupt) as e:
|
||||
logger.debug("Job '%s': failed to close SQLite session store: %s", job_id, e)
|
||||
# Release subprocesses, terminal sandboxes, browser daemons, and the
|
||||
# main OpenAI/httpx client held by this ephemeral cron agent. Without
|
||||
# this, a gateway that ticks cron every N minutes leaks fds per job
|
||||
# until it hits EMFILE (#10200 / "too many open files").
|
||||
try:
|
||||
if agent is not None:
|
||||
agent.close()
|
||||
except (Exception, KeyboardInterrupt) as e:
|
||||
logger.debug("Job '%s': failed to close agent resources: %s", job_id, e)
|
||||
# Each cron run spins up a short-lived worker thread whose event loop
|
||||
# dies as soon as the ``ThreadPoolExecutor`` shuts down. Any async
|
||||
# httpx clients cached under that loop are now unusable — reap them
|
||||
# so their transports don't accumulate in the process-global cache.
|
||||
try:
|
||||
from agent.auxiliary_client import cleanup_stale_async_clients
|
||||
cleanup_stale_async_clients()
|
||||
except Exception as e:
|
||||
logger.debug("Job '%s': failed to reap stale auxiliary clients: %s", job_id, e)
|
||||
|
||||
|
||||
def tick(verbose: bool = True, adapters=None, loop=None) -> int:
|
||||
|
||||
@@ -3294,6 +3294,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
chat_topic = self._get_effective_topic(message.channel, is_thread=is_thread)
|
||||
|
||||
# Build source
|
||||
guild = getattr(message, "guild", None)
|
||||
source = self.build_source(
|
||||
chat_id=str(effective_channel.id),
|
||||
chat_name=chat_name,
|
||||
@@ -3303,7 +3304,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
thread_id=thread_id,
|
||||
chat_topic=chat_topic,
|
||||
is_bot=getattr(message.author, "bot", False),
|
||||
guild_id=str(message.guild.id) if message.guild else None,
|
||||
guild_id=str(guild.id) if guild else None,
|
||||
parent_chat_id=parent_channel_id,
|
||||
message_id=str(message.id),
|
||||
)
|
||||
|
||||
@@ -28,6 +28,7 @@ from email.header import decode_header
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.base import MIMEBase
|
||||
from email.utils import formatdate
|
||||
from email import encoders
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
@@ -504,6 +505,7 @@ class EmailAdapter(BasePlatformAdapter):
|
||||
msg["In-Reply-To"] = original_msg_id
|
||||
msg["References"] = original_msg_id
|
||||
|
||||
msg["Date"] = formatdate(localtime=True)
|
||||
msg_id = f"<hermes-{uuid.uuid4().hex[:12]}@{self._address.split('@')[1]}>"
|
||||
msg["Message-ID"] = msg_id
|
||||
|
||||
@@ -586,6 +588,7 @@ class EmailAdapter(BasePlatformAdapter):
|
||||
msg["In-Reply-To"] = original_msg_id
|
||||
msg["References"] = original_msg_id
|
||||
|
||||
msg["Date"] = formatdate(localtime=True)
|
||||
msg_id = f"<hermes-{uuid.uuid4().hex[:12]}@{self._address.split('@')[1]}>"
|
||||
msg["Message-ID"] = msg_id
|
||||
|
||||
|
||||
@@ -1178,13 +1178,83 @@ class MatrixAdapter(BasePlatformAdapter):
|
||||
# Event callbacks
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _is_self_sender(self, sender: str) -> bool:
|
||||
"""Return True if the sender refers to the bot's own account.
|
||||
|
||||
Matrix user IDs are byte-compared after trimming whitespace and
|
||||
lowercasing — some homeservers normalize the localpart case
|
||||
differently at different API surfaces, and the reply-loop tail
|
||||
of the "hall of mirrors" bug (#15763) has been observed with the
|
||||
bot's own account bypassing a case-sensitive equality check.
|
||||
|
||||
When ``self._user_id`` is empty (whoami hasn't resolved yet, or
|
||||
login failed), we cannot prove a sender is NOT us, so we return
|
||||
True defensively — an unidentified bot dropping its own events
|
||||
is always preferable to falling into an echo loop.
|
||||
"""
|
||||
own = (self._user_id or "").strip().lower()
|
||||
if not own:
|
||||
return True
|
||||
return sender.strip().lower() == own
|
||||
|
||||
@staticmethod
|
||||
def _is_system_or_bridge_sender(sender: str) -> bool:
|
||||
"""Return True if the sender looks like a system / bridge / appservice
|
||||
identity rather than a real user.
|
||||
|
||||
Appservice namespaces on Matrix conventionally prefix bot / puppet
|
||||
user IDs with an underscore (e.g. ``@_telegram_12345:server``,
|
||||
``@_discord_999:server``, ``@_slack_...:server``). Server-notices
|
||||
bots and bridge-controller bots on many homeservers use the same
|
||||
pattern.
|
||||
|
||||
We treat these as system identities for pairing purposes: they
|
||||
should never be offered a pairing code, because an operator
|
||||
approving the code would hand the bridge itself permanent
|
||||
authorization — and every outbound message relayed by the bridge
|
||||
would then loop back into the agent as an "authorized user
|
||||
message", which is the root of issue #15763.
|
||||
|
||||
Matches:
|
||||
``@_something:server`` — appservice namespace convention
|
||||
``@:server`` — malformed / empty localpart
|
||||
``:server`` — malformed, no leading ``@``
|
||||
"""
|
||||
s = (sender or "").strip()
|
||||
if not s:
|
||||
return True
|
||||
# Localpart is everything between leading '@' and ':'
|
||||
if s.startswith("@"):
|
||||
s = s[1:]
|
||||
if ":" in s:
|
||||
localpart, _, _ = s.partition(":")
|
||||
else:
|
||||
localpart = s
|
||||
if not localpart:
|
||||
return True
|
||||
return localpart.startswith("_")
|
||||
|
||||
async def _on_room_message(self, event: Any) -> None:
|
||||
"""Handle incoming room message events (text, media)."""
|
||||
room_id = str(getattr(event, "room_id", ""))
|
||||
sender = str(getattr(event, "sender", ""))
|
||||
|
||||
# Ignore own messages.
|
||||
if sender == self._user_id:
|
||||
# Ignore own messages (case-insensitive; also drops when our own
|
||||
# user_id hasn't been resolved yet — see _is_self_sender docstring
|
||||
# and issue #15763).
|
||||
if self._is_self_sender(sender):
|
||||
return
|
||||
|
||||
# Ignore appservice / bridge / system identities so they never
|
||||
# trigger the pairing flow. Once a bridge user is paired, every
|
||||
# outbound message it relays would loop back as an authorized
|
||||
# user message (the "hall of mirrors" in #15763).
|
||||
if self._is_system_or_bridge_sender(sender):
|
||||
logger.debug(
|
||||
"Matrix: ignoring system/bridge sender %s in %s",
|
||||
sender,
|
||||
room_id,
|
||||
)
|
||||
return
|
||||
|
||||
# Deduplicate by event ID.
|
||||
@@ -1654,7 +1724,7 @@ class MatrixAdapter(BasePlatformAdapter):
|
||||
async def _on_reaction(self, event: Any) -> None:
|
||||
"""Handle incoming reaction events."""
|
||||
sender = str(getattr(event, "sender", ""))
|
||||
if sender == self._user_id:
|
||||
if self._is_self_sender(sender):
|
||||
return
|
||||
event_id = str(getattr(event, "event_id", ""))
|
||||
if self._is_duplicate_event(event_id):
|
||||
|
||||
@@ -2353,6 +2353,26 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
user = getattr(entity, "user", None)
|
||||
if user and getattr(user, "id", None) == bot_id:
|
||||
return True
|
||||
elif entity_type == "bot_command" and expected:
|
||||
# Telegram's official group-disambiguation form for slash
|
||||
# commands (``/cmd@botname``) is emitted as a single
|
||||
# ``bot_command`` entity covering the whole span — there
|
||||
# is no accompanying ``mention`` entity. Treat it as a
|
||||
# direct address to this bot when the ``@botname`` suffix
|
||||
# matches. This is the form Telegram's own command menu
|
||||
# autocomplete produces in groups, so dropping it at the
|
||||
# mention gate would break /new, /reset, /help, ... for
|
||||
# every group that has ``require_mention`` enabled (#15415).
|
||||
offset = int(getattr(entity, "offset", -1))
|
||||
length = int(getattr(entity, "length", 0))
|
||||
if offset < 0 or length <= 0:
|
||||
continue
|
||||
command_text = source_text[offset:offset + length]
|
||||
at_index = command_text.find("@")
|
||||
if at_index < 0:
|
||||
continue
|
||||
if command_text[at_index:].strip().lower() == expected:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _message_matches_mention_patterns(self, message: Message) -> bool:
|
||||
|
||||
+144
-7
@@ -1943,7 +1943,21 @@ class GatewayRunner:
|
||||
return
|
||||
try:
|
||||
if hasattr(agent, "shutdown_memory_provider"):
|
||||
agent.shutdown_memory_provider()
|
||||
# Pass the agent's own conversation transcript so memory
|
||||
# providers' ``on_session_end`` hooks see the real messages
|
||||
# instead of the empty default (#15165). ``_session_messages``
|
||||
# is set on ``AIAgent`` (run_agent.py:1518) and refreshed at
|
||||
# the end of every ``run_conversation`` turn via
|
||||
# ``_persist_session``; on an agent built through
|
||||
# ``object.__new__`` (test stubs) the attribute may be
|
||||
# absent, so ``getattr`` with a ``None`` default keeps the
|
||||
# call signature-compatible with the pre-fix behaviour
|
||||
# (``shutdown_memory_provider(messages=None)``).
|
||||
session_messages = getattr(agent, "_session_messages", None)
|
||||
if isinstance(session_messages, list):
|
||||
agent.shutdown_memory_provider(session_messages)
|
||||
else:
|
||||
agent.shutdown_memory_provider()
|
||||
except Exception:
|
||||
pass
|
||||
# Close tool resources (terminal sandboxes, browser daemons,
|
||||
@@ -1954,6 +1968,15 @@ class GatewayRunner:
|
||||
agent.close()
|
||||
except Exception:
|
||||
pass
|
||||
# Auxiliary async clients (session_search/web/vision/etc.) live in a
|
||||
# process-global cache and are created inside worker threads. Clean up
|
||||
# any entries whose event loop is now dead so their httpx transports do
|
||||
# not accumulate across gateway turns.
|
||||
try:
|
||||
from agent.auxiliary_client import cleanup_stale_async_clients
|
||||
cleanup_stale_async_clients()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
_STUCK_LOOP_THRESHOLD = 3 # restarts while active before auto-suspend
|
||||
_STUCK_LOOP_FILE = ".restart_failure_counts"
|
||||
@@ -2917,6 +2940,19 @@ class GatewayRunner:
|
||||
# disconnect (defense in depth; safe to call repeatedly).
|
||||
_kill_tool_subprocesses("final-cleanup")
|
||||
|
||||
# Reap the process-global auxiliary-client cache once at the very
|
||||
# end of teardown. Per-turn cleanup runs in _cleanup_agent_resources
|
||||
# for each active agent, but clients bound to worker-thread loops
|
||||
# that died with their ThreadPoolExecutor (notably cron ticks) only
|
||||
# get swept here. Without this, long-running gateways accumulate
|
||||
# async httpx transports until they hit EMFILE on macOS's default
|
||||
# RLIMIT_NOFILE=256. See #14210.
|
||||
try:
|
||||
from agent.auxiliary_client import shutdown_cached_clients
|
||||
shutdown_cached_clients()
|
||||
except Exception as _e:
|
||||
logger.debug("shutdown_cached_clients error: %s", _e)
|
||||
|
||||
# Close SQLite session DBs so the WAL write lock is released.
|
||||
# Without this, --replace and similar restart flows leave the
|
||||
# old gateway's connection holding the WAL lock until Python
|
||||
@@ -4199,9 +4235,18 @@ class GatewayRunner:
|
||||
Keep the normal inbound path and the queued follow-up path on the same
|
||||
preprocessing pipeline so sender attribution, image enrichment, STT,
|
||||
document notes, reply context, and @ references all behave the same.
|
||||
|
||||
Side effect: writes ``self._pending_native_image_paths`` to a list of
|
||||
local image paths when the active model supports native vision AND
|
||||
the user has images attached. The caller consumes and clears this
|
||||
attribute at the ``run_conversation`` site to build a multimodal user
|
||||
turn. When the list is empty, the ``_enrich_message_with_vision``
|
||||
text path has already run and images are represented in-text.
|
||||
"""
|
||||
history = history or []
|
||||
message_text = event.text or ""
|
||||
# Reset per-call buffer; set only when native routing is chosen.
|
||||
self._pending_native_image_paths = []
|
||||
|
||||
_is_shared_multi_user = is_shared_multi_user_session(
|
||||
source,
|
||||
@@ -4222,10 +4267,25 @@ class GatewayRunner:
|
||||
audio_paths.append(path)
|
||||
|
||||
if image_paths:
|
||||
message_text = await self._enrich_message_with_vision(
|
||||
message_text,
|
||||
image_paths,
|
||||
)
|
||||
# Decide routing: native (attach pixels) vs text (vision_analyze
|
||||
# pre-run + prepend description). See agent/image_routing.py.
|
||||
_img_mode = self._decide_image_input_mode()
|
||||
if _img_mode == "native":
|
||||
# Defer attachment to the run_conversation call site.
|
||||
self._pending_native_image_paths = list(image_paths)
|
||||
logger.info(
|
||||
"Image routing: native (model supports vision). %d image(s) will be attached inline.",
|
||||
len(image_paths),
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Image routing: text (mode=%s). Pre-analyzing %d image(s) via vision_analyze.",
|
||||
_img_mode, len(image_paths),
|
||||
)
|
||||
message_text = await self._enrich_message_with_vision(
|
||||
message_text,
|
||||
image_paths,
|
||||
)
|
||||
|
||||
if audio_paths:
|
||||
message_text = await self._enrich_message_with_transcription(
|
||||
@@ -8378,6 +8438,29 @@ class GatewayRunner:
|
||||
ctx = copy_context()
|
||||
return await loop.run_in_executor(None, ctx.run, func, *args)
|
||||
|
||||
def _decide_image_input_mode(self) -> str:
|
||||
"""Resolve the image-input routing for the currently active model.
|
||||
|
||||
Returns ``"native"`` (attach pixels on the user turn) or ``"text"``
|
||||
(pre-analyze with vision_analyze and prepend the description). See
|
||||
agent/image_routing.py for the full decision table.
|
||||
|
||||
The active provider/model are read from config.yaml so the decision
|
||||
tracks ``/model`` switches automatically on the next message.
|
||||
"""
|
||||
try:
|
||||
from agent.image_routing import decide_image_input_mode
|
||||
from agent.auxiliary_client import _read_main_model, _read_main_provider
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
cfg = load_config()
|
||||
provider = _read_main_provider()
|
||||
model = _read_main_model()
|
||||
return decide_image_input_mode(provider, model, cfg)
|
||||
except Exception as exc:
|
||||
logger.debug("image_routing: decision failed, falling back to text — %s", exc)
|
||||
return "text"
|
||||
|
||||
async def _enrich_message_with_vision(
|
||||
self,
|
||||
user_text: str,
|
||||
@@ -10394,7 +10477,39 @@ class GatewayRunner:
|
||||
_approval_session_token = set_current_session_key(_approval_session_key)
|
||||
register_gateway_notify(_approval_session_key, _approval_notify_sync)
|
||||
try:
|
||||
result = agent.run_conversation(message, conversation_history=agent_history, task_id=session_id)
|
||||
# If _prepare_inbound_message_text buffered image paths for native
|
||||
# attachment, wrap the user turn as an OpenAI-style multimodal
|
||||
# content list. Consume-and-clear so subsequent turns on the same
|
||||
# runner instance don't re-attach stale images.
|
||||
_native_imgs = list(getattr(self, "_pending_native_image_paths", []) or [])
|
||||
self._pending_native_image_paths = []
|
||||
if _native_imgs:
|
||||
try:
|
||||
from agent.image_routing import build_native_content_parts
|
||||
_parts, _skipped = build_native_content_parts(
|
||||
message,
|
||||
_native_imgs,
|
||||
)
|
||||
if _skipped:
|
||||
logger.warning(
|
||||
"Native image attachment: skipped %d unreadable path(s): %s",
|
||||
len(_skipped), _skipped,
|
||||
)
|
||||
if any(p.get("type") == "image_url" for p in _parts):
|
||||
_run_message: Any = _parts
|
||||
else:
|
||||
# All images failed to read — fall back to plain text.
|
||||
_run_message = message
|
||||
except Exception as _img_exc:
|
||||
logger.warning(
|
||||
"Native image attachment failed, falling back to text: %s",
|
||||
_img_exc,
|
||||
)
|
||||
_run_message = message
|
||||
else:
|
||||
_run_message = message
|
||||
|
||||
result = agent.run_conversation(_run_message, conversation_history=agent_history, task_id=session_id)
|
||||
finally:
|
||||
unregister_gateway_notify(_approval_session_key)
|
||||
reset_current_session_key(_approval_session_token)
|
||||
@@ -10500,12 +10615,20 @@ class GatewayRunner:
|
||||
try:
|
||||
from agent.title_generator import maybe_auto_title
|
||||
all_msgs = result_holder[0].get("messages", []) if result_holder[0] else []
|
||||
# Route title-generation failures through the agent's
|
||||
# user-visible warning channel so a depleted auxiliary
|
||||
# provider doesn't silently leave sessions untitled
|
||||
# (issue #15775).
|
||||
_title_failure_cb = getattr(
|
||||
agent, "_emit_auxiliary_failure", None
|
||||
)
|
||||
maybe_auto_title(
|
||||
self._session_db,
|
||||
effective_session_id,
|
||||
message,
|
||||
final_response,
|
||||
all_msgs,
|
||||
failure_callback=_title_failure_cb,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
@@ -11145,13 +11268,16 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
|
||||
cron delivery path so live adapters can be used for E2EE rooms.
|
||||
|
||||
Also refreshes the channel directory every 5 minutes and prunes the
|
||||
image/audio/document cache once per hour.
|
||||
image/audio/document cache + expired ``hermes debug share`` pastes
|
||||
once per hour.
|
||||
"""
|
||||
from cron.scheduler import tick as cron_tick
|
||||
from gateway.platforms.base import cleanup_image_cache, cleanup_document_cache
|
||||
from hermes_cli.debug import _sweep_expired_pastes
|
||||
|
||||
IMAGE_CACHE_EVERY = 60 # ticks — once per hour at default 60s interval
|
||||
CHANNEL_DIR_EVERY = 5 # ticks — every 5 minutes
|
||||
PASTE_SWEEP_EVERY = 60 # ticks — once per hour
|
||||
|
||||
logger.info("Cron ticker started (interval=%ds)", interval)
|
||||
tick_count = 0
|
||||
@@ -11192,6 +11318,17 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
|
||||
except Exception as e:
|
||||
logger.debug("Document cache cleanup error: %s", e)
|
||||
|
||||
if tick_count % PASTE_SWEEP_EVERY == 0:
|
||||
try:
|
||||
deleted, remaining = _sweep_expired_pastes()
|
||||
if deleted:
|
||||
logger.info(
|
||||
"Paste sweep: deleted %d expired paste(s), %d pending",
|
||||
deleted, remaining,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Paste sweep error: %s", e)
|
||||
|
||||
stop_event.wait(timeout=interval)
|
||||
logger.info("Cron ticker stopped")
|
||||
|
||||
|
||||
+177
-1
@@ -36,12 +36,23 @@ _EXCLUDED_DIRS = {
|
||||
"__pycache__", # bytecode caches — regenerated on import
|
||||
".git", # nested git dirs (profiles shouldn't have these, but safety)
|
||||
"node_modules", # js deps if website/ somehow leaks in
|
||||
"backups", # prior auto-backups — don't nest backups exponentially
|
||||
"checkpoints", # session-local trajectory caches — regenerated per-session,
|
||||
# session-hash-keyed so they don't port to another machine anyway
|
||||
}
|
||||
|
||||
# File-name suffixes to skip
|
||||
_EXCLUDED_SUFFIXES = (
|
||||
".pyc",
|
||||
".pyo",
|
||||
# SQLite sidecar files — the backup takes a consistent snapshot of ``*.db``
|
||||
# via ``sqlite3.backup()``, so shipping the live WAL / shared-memory /
|
||||
# rollback-journal alongside would pair a fresh snapshot with stale sidecar
|
||||
# state and produce a torn restore on the next open. They're transient and
|
||||
# regenerated on first connection anyway.
|
||||
".db-wal",
|
||||
".db-shm",
|
||||
".db-journal",
|
||||
)
|
||||
|
||||
# File names to skip (runtime state that's meaningless on another machine)
|
||||
@@ -454,6 +465,12 @@ def run_import(args) -> None:
|
||||
# Critical state files to include in quick snapshots (relative to HERMES_HOME).
|
||||
# Everything else is either regeneratable (logs, cache) or managed separately
|
||||
# (skills, repo, sessions/).
|
||||
#
|
||||
# Entries may be individual files OR directories. Directories are captured
|
||||
# recursively; missing entries are silently skipped. Pairing data lives in
|
||||
# platform-specific JSON blobs outside state.db, so it's listed here explicitly
|
||||
# — `hermes update` snapshots this set before pulling so approved-user lists
|
||||
# are recoverable if anything goes wrong (issue #15733).
|
||||
_QUICK_STATE_FILES = (
|
||||
"state.db",
|
||||
"config.yaml",
|
||||
@@ -463,6 +480,10 @@ _QUICK_STATE_FILES = (
|
||||
"gateway_state.json",
|
||||
"channel_directory.json",
|
||||
"processes.json",
|
||||
# Pairing stores (generic + per-platform JSONs outside state.db)
|
||||
"pairing", # legacy location (gateway/pairing.py)
|
||||
"platforms/pairing", # new location (gateway/pairing.py)
|
||||
"feishu_comment_pairing.json", # Feishu comment subscription pairings
|
||||
)
|
||||
|
||||
_QUICK_SNAPSHOTS_DIR = "state-snapshots"
|
||||
@@ -498,7 +519,27 @@ def create_quick_snapshot(
|
||||
|
||||
for rel in _QUICK_STATE_FILES:
|
||||
src = home / rel
|
||||
if not src.exists() or not src.is_file():
|
||||
if not src.exists():
|
||||
continue
|
||||
|
||||
if src.is_dir():
|
||||
# Walk the directory and record each file individually in the
|
||||
# manifest so restore can treat them uniformly. Empty dirs are
|
||||
# skipped (nothing to snapshot).
|
||||
for sub in src.rglob("*"):
|
||||
if not sub.is_file():
|
||||
continue
|
||||
sub_rel = sub.relative_to(home).as_posix()
|
||||
dst = snap_dir / sub_rel
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
shutil.copy2(sub, dst)
|
||||
manifest[sub_rel] = dst.stat().st_size
|
||||
except (OSError, PermissionError) as exc:
|
||||
logger.warning("Could not snapshot %s: %s", sub_rel, exc)
|
||||
continue
|
||||
|
||||
if not src.is_file():
|
||||
continue
|
||||
|
||||
dst = snap_dir / rel
|
||||
@@ -653,3 +694,138 @@ def run_quick_backup(args) -> None:
|
||||
print(f" Restore with: /snapshot restore {snap_id}")
|
||||
else:
|
||||
print("No state files found to snapshot.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pre-update auto-backup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PRE_UPDATE_BACKUPS_DIR = "backups"
|
||||
_PRE_UPDATE_PREFIX = "pre-update-"
|
||||
_PRE_UPDATE_DEFAULT_KEEP = 5
|
||||
|
||||
|
||||
def _pre_update_backup_dir(hermes_home: Optional[Path] = None) -> Path:
|
||||
home = hermes_home or get_hermes_home()
|
||||
return home / _PRE_UPDATE_BACKUPS_DIR
|
||||
|
||||
|
||||
def _prune_pre_update_backups(backup_dir: Path, keep: int) -> int:
|
||||
"""Remove oldest pre-update backups beyond the keep limit.
|
||||
|
||||
Returns the number of files deleted. Only touches files matching
|
||||
``pre-update-*.zip`` so hand-made zips dropped in the same directory
|
||||
are never touched.
|
||||
"""
|
||||
if keep < 0:
|
||||
keep = 0
|
||||
if not backup_dir.exists():
|
||||
return 0
|
||||
|
||||
backups = sorted(
|
||||
(p for p in backup_dir.iterdir()
|
||||
if p.is_file() and p.name.startswith(_PRE_UPDATE_PREFIX) and p.suffix.lower() == ".zip"),
|
||||
key=lambda p: p.name,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
deleted = 0
|
||||
for p in backups[keep:]:
|
||||
try:
|
||||
p.unlink()
|
||||
deleted += 1
|
||||
except OSError as exc:
|
||||
logger.warning("Failed to prune backup %s: %s", p.name, exc)
|
||||
|
||||
return deleted
|
||||
|
||||
|
||||
def create_pre_update_backup(
|
||||
hermes_home: Optional[Path] = None,
|
||||
keep: int = _PRE_UPDATE_DEFAULT_KEEP,
|
||||
) -> Optional[Path]:
|
||||
"""Create a full zip backup of HERMES_HOME under ``backups/``.
|
||||
|
||||
Mirrors :func:`run_backup` (same exclusion rules, same SQLite safe-copy)
|
||||
but writes to ``<HERMES_HOME>/backups/pre-update-<timestamp>.zip`` and
|
||||
auto-prunes old pre-update backups.
|
||||
|
||||
Returns the path to the created zip, or ``None`` if no files were
|
||||
found or the backup could not be created. Never raises — the caller
|
||||
(``hermes update``) should continue even if the backup fails.
|
||||
"""
|
||||
hermes_root = hermes_home or get_default_hermes_root()
|
||||
if not hermes_root.is_dir():
|
||||
return None
|
||||
|
||||
backup_dir = _pre_update_backup_dir(hermes_root)
|
||||
try:
|
||||
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||
except OSError as exc:
|
||||
logger.warning("Could not create pre-update backup dir %s: %s", backup_dir, exc)
|
||||
return None
|
||||
|
||||
stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
|
||||
out_path = backup_dir / f"{_PRE_UPDATE_PREFIX}{stamp}.zip"
|
||||
|
||||
# Collect files (same logic as run_backup, minus the chatty progress prints)
|
||||
files_to_add: list[tuple[Path, Path]] = []
|
||||
try:
|
||||
for dirpath, dirnames, filenames in os.walk(hermes_root, followlinks=False):
|
||||
dp = Path(dirpath)
|
||||
# Prune excluded directories in-place so os.walk doesn't descend
|
||||
dirnames[:] = [d for d in dirnames if d not in _EXCLUDED_DIRS]
|
||||
|
||||
for fname in filenames:
|
||||
fpath = dp / fname
|
||||
try:
|
||||
rel = fpath.relative_to(hermes_root)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
if _should_exclude(rel):
|
||||
continue
|
||||
|
||||
# Skip the output zip itself if it already exists
|
||||
try:
|
||||
if fpath.resolve() == out_path.resolve():
|
||||
continue
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
files_to_add.append((fpath, rel))
|
||||
except OSError as exc:
|
||||
logger.warning("Pre-update backup: walk failed: %s", exc)
|
||||
return None
|
||||
|
||||
if not files_to_add:
|
||||
return None
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
|
||||
for abs_path, rel_path in files_to_add:
|
||||
try:
|
||||
if abs_path.suffix == ".db":
|
||||
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
|
||||
tmp_db = Path(tmp.name)
|
||||
try:
|
||||
if _safe_copy_db(abs_path, tmp_db):
|
||||
zf.write(tmp_db, arcname=str(rel_path))
|
||||
finally:
|
||||
tmp_db.unlink(missing_ok=True)
|
||||
else:
|
||||
zf.write(abs_path, arcname=str(rel_path))
|
||||
except (PermissionError, OSError, ValueError) as exc:
|
||||
logger.debug("Skipping %s in pre-update backup: %s", rel_path, exc)
|
||||
continue
|
||||
except OSError as exc:
|
||||
logger.warning("Pre-update backup: zip write failed: %s", exc)
|
||||
# Best-effort cleanup of partial file
|
||||
try:
|
||||
out_path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
return None
|
||||
|
||||
_prune_pre_update_backups(backup_dir, keep=keep)
|
||||
return out_path
|
||||
|
||||
@@ -62,6 +62,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
aliases=("reset",)),
|
||||
CommandDef("clear", "Clear screen and start a new session", "Session",
|
||||
cli_only=True),
|
||||
CommandDef("redraw", "Force a full UI repaint (recovers from terminal drift)", "Session",
|
||||
cli_only=True),
|
||||
CommandDef("history", "Show conversation history", "Session",
|
||||
cli_only=True),
|
||||
CommandDef("save", "Save the current conversation", "Session",
|
||||
|
||||
@@ -389,6 +389,20 @@ DEFAULT_CONFIG = {
|
||||
# (60+ tool iterations with tiny output) before users assume the
|
||||
# bot is dead and /restart.
|
||||
"gateway_notify_interval": 180,
|
||||
# How user-attached images are presented to the main model on each turn.
|
||||
# "auto" — attach natively when the active model reports
|
||||
# supports_vision=True AND the user hasn't explicitly
|
||||
# configured auxiliary.vision.provider. Otherwise fall
|
||||
# back to text (vision_analyze pre-analysis).
|
||||
# "native" — always attach natively; non-vision models will either
|
||||
# error at the provider or get a last-chance text fallback
|
||||
# (see run_agent._prepare_messages_for_api).
|
||||
# "text" — always pre-analyze with vision_analyze and prepend the
|
||||
# description as text; the main model never sees pixels.
|
||||
# Affects gateway platforms, the TUI, and CLI /attach. vision_analyze
|
||||
# remains available as a tool regardless of this setting — the routing
|
||||
# only controls how inbound user images are presented.
|
||||
"image_input_mode": "auto",
|
||||
},
|
||||
|
||||
"terminal": {
|
||||
@@ -1037,6 +1051,20 @@ DEFAULT_CONFIG = {
|
||||
"seen": {},
|
||||
},
|
||||
|
||||
# ``hermes update`` behaviour.
|
||||
"updates": {
|
||||
# Run a full ``hermes backup``-style zip of HERMES_HOME before every
|
||||
# ``hermes update``. Backups land in ``<HERMES_HOME>/backups/`` and
|
||||
# can be restored with ``hermes import <path>``. Off by default —
|
||||
# on large HERMES_HOME directories the zip can add minutes to every
|
||||
# update. Set to true to re-enable, or pass ``--backup`` to opt in
|
||||
# for a single update run.
|
||||
"pre_update_backup": False,
|
||||
# How many pre-update backup zips to retain. Older ones are pruned
|
||||
# automatically after each successful backup.
|
||||
"backup_keep": 5,
|
||||
},
|
||||
|
||||
# Config schema version - bump this when adding new required fields
|
||||
"_config_version": 22,
|
||||
}
|
||||
|
||||
+11
-5
@@ -45,8 +45,13 @@ def _pending_file() -> Path:
|
||||
Each entry: ``{"url": "...", "expire_at": <unix_ts>}``. Scheduled
|
||||
DELETEs used to be handled by spawning a detached Python process per
|
||||
paste that slept for 6 hours; those accumulated forever if the user
|
||||
ran ``hermes debug share`` repeatedly. We now persist the schedule
|
||||
to disk and sweep expired entries on the next debug invocation.
|
||||
ran ``hermes debug share`` repeatedly.
|
||||
|
||||
Deletion is now driven by the gateway's cron ticker
|
||||
(``gateway/run.py::_start_cron_ticker``) which calls
|
||||
``_sweep_expired_pastes`` once per hour. ``hermes debug share`` also
|
||||
runs an opportunistic sweep on entry as a fallback for CLI-only users
|
||||
who never start the gateway.
|
||||
"""
|
||||
return get_hermes_home() / "pastes" / "pending.json"
|
||||
|
||||
@@ -223,9 +228,10 @@ def _schedule_auto_delete(urls: list[str], delay_seconds: int = _AUTO_DELETE_SEC
|
||||
interpreters that never exited until the sleep completed.
|
||||
|
||||
The replacement is stateless: we append to ``~/.hermes/pastes/pending.json``
|
||||
and rely on opportunistic sweeps (``_sweep_expired_pastes``) called from
|
||||
every ``hermes debug`` invocation. If the user never runs ``hermes debug``
|
||||
again, paste.rs's own retention policy handles cleanup.
|
||||
and the gateway's cron ticker sweeps expired entries once per hour.
|
||||
``hermes debug share`` also runs an opportunistic sweep as a fallback
|
||||
for CLI-only users. If neither runs again, paste.rs's own retention
|
||||
policy handles cleanup.
|
||||
"""
|
||||
_record_pending(urls, delay_seconds=delay_seconds)
|
||||
|
||||
|
||||
+184
-12
@@ -44,6 +44,7 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
@@ -595,17 +596,22 @@ def _session_browse_picker(sessions: list) -> Optional[str]:
|
||||
|
||||
|
||||
def _resolve_last_session(source: str = "cli") -> Optional[str]:
|
||||
"""Look up the most recent session ID for a source."""
|
||||
"""Look up the most recently-used session ID for a source."""
|
||||
db = None
|
||||
try:
|
||||
from hermes_state import SessionDB
|
||||
|
||||
db = SessionDB()
|
||||
sessions = db.search_sessions(source=source, limit=1)
|
||||
db.close()
|
||||
if sessions:
|
||||
return sessions[0]["id"]
|
||||
return sessions[0]["id"] if sessions else None
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
if db is not None:
|
||||
try:
|
||||
db.close()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
@@ -760,9 +766,20 @@ def _resolve_session_by_name_or_id(name_or_id: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _print_tui_exit_summary(session_id: Optional[str]) -> None:
|
||||
def _read_tui_active_session_file(path: Optional[str]) -> Optional[str]:
|
||||
if not path:
|
||||
return None
|
||||
try:
|
||||
data = json.loads(Path(path).read_text(encoding="utf-8"))
|
||||
sid = str(data.get("session_id") or "").strip()
|
||||
return sid or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _print_tui_exit_summary(session_id: Optional[str], active_session_file: Optional[str] = None) -> None:
|
||||
"""Print a shell-visible epilogue after TUI exits."""
|
||||
target = session_id or _resolve_last_session(source="tui")
|
||||
target = _read_tui_active_session_file(active_session_file) or session_id or _resolve_last_session(source="tui")
|
||||
if not target:
|
||||
return
|
||||
|
||||
@@ -1037,7 +1054,14 @@ def _launch_tui(
|
||||
"""Replace current process with the TUI."""
|
||||
tui_dir = PROJECT_ROOT / "ui-tui"
|
||||
|
||||
import tempfile
|
||||
|
||||
env = os.environ.copy()
|
||||
active_session_fd, active_session_file = tempfile.mkstemp(
|
||||
prefix="hermes-tui-active-session-", suffix=".json"
|
||||
)
|
||||
os.close(active_session_fd)
|
||||
env["HERMES_TUI_ACTIVE_SESSION_FILE"] = active_session_file
|
||||
env["HERMES_PYTHON_SRC_ROOT"] = os.environ.get(
|
||||
"HERMES_PYTHON_SRC_ROOT", str(PROJECT_ROOT)
|
||||
)
|
||||
@@ -1065,13 +1089,20 @@ def _launch_tui(
|
||||
env["HERMES_TUI_RESUME"] = resume_session_id
|
||||
|
||||
argv, cwd = _make_tui_argv(tui_dir, tui_dev)
|
||||
code: Optional[int] = None
|
||||
try:
|
||||
code = subprocess.call(argv, cwd=str(cwd), env=env)
|
||||
except KeyboardInterrupt:
|
||||
code = 130
|
||||
try:
|
||||
code = subprocess.call(argv, cwd=str(cwd), env=env)
|
||||
except KeyboardInterrupt:
|
||||
code = 130
|
||||
|
||||
if code in (0, 130):
|
||||
_print_tui_exit_summary(resume_session_id)
|
||||
if code in (0, 130):
|
||||
_print_tui_exit_summary(resume_session_id, active_session_file)
|
||||
finally:
|
||||
try:
|
||||
os.unlink(active_session_file)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
sys.exit(code)
|
||||
|
||||
@@ -3332,7 +3363,26 @@ def _model_flow_named_custom(config, provider_info):
|
||||
provider_entry = providers_cfg.get(provider_key)
|
||||
if isinstance(provider_entry, dict):
|
||||
provider_entry["default_model"] = model_name
|
||||
if config_api_key and not str(provider_entry.get("api_key", "") or "").strip():
|
||||
# Only persist an inline api_key when the user originally had
|
||||
# one (either a literal secret or a ``${VAR}`` template). When
|
||||
# the entry relies on ``key_env``, do not synthesize a
|
||||
# ``${key_env}`` api_key — the runtime already resolves the
|
||||
# key from ``key_env`` directly, and writing the resolved
|
||||
# secret (or even a synthesized template) would silently
|
||||
# downgrade credential hygiene on entries that intentionally
|
||||
# keep plaintext out of ``config.yaml``. See issue #15803.
|
||||
original_api_key_ref = str(
|
||||
provider_info.get("api_key_ref", "") or ""
|
||||
).strip()
|
||||
original_api_key = str(
|
||||
provider_info.get("api_key", "") or ""
|
||||
).strip()
|
||||
had_inline_api_key = bool(original_api_key_ref or original_api_key)
|
||||
if (
|
||||
had_inline_api_key
|
||||
and config_api_key
|
||||
and not str(provider_entry.get("api_key", "") or "").strip()
|
||||
):
|
||||
provider_entry["api_key"] = config_api_key
|
||||
if key_env and not str(provider_entry.get("key_env", "") or "").strip():
|
||||
provider_entry["key_env"] = key_env
|
||||
@@ -6123,6 +6173,96 @@ def _ensure_fhs_path_guard() -> None:
|
||||
print(" (reload your shell or run 'source ~/.bashrc' to pick it up)")
|
||||
|
||||
|
||||
def _run_pre_update_backup(args) -> None:
|
||||
"""Create a full zip backup of HERMES_HOME before running the update.
|
||||
|
||||
Gated on ``updates.pre_update_backup`` in config (default false). Off
|
||||
by default because the zip can add minutes to every update on large
|
||||
HERMES_HOME directories. The ``--backup`` flag on ``hermes update``
|
||||
opts in for a single run; ``--no-backup`` forces it off when config
|
||||
has it enabled. Never raises — a backup failure should not block the
|
||||
update itself.
|
||||
"""
|
||||
# CLI flags win over config. --no-backup beats --backup if both are set.
|
||||
if getattr(args, "no_backup", False):
|
||||
print("◆ Pre-update backup: skipped (--no-backup)")
|
||||
print()
|
||||
return
|
||||
|
||||
force_backup = bool(getattr(args, "backup", False))
|
||||
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
except Exception as exc:
|
||||
logging.getLogger(__name__).debug("Could not load config for pre-update backup: %s", exc)
|
||||
cfg = {}
|
||||
|
||||
updates_cfg = cfg.get("updates", {}) if isinstance(cfg, dict) else {}
|
||||
enabled = updates_cfg.get("pre_update_backup", False)
|
||||
keep = updates_cfg.get("backup_keep", 5)
|
||||
|
||||
if not enabled and not force_backup:
|
||||
# Silent by default — the backup is off, most users don't need to
|
||||
# hear about it on every update. They can opt in via --backup
|
||||
# or by flipping the config knob.
|
||||
return
|
||||
|
||||
try:
|
||||
from hermes_cli.backup import create_pre_update_backup
|
||||
except Exception as exc:
|
||||
print(f"⚠ Pre-update backup: could not load backup module ({exc}); continuing update.")
|
||||
print()
|
||||
return
|
||||
|
||||
print("◆ Creating pre-update backup...")
|
||||
t0 = _time.monotonic()
|
||||
try:
|
||||
out_path = create_pre_update_backup(keep=int(keep))
|
||||
except Exception as exc: # defensive — helper already swallows, but just in case
|
||||
print(f" ⚠ Backup failed: {exc}")
|
||||
print(" Continuing with update.")
|
||||
print()
|
||||
return
|
||||
|
||||
elapsed = _time.monotonic() - t0
|
||||
|
||||
if out_path is None:
|
||||
print(" ⚠ Backup skipped (no files found or write failed); continuing update.")
|
||||
print()
|
||||
return
|
||||
|
||||
try:
|
||||
size_bytes = out_path.stat().st_size
|
||||
except OSError:
|
||||
size_bytes = 0
|
||||
|
||||
# Human-readable size
|
||||
size_str = f"{size_bytes} B"
|
||||
for unit in ("KB", "MB", "GB"):
|
||||
if size_bytes < 1024:
|
||||
break
|
||||
size_bytes /= 1024
|
||||
size_str = f"{size_bytes:.1f} {unit}"
|
||||
|
||||
# Render path using display_hermes_home so the user sees ~/.hermes/...
|
||||
try:
|
||||
from hermes_constants import get_hermes_home, display_hermes_home
|
||||
home = get_hermes_home()
|
||||
try:
|
||||
display_path = f"{display_hermes_home()}/{out_path.relative_to(home)}"
|
||||
except ValueError:
|
||||
display_path = str(out_path)
|
||||
except Exception:
|
||||
display_path = str(out_path)
|
||||
|
||||
print(f" Saved: {display_path} ({size_str}, {elapsed:.1f}s)")
|
||||
print(f" Restore: hermes import {out_path}")
|
||||
print(f" Disable: omit --backup (backups are off by default)")
|
||||
print(f" set updates.pre_update_backup: false in config.yaml")
|
||||
print()
|
||||
|
||||
|
||||
def cmd_update(args):
|
||||
"""Update Hermes Agent to the latest version.
|
||||
|
||||
@@ -6165,6 +6305,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
print("⚕ Updating Hermes Agent...")
|
||||
print()
|
||||
|
||||
# Pre-update backup — runs before any git/file mutation so users can
|
||||
# always roll back to the exact state they had before this update.
|
||||
_run_pre_update_backup(args)
|
||||
|
||||
# Try git-based update first, fall back to ZIP download on Windows
|
||||
# when git file I/O is broken (antivirus, NTFS filter drivers, etc.)
|
||||
use_zip_update = False
|
||||
@@ -6314,6 +6458,22 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
|
||||
print(f"→ Found {commit_count} new commit(s)")
|
||||
|
||||
# Snapshot critical state (state.db, config, pairing JSONs, etc.)
|
||||
# before pulling so a user can recover if something goes wrong.
|
||||
# Issue #15733 reported missing pairing data after an update; even
|
||||
# though `git pull` can't touch $HERMES_HOME, this is cheap
|
||||
# belt-and-suspenders insurance and gives the user something to
|
||||
# restore from via `/snapshot list` / `/snapshot restore <id>`.
|
||||
try:
|
||||
from hermes_cli.backup import create_quick_snapshot
|
||||
|
||||
snap_id = create_quick_snapshot(label="pre-update")
|
||||
if snap_id:
|
||||
print(f" ✓ Pre-update snapshot: {snap_id}")
|
||||
except Exception as exc:
|
||||
# Never let a snapshot failure block an update.
|
||||
logger.debug("Pre-update snapshot failed: %s", exc)
|
||||
|
||||
print("→ Pulling updates...")
|
||||
update_succeeded = False
|
||||
try:
|
||||
@@ -9542,6 +9702,18 @@ Examples:
|
||||
default=False,
|
||||
help="Check whether an update is available without installing anything",
|
||||
)
|
||||
update_parser.add_argument(
|
||||
"--no-backup",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Skip the pre-update backup for this run (overrides updates.pre_update_backup)",
|
||||
)
|
||||
update_parser.add_argument(
|
||||
"--backup",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Force a pre-update backup for this run (off by default; overrides updates.pre_update_backup)",
|
||||
)
|
||||
update_parser.set_defaults(func=cmd_update)
|
||||
|
||||
# =========================================================================
|
||||
|
||||
@@ -2226,6 +2226,52 @@ def copilot_model_api_mode(
|
||||
return "chat_completions"
|
||||
|
||||
|
||||
# Azure Foundry model families that require the Responses API. Azure
|
||||
# rejects /chat/completions against these deployments with
|
||||
# ``400 "The requested operation is unsupported."`` — the same payload Bob
|
||||
# Dobolina hit in April 2026 on ``gpt-5.3-codex`` while ``gpt-4o-pure`` on
|
||||
# the same endpoint worked fine. Keep the patterns broad enough to cover
|
||||
# vendor-renamed deployments (e.g. ``gpt-5.3-codex``, ``gpt-5-codex``,
|
||||
# ``gpt-5.4``, ``o1-preview``) but tight enough to leave GPT-4 / 3.5 / Llama /
|
||||
# Mistral / Grok deployments on chat completions.
|
||||
_AZURE_FOUNDRY_RESPONSES_PREFIXES = (
|
||||
"codex", # codex-*, codex-mini
|
||||
"gpt-5", # gpt-5, gpt-5.x, gpt-5-codex, gpt-5.x-codex
|
||||
"o1", # o1, o1-preview, o1-mini
|
||||
"o3", # o3, o3-mini
|
||||
"o4", # o4, o4-mini
|
||||
)
|
||||
|
||||
|
||||
def azure_foundry_model_api_mode(model_name: Optional[str]) -> Optional[str]:
|
||||
"""Infer Azure Foundry api_mode from a deployment/model name.
|
||||
|
||||
Returns ``"codex_responses"`` when the model name matches a family that
|
||||
only accepts the Responses API on Azure Foundry (GPT-5.x, codex, o1/o3/o4
|
||||
reasoning models). Returns ``None`` otherwise — the caller should fall
|
||||
back to the configured/default api_mode (typically ``chat_completions``)
|
||||
so GPT-4o, GPT-4 Turbo, Llama, Mistral, etc. keep working.
|
||||
|
||||
Intentionally does NOT return ``anthropic_messages``; Anthropic-style
|
||||
Azure endpoints are disambiguated by URL (``/anthropic`` suffix) in
|
||||
``runtime_provider._detect_api_mode_for_url`` and by the user setting
|
||||
``model.api_mode: anthropic_messages`` explicitly.
|
||||
"""
|
||||
raw = str(model_name or "").strip().lower()
|
||||
if not raw:
|
||||
return None
|
||||
# Strip any vendor/ prefix a user may have copied from OpenRouter / Copilot.
|
||||
if "/" in raw:
|
||||
raw = raw.rsplit("/", 1)[-1]
|
||||
# gpt-5-mini speaks chat completions on Copilot but Azure Foundry deploys
|
||||
# the full gpt-5 family uniformly on Responses API — don't carve an
|
||||
# exception here.
|
||||
for prefix in _AZURE_FOUNDRY_RESPONSES_PREFIXES:
|
||||
if raw.startswith(prefix):
|
||||
return "codex_responses"
|
||||
return None
|
||||
|
||||
|
||||
def normalize_opencode_model_id(provider_id: Optional[str], model_id: Optional[str]) -> str:
|
||||
"""Normalize OpenCode config IDs to the bare model slug used in API requests."""
|
||||
provider = normalize_provider(provider_id)
|
||||
|
||||
@@ -231,6 +231,19 @@ def _resolve_runtime_from_pool_entry(
|
||||
configured_mode = _parse_api_mode(model_cfg.get("api_mode"))
|
||||
if configured_mode:
|
||||
api_mode = configured_mode
|
||||
# Model-family inference for GPT-5.x / codex / o1-o4: Azure rejects
|
||||
# /chat/completions on these with 400 "operation unsupported" — see
|
||||
# azure_foundry_model_api_mode() for rationale. Skip when the user
|
||||
# explicitly picked anthropic_messages (Anthropic-style endpoint).
|
||||
if effective_model and api_mode != "anthropic_messages":
|
||||
try:
|
||||
from hermes_cli.models import azure_foundry_model_api_mode
|
||||
|
||||
inferred = azure_foundry_model_api_mode(effective_model)
|
||||
except Exception:
|
||||
inferred = None
|
||||
if inferred:
|
||||
api_mode = inferred
|
||||
# For Anthropic-style endpoints, strip /v1 suffix
|
||||
if api_mode == "anthropic_messages":
|
||||
base_url = re.sub(r"/v1/?$", "", base_url)
|
||||
@@ -608,6 +621,7 @@ def _resolve_azure_foundry_runtime(
|
||||
model_cfg: Dict[str, Any],
|
||||
explicit_api_key: Optional[str] = None,
|
||||
explicit_base_url: Optional[str] = None,
|
||||
target_model: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Resolve an Azure Foundry runtime entry.
|
||||
|
||||
@@ -628,6 +642,22 @@ def _resolve_azure_foundry_runtime(
|
||||
cfg_base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/")
|
||||
cfg_api_mode = _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions"
|
||||
|
||||
# Model-family inference: Azure Foundry deploys GPT-5.x / codex / o1-o4
|
||||
# reasoning models as Responses-API-only. Calling /chat/completions
|
||||
# against them returns 400 "The requested operation is unsupported."
|
||||
# Upgrade api_mode when the model name matches, unless the user has
|
||||
# explicitly chosen anthropic_messages (Anthropic-style endpoint).
|
||||
effective_model = str(target_model or model_cfg.get("default") or "").strip()
|
||||
if effective_model and cfg_api_mode != "anthropic_messages":
|
||||
try:
|
||||
from hermes_cli.models import azure_foundry_model_api_mode
|
||||
|
||||
inferred = azure_foundry_model_api_mode(effective_model)
|
||||
except Exception:
|
||||
inferred = None
|
||||
if inferred:
|
||||
cfg_api_mode = inferred
|
||||
|
||||
env_base_url = os.getenv("AZURE_FOUNDRY_BASE_URL", "").strip().rstrip("/")
|
||||
base_url = explicit_base_url_clean or cfg_base_url or env_base_url
|
||||
if not base_url:
|
||||
@@ -864,6 +894,7 @@ def resolve_runtime_provider(
|
||||
model_cfg=_get_model_config(),
|
||||
explicit_api_key=explicit_api_key,
|
||||
explicit_base_url=explicit_base_url,
|
||||
target_model=target_model,
|
||||
)
|
||||
return azure_runtime
|
||||
|
||||
|
||||
@@ -2212,7 +2212,7 @@ async def get_usage_analytics(days: int = 30):
|
||||
cutoff = time.time() - (days * 86400)
|
||||
cur = db._conn.execute("""
|
||||
SELECT date(started_at, 'unixepoch') as day,
|
||||
SUM(input_tokens) as input_tokens,
|
||||
SUM(input_tokens + COALESCE(cache_read_tokens, 0) + COALESCE(cache_write_tokens, 0)) as input_tokens,
|
||||
SUM(output_tokens) as output_tokens,
|
||||
SUM(cache_read_tokens) as cache_read_tokens,
|
||||
SUM(reasoning_tokens) as reasoning_tokens,
|
||||
@@ -2227,18 +2227,18 @@ async def get_usage_analytics(days: int = 30):
|
||||
|
||||
cur2 = db._conn.execute("""
|
||||
SELECT model,
|
||||
SUM(input_tokens) as input_tokens,
|
||||
SUM(input_tokens + COALESCE(cache_read_tokens, 0) + COALESCE(cache_write_tokens, 0)) as input_tokens,
|
||||
SUM(output_tokens) as output_tokens,
|
||||
COALESCE(SUM(estimated_cost_usd), 0) as estimated_cost,
|
||||
COUNT(*) as sessions,
|
||||
SUM(COALESCE(api_call_count, 0)) as api_calls
|
||||
FROM sessions WHERE started_at > ? AND model IS NOT NULL
|
||||
GROUP BY model ORDER BY SUM(input_tokens) + SUM(output_tokens) DESC
|
||||
GROUP BY model ORDER BY SUM(input_tokens + COALESCE(cache_read_tokens, 0) + COALESCE(cache_write_tokens, 0)) + SUM(output_tokens) DESC
|
||||
""", (cutoff,))
|
||||
by_model = [dict(r) for r in cur2.fetchall()]
|
||||
|
||||
cur3 = db._conn.execute("""
|
||||
SELECT SUM(input_tokens) as total_input,
|
||||
SELECT SUM(input_tokens + COALESCE(cache_read_tokens, 0) + COALESCE(cache_write_tokens, 0)) as total_input,
|
||||
SUM(output_tokens) as total_output,
|
||||
SUM(cache_read_tokens) as total_cache_read,
|
||||
SUM(reasoning_tokens) as total_reasoning,
|
||||
|
||||
+19
-3
@@ -1481,16 +1481,32 @@ class SessionDB:
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""List sessions, optionally filtered by source."""
|
||||
"""List sessions, optionally filtered by source.
|
||||
|
||||
Returns rows enriched with a computed ``last_active`` column (latest
|
||||
message timestamp for the session, falling back to ``started_at``),
|
||||
ordered by most-recently-used first.
|
||||
"""
|
||||
select_with_last_active = (
|
||||
"SELECT s.*, COALESCE(m.last_active, s.started_at) AS last_active "
|
||||
"FROM sessions s "
|
||||
"LEFT JOIN ("
|
||||
"SELECT session_id, MAX(timestamp) AS last_active "
|
||||
"FROM messages GROUP BY session_id"
|
||||
") m ON m.session_id = s.id "
|
||||
)
|
||||
with self._lock:
|
||||
if source:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT * FROM sessions WHERE source = ? ORDER BY started_at DESC LIMIT ? OFFSET ?",
|
||||
f"{select_with_last_active}"
|
||||
"WHERE s.source = ? "
|
||||
"ORDER BY last_active DESC, s.started_at DESC, s.id DESC LIMIT ? OFFSET ?",
|
||||
(source, limit, offset),
|
||||
)
|
||||
else:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT * FROM sessions ORDER BY started_at DESC LIMIT ? OFFSET ?",
|
||||
f"{select_with_last_active}"
|
||||
"ORDER BY last_active DESC, s.started_at DESC, s.id DESC LIMIT ? OFFSET ?",
|
||||
(limit, offset),
|
||||
)
|
||||
return [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
@@ -17,6 +17,7 @@ pkgs.buildNpmPackage (npm // {
|
||||
inherit src npmDeps version;
|
||||
|
||||
doCheck = false;
|
||||
npmFlags = [ "--legacy-peer-deps" ];
|
||||
|
||||
installPhase = ''
|
||||
runHook preInstall
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: touchdesigner-mcp
|
||||
description: "Control a running TouchDesigner instance via twozero MCP — create operators, set parameters, wire connections, execute Python, build real-time visuals. 36 native tools."
|
||||
version: 1.0.0
|
||||
version: 1.1.0
|
||||
author: kshitijk4poor
|
||||
license: MIT
|
||||
metadata:
|
||||
@@ -332,6 +332,12 @@ See `references/network-patterns.md` for complete build scripts + shader code.
|
||||
| `references/mcp-tools.md` | Full twozero MCP tool parameter schemas |
|
||||
| `references/python-api.md` | TD Python: op(), scripting, extensions |
|
||||
| `references/troubleshooting.md` | Connection diagnostics, debugging |
|
||||
| `references/glsl.md` | GLSL uniforms, built-in functions, shader templates |
|
||||
| `references/postfx.md` | Post-FX: bloom, CRT, chromatic aberration, feedback glow |
|
||||
| `references/layout-compositor.md` | HUD layout patterns, panel grids, BSP-style layouts |
|
||||
| `references/operator-tips.md` | Wireframe rendering, feedback TOP setup |
|
||||
| `references/geometry-comp.md` | Geometry COMP: instancing, POP vs SOP, morphing |
|
||||
| `references/audio-reactive.md` | Audio band extraction, beat detection, envelope following |
|
||||
| `scripts/setup.sh` | Automated setup script |
|
||||
|
||||
---
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
# Audio-Reactive Reference
|
||||
|
||||
Patterns for driving visuals from audio — spectrum analysis, beat detection, envelope following.
|
||||
|
||||
## Audio Input
|
||||
|
||||
```python
|
||||
# Live input from audio interface
|
||||
audio_in = root.create(audiodeviceinCHOP, 'audio_in')
|
||||
audio_in.par.rate = 44100
|
||||
|
||||
# OR: from audio file (for testing)
|
||||
audio_file = root.create(audiofileinCHOP, 'audio_in')
|
||||
audio_file.par.file = '/path/to/track.wav'
|
||||
audio_file.par.play = True
|
||||
audio_file.par.repeat = 'on' # NOT par.loop
|
||||
audio_file.par.playmode = 'locked'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Audio Band Extraction (Verified TD 2025.32460)
|
||||
|
||||
Use `audiofilterCHOP` for band separation (NOT `selectCHOP` by channel index):
|
||||
|
||||
```python
|
||||
# Audio input
|
||||
af = root.create(audiofileinCHOP, 'audio_in')
|
||||
af.par.file = path
|
||||
af.par.play = True
|
||||
af.par.repeat = 'on'
|
||||
af.par.playmode = 'locked'
|
||||
|
||||
# Low band: lowpass @ 250Hz
|
||||
flt_low = root.create(audiofilterCHOP, 'flt_low')
|
||||
flt_low.par.filter = 'lowpass'
|
||||
flt_low.par.cutofffrequency = 250
|
||||
flt_low.par.rolloff = 2
|
||||
flt_low.inputConnectors[0].connect(af)
|
||||
|
||||
# Mid band: highpass@250 → lowpass@4000
|
||||
flt_mid_hp = root.create(audiofilterCHOP, 'flt_mid_hp')
|
||||
flt_mid_hp.par.filter = 'highpass'
|
||||
flt_mid_hp.par.cutofffrequency = 250
|
||||
flt_mid_hp.par.rolloff = 2
|
||||
flt_mid_hp.inputConnectors[0].connect(af)
|
||||
|
||||
flt_mid_lp = root.create(audiofilterCHOP, 'flt_mid_lp')
|
||||
flt_mid_lp.par.filter = 'lowpass'
|
||||
flt_mid_lp.par.cutofffrequency = 4000
|
||||
flt_mid_lp.par.rolloff = 2
|
||||
flt_mid_lp.inputConnectors[0].connect(flt_mid_hp)
|
||||
|
||||
# High band: highpass @ 4000Hz
|
||||
flt_high = root.create(audiofilterCHOP, 'flt_high')
|
||||
flt_high.par.filter = 'highpass'
|
||||
flt_high.par.cutofffrequency = 4000
|
||||
flt_high.par.rolloff = 2
|
||||
flt_high.inputConnectors[0].connect(af)
|
||||
|
||||
# Per-band: RMS → lag → gain → clamp
|
||||
for name, filt in [('low', flt_low), ('mid', flt_mid_lp), ('high', flt_high)]:
|
||||
rms = root.create(analyzeCHOP, f'rms_{name}')
|
||||
rms.par.function = 'rmspower' # NOT 'rms'
|
||||
rms.inputConnectors[0].connect(filt)
|
||||
|
||||
lag = root.create(lagCHOP, f'lag_{name}')
|
||||
lag.par.lag1 = 0.05 # attack (NOT par.lagin)
|
||||
lag.par.lag2 = 0.25 # release (NOT par.lagout)
|
||||
lag.inputConnectors[0].connect(rms)
|
||||
|
||||
math = root.create(mathCHOP, f'scale_{name}')
|
||||
math.par.gain = 8.0
|
||||
math.inputConnectors[0].connect(lag)
|
||||
|
||||
# mathCHOP has NO par.clamp — use limitCHOP
|
||||
lim = root.create(limitCHOP, f'clamp_{name}')
|
||||
lim.par.type = 'clamp'
|
||||
lim.par.min = 0.0
|
||||
lim.par.max = 1.0
|
||||
lim.inputConnectors[0].connect(math)
|
||||
|
||||
null = root.create(nullCHOP, f'out_{name}')
|
||||
null.inputConnectors[0].connect(lim)
|
||||
null.viewer = True
|
||||
```
|
||||
|
||||
**Key TD 2025 corrections:**
|
||||
- `analyzeCHOP.par.function = 'rmspower'` NOT `'rms'`
|
||||
- `lagCHOP.par.lag1` / `par.lag2` NOT `par.lagin` / `par.lagout`
|
||||
- `mathCHOP` has NO `par.clamp` — use separate `limitCHOP`
|
||||
|
||||
---
|
||||
|
||||
## Beat / Onset Detection
|
||||
|
||||
### Kick Detection (slope → trigger)
|
||||
|
||||
```python
|
||||
slope = root.create(slopeCHOP, 'kick_slope')
|
||||
slope.inputConnectors[0].connect(op('out_low'))
|
||||
|
||||
trig = root.create(triggerCHOP, 'kick_trig')
|
||||
trig.par.threshold = 0.12
|
||||
trig.par.attack = 0.005 # NOT par.attacktime
|
||||
trig.par.decay = 0.15 # NOT par.decaytime
|
||||
trig.par.triggeron = 'increase'
|
||||
trig.inputConnectors[0].connect(slope)
|
||||
|
||||
kick_out = root.create(nullCHOP, 'out_kick')
|
||||
kick_out.inputConnectors[0].connect(trig)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Passing Audio to GLSL
|
||||
|
||||
```python
|
||||
glsl.par.vec0name = 'uLow'
|
||||
glsl.par.vec0valuex.expr = "op('out_low')['chan1']"
|
||||
glsl.par.vec0valuex.mode = ParMode.EXPRESSION
|
||||
|
||||
glsl.par.vec1name = 'uKick'
|
||||
glsl.par.vec1valuex.expr = "op('out_kick')['chan1']"
|
||||
glsl.par.vec1valuex.mode = ParMode.EXPRESSION
|
||||
```
|
||||
|
||||
```glsl
|
||||
uniform float uLow;
|
||||
uniform float uKick;
|
||||
float scale = 1.0 + uKick * 0.4 + uLow * 0.2;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Standard Audio Bus Pattern
|
||||
|
||||
Recommended structure:
|
||||
|
||||
```
|
||||
audiodeviceinCHOP (audio_in)
|
||||
↓
|
||||
[null_audio_in]
|
||||
├──→ audiofilterCHOP (lowpass@250) → analyzeCHOP → lagCHOP → mathCHOP → limitCHOP → null
|
||||
├──→ audiofilterCHOP (bandpass@250-4k) → analyzeCHOP → lagCHOP → mathCHOP → limitCHOP → null
|
||||
├──→ audiofilterCHOP (highpass@4k) → analyzeCHOP → lagCHOP → mathCHOP → limitCHOP → null
|
||||
│
|
||||
└──→ slopeCHOP → triggerCHOP (beat_trigger)
|
||||
```
|
||||
|
||||
Keep this entire bus inside a `baseCOMP` (e.g., `audio_bus`) and reference via paths from visual networks.
|
||||
|
||||
---
|
||||
|
||||
## MIDI Input
|
||||
|
||||
```python
|
||||
midi_in = root.create(midiinCHOP, 'midi_in')
|
||||
midi_in.par.device = 0 # Check midiinDAT for device index
|
||||
# Outputs channels named by MIDI note/CC: 'ch1n60', 'ch1c74', etc.
|
||||
|
||||
# Map CC to a parameter
|
||||
op('bloom1').par.threshold.mode = ParMode.EXPRESSION
|
||||
op('bloom1').par.threshold.expr = "op('midi_in')['ch1c74'][0]"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## CRITICAL: DO NOT use Lag CHOP for spectrum smoothing
|
||||
|
||||
Lag CHOP in timeslice mode expands 256-sample spectrum to 1600-2400 samples, averaging all values to near-zero (~1e-06). The shader receives no usable data. Use `mathCHOP(gain=8)` directly, or smooth in GLSL via temporal lerp with a feedback texture.
|
||||
|
||||
Verified:
|
||||
- Without Lag CHOP: bass bins = 5.0-5.4 (strong, usable)
|
||||
- With Lag CHOP: ALL bins = 0.000001 (dead)
|
||||
@@ -0,0 +1,121 @@
|
||||
# Geometry COMP Reference
|
||||
|
||||
## Creating Geometry COMPs
|
||||
|
||||
```python
|
||||
geo = root.create(geometryCOMP, 'geo1')
|
||||
# Remove default torus
|
||||
for c in list(geo.children):
|
||||
if c.valid: c.destroy()
|
||||
# Build your shape inside
|
||||
```
|
||||
|
||||
## Correct Pattern (shapes inside geo)
|
||||
|
||||
```python
|
||||
# Create shape INSIDE the geo COMP
|
||||
box = geo.create(boxSOP, 'cube')
|
||||
box.par.sizex = 1.5; box.par.sizey = 1.5; box.par.sizez = 1.5
|
||||
|
||||
# For POP-based geometry (TD 099), POPs must be inside:
|
||||
sph = geo.create(spherePOP, 'shape')
|
||||
out1 = geo.create(outPOP, 'out1')
|
||||
out1.inputConnectors[0].connect(sph.outputConnectors[0])
|
||||
```
|
||||
|
||||
## DO NOT: Common Mistakes
|
||||
|
||||
```python
|
||||
# BAD: Don't create geometry at parent level and wire into COMP
|
||||
box = root.create(boxPOP, 'box1') # ← outside geo, won't render
|
||||
|
||||
# BAD: Don't reference parent operators from inside COMP
|
||||
choptopop1.par.chop = '../null1' # ← hidden dependency, breaks on move
|
||||
```
|
||||
|
||||
## Instancing
|
||||
|
||||
```python
|
||||
geo.par.instancing = True
|
||||
geo.par.instanceop = 'sopto1' # relative path to CHOP/SOP with instance data
|
||||
geo.par.instancetx = 'tx'
|
||||
geo.par.instancety = 'ty'
|
||||
geo.par.instancetz = 'tz'
|
||||
```
|
||||
|
||||
### Instance Attribute Names by OP Type
|
||||
|
||||
| OP Type | Attribute Names |
|
||||
|---------|-----------------|
|
||||
| CHOP | Channel names: `tx`, `ty`, `tz` |
|
||||
| SOP/POP | `P(0)`, `P(1)`, `P(2)` for position |
|
||||
| DAT | Column header names from first row |
|
||||
| TOP | `r`, `g`, `b`, `a` |
|
||||
|
||||
### Mixed Data Sources
|
||||
|
||||
```python
|
||||
geo.par.instanceop = 'pos_chop' # Position from CHOP
|
||||
geo.par.instancetx = 'tx'
|
||||
geo.par.instancecolorop = 'color_top' # Color from TOP
|
||||
geo.par.instancecolorr = 'r'
|
||||
```
|
||||
|
||||
## Rendering Setup
|
||||
|
||||
```python
|
||||
# Camera
|
||||
cam = root.create(cameraCOMP, 'cam1')
|
||||
cam.par.tx = 0; cam.par.ty = 0; cam.par.tz = 4
|
||||
|
||||
# Render TOP
|
||||
render = root.create(renderTOP, 'render1')
|
||||
render.par.outputresolution = 'custom'
|
||||
render.par.resolutionw = 1280; render.par.resolutionh = 720
|
||||
render.par.camera = cam.path
|
||||
render.par.geometry = geo.path # accepts path string
|
||||
```
|
||||
|
||||
## POPs vs SOPs for Rendering
|
||||
|
||||
In TD 099, `geometryCOMP` renders **POPs** but NOT SOPs. A `boxSOP` inside a geometry COMP is invisible — no errors.
|
||||
|
||||
```python
|
||||
# WRONG — SOPs don't render (invisible, no errors)
|
||||
box = geo.create(boxSOP, 'cube') # ✗ invisible
|
||||
|
||||
# CORRECT — POPs render
|
||||
box = geo.create(boxPOP, 'cube') # ✓ visible
|
||||
```
|
||||
|
||||
| SOP | POP | Notes |
|
||||
|-----|-----|-------|
|
||||
| `boxSOP` | `boxPOP` | `sizex/y/z`, `surftype` |
|
||||
| `sphereSOP` | `spherePOP` | `radx/y/z`, `freq`, `type` (geodesic/grid/sharedpoles/tetrahedron) |
|
||||
| `torusSOP` | `torusPOP` | TD auto-creates in new geo COMPs |
|
||||
| `circleSOP` | `circlePOP` | |
|
||||
| `gridSOP` | `gridPOP` | |
|
||||
| `tubeSOP` | `tubePOP` | |
|
||||
|
||||
New geometry COMPs auto-create: `in1` (inPOP), `out1` (outPOP), `torus1` (torusPOP). Always clean before building.
|
||||
|
||||
## Morphing Between Shapes (switchPOP)
|
||||
|
||||
```python
|
||||
sw = geo.create(switchPOP, 'shape_switch')
|
||||
sw.par.index.expr = 'int(absTime.seconds / 3) % 4'
|
||||
sw.inputConnectors[0].connect(tetra.outputConnectors[0]) # shape 0
|
||||
sw.inputConnectors[1].connect(box.outputConnectors[0]) # shape 1
|
||||
sw.inputConnectors[2].connect(octa.outputConnectors[0]) # shape 2
|
||||
sw.inputConnectors[3].connect(sphere.outputConnectors[0]) # shape 3
|
||||
|
||||
out = geo.create(outPOP, 'out1')
|
||||
out.inputConnectors[0].connect(sw.outputConnectors[0])
|
||||
```
|
||||
|
||||
`spherePOP.par.type` options: `geodesic`, `grid`, `sharedpoles`, `tetrahedron`. Use `tetrahedron` for platonic solid polyhedra.
|
||||
|
||||
## Misc
|
||||
|
||||
- `connect()` replaces existing connections — no need to disconnect first
|
||||
- `project.name` returns the TOE filename, `project.folder` returns the directory
|
||||
@@ -0,0 +1,151 @@
|
||||
# GLSL Reference
|
||||
|
||||
## Uniforms
|
||||
|
||||
```
|
||||
TouchDesigner GLSL
|
||||
─────────────────────────────
|
||||
vec0name = 'uTime' → uniform float uTime;
|
||||
vec0valuex = 1.0 → uTime value
|
||||
```
|
||||
|
||||
### Pass Time
|
||||
|
||||
```python
|
||||
glsl_op.par.vec0name = 'uTime'
|
||||
glsl_op.par.vec0valuex.mode = ParMode.EXPRESSION
|
||||
glsl_op.par.vec0valuex.expr = 'absTime.seconds'
|
||||
```
|
||||
|
||||
```glsl
|
||||
uniform float uTime;
|
||||
void main() { float t = uTime * 0.5; }
|
||||
```
|
||||
|
||||
### Built-in Uniforms (TOP)
|
||||
|
||||
```glsl
|
||||
// Output resolution (always available)
|
||||
vec2 res = uTDOutputInfo.res.zw;
|
||||
|
||||
// Input texture (only when inputs connected)
|
||||
vec2 inputRes = uTD2DInfos[0].res.zw;
|
||||
vec4 color = texture(sTD2DInputs[0], vUV.st);
|
||||
|
||||
// UV coordinates
|
||||
vUV.st // 0-1 texture coords
|
||||
```
|
||||
|
||||
**IMPORTANT:** `uTD2DInfos` requires input textures. For standalone shaders use `uTDOutputInfo`.
|
||||
|
||||
## Built-in Utility Functions
|
||||
|
||||
```glsl
|
||||
// Noise
|
||||
float TDPerlinNoise(vec2/vec3/vec4 v);
|
||||
float TDSimplexNoise(vec2/vec3/vec4 v);
|
||||
|
||||
// Color conversion
|
||||
vec3 TDHSVToRGB(vec3 c);
|
||||
vec3 TDRGBToHSV(vec3 c);
|
||||
|
||||
// Matrix transforms
|
||||
mat4 TDTranslate(float x, float y, float z);
|
||||
mat3 TDRotateX/Y/Z(float radians);
|
||||
mat3 TDRotateOnAxis(float radians, vec3 axis);
|
||||
mat3 TDScale(float x, float y, float z);
|
||||
mat3 TDRotateToVector(vec3 forward, vec3 up);
|
||||
mat3 TDCreateRotMatrix(vec3 from, vec3 to); // vectors must be normalized
|
||||
|
||||
// Resolution struct
|
||||
struct TDTexInfo {
|
||||
vec4 res; // (1/width, 1/height, width, height)
|
||||
vec4 depth;
|
||||
};
|
||||
|
||||
// Output (always use this — handles sRGB correctly)
|
||||
fragColor = TDOutputSwizzle(color);
|
||||
|
||||
// Instancing (MAT only)
|
||||
int TDInstanceID();
|
||||
```
|
||||
|
||||
## glslTOP
|
||||
|
||||
Docked DATs created automatically:
|
||||
- `glsl1_pixel` — Pixel shader
|
||||
- `glsl1_compute` — Compute shader
|
||||
- `glsl1_info` — Compile info
|
||||
|
||||
### Pixel Shader Template
|
||||
|
||||
```glsl
|
||||
out vec4 fragColor;
|
||||
void main() {
|
||||
vec4 color = texture(sTD2DInputs[0], vUV.st);
|
||||
fragColor = TDOutputSwizzle(color);
|
||||
}
|
||||
```
|
||||
|
||||
### Compute Shader Template
|
||||
|
||||
```glsl
|
||||
layout (local_size_x = 8, local_size_y = 8) in;
|
||||
void main() {
|
||||
vec4 color = texelFetch(sTD2DInputs[0], ivec2(gl_GlobalInvocationID.xy), 0);
|
||||
TDImageStoreOutput(0, gl_GlobalInvocationID, color);
|
||||
}
|
||||
```
|
||||
|
||||
### Update Shader
|
||||
|
||||
```python
|
||||
op('/project1/glsl1_pixel').text = shader_code
|
||||
op('/project1/glsl1').cook(force=True)
|
||||
# Check errors:
|
||||
print(op('/project1/glsl1_info').text)
|
||||
```
|
||||
|
||||
## glslMAT
|
||||
|
||||
Docked DATs:
|
||||
- `glslmat1_vertex` — Vertex shader (param: `vdat`)
|
||||
- `glslmat1_pixel` — Pixel shader (param: `pdat`)
|
||||
- `glslmat1_info` — Compile info
|
||||
|
||||
Note: MAT uses `vdat`/`pdat`, TOP uses `vertexdat`/`pixeldat`.
|
||||
|
||||
### Vertex Shader Template
|
||||
|
||||
```glsl
|
||||
uniform float uTime;
|
||||
void main() {
|
||||
vec3 pos = TDPos();
|
||||
pos.z += sin(pos.x * 3.0 + uTime) * 0.2;
|
||||
vec4 worldSpacePos = TDDeform(pos);
|
||||
gl_Position = TDWorldToProj(worldSpacePos);
|
||||
}
|
||||
```
|
||||
|
||||
## Bayer 8x8 Dither Matrix
|
||||
|
||||
Reusable ordered dither function for retro/print aesthetics:
|
||||
|
||||
```glsl
|
||||
float bayer8(vec2 pos) {
|
||||
int x = int(mod(pos.x, 8.0)), y = int(mod(pos.y, 8.0)), idx = x + y * 8;
|
||||
int b[64] = int[64](
|
||||
0,32,8,40,2,34,10,42,48,16,56,24,50,18,58,26,
|
||||
12,44,4,36,14,46,6,38,60,28,52,20,62,30,54,22,
|
||||
3,35,11,43,1,33,9,41,51,19,59,27,49,17,57,25,
|
||||
15,47,7,39,13,45,5,37,63,31,55,23,61,29,53,21
|
||||
);
|
||||
return float(b[idx]) / 64.0;
|
||||
}
|
||||
```
|
||||
|
||||
## glslPOP / glsladvancedPOP / glslcopyPOP
|
||||
|
||||
All use compute shaders. Docked DATs follow naming convention:
|
||||
- `glsl1_compute` / `glsladv1_compute`
|
||||
- `glslcopy1_ptCompute` / `glslcopy1_vertCompute` / `glslcopy1_primCompute`
|
||||
@@ -0,0 +1,131 @@
|
||||
# Layout Compositor Reference
|
||||
|
||||
Patterns for building modular multi-panel grids — useful for HUD interfaces, data dashboards, and multi-source visual composites.
|
||||
|
||||
## Layout Approaches
|
||||
|
||||
| Approach | Best For | Notes |
|
||||
|----------|----------|-------|
|
||||
| `layoutTOP` | Fixed grid, quick setup | GPU, simple tiling |
|
||||
| Container COMP + `overTOP` | Full control, mixed-size panels | More setup, very flexible |
|
||||
| GLSL compositor | Procedural / BSP-style | Most powerful, more complex |
|
||||
|
||||
---
|
||||
|
||||
## layoutTOP
|
||||
|
||||
Built-in grid compositor — fastest path for uniform tile grids.
|
||||
|
||||
```python
|
||||
layout = root.create(layoutTOP, 'layout1')
|
||||
layout.par.resolutionw = 1920
|
||||
layout.par.resolutionh = 1080
|
||||
layout.par.cols = 3
|
||||
layout.par.rows = 2
|
||||
layout.par.gap = 4
|
||||
```
|
||||
|
||||
Connect inputs (up to cols×rows):
|
||||
```python
|
||||
layout.inputConnectors[0].connect(op('panel_radar'))
|
||||
layout.inputConnectors[1].connect(op('panel_wave'))
|
||||
layout.inputConnectors[2].connect(op('panel_data'))
|
||||
```
|
||||
|
||||
**Variable-width columns:** Not directly supported. Use overTOP approach for non-uniform grids.
|
||||
|
||||
---
|
||||
|
||||
## Container COMP Grid
|
||||
|
||||
Build each element as its own `containerCOMP`. Compose with `overTOP`:
|
||||
|
||||
```python
|
||||
def create_panel(root, name, width, height, x=0, y=0):
|
||||
panel = root.create(containerCOMP, name)
|
||||
panel.par.w = width
|
||||
panel.par.h = height
|
||||
panel.viewer = True
|
||||
return panel
|
||||
|
||||
# Composite with overTOP chain
|
||||
over1 = root.create(overTOP, 'over1')
|
||||
over1.inputConnectors[0].connect(panel_radar)
|
||||
over1.inputConnectors[1].connect(panel_wave)
|
||||
over1.par.topx2 = 0
|
||||
over1.par.topy2 = 512
|
||||
```
|
||||
|
||||
**Tip:** Use a `resolutionTOP` before each `overTOP` input if panels are different sizes.
|
||||
|
||||
---
|
||||
|
||||
## Panel Dividers (GLSL)
|
||||
|
||||
```glsl
|
||||
out vec4 fragColor;
|
||||
uniform vec2 uGridDivisions; // e.g. vec2(3, 2) for 3 cols, 2 rows
|
||||
uniform float uLineWidth; // pixels
|
||||
uniform vec4 uLineColor; // e.g. vec4(0.0, 1.0, 0.8, 0.6) for cyan
|
||||
|
||||
void main() {
|
||||
vec2 res = uTDOutputInfo.res.zw;
|
||||
vec2 uv = vUV.st;
|
||||
vec4 bg = texture(sTD2DInputs[0], uv);
|
||||
|
||||
float lineW = uLineWidth / res.x;
|
||||
float lineH = uLineWidth / res.y;
|
||||
|
||||
float vDiv = 0.0;
|
||||
for (float i = 1.0; i < uGridDivisions.x; i++) {
|
||||
float x = i / uGridDivisions.x;
|
||||
vDiv = max(vDiv, step(abs(uv.x - x), lineW));
|
||||
}
|
||||
|
||||
float hDiv = 0.0;
|
||||
for (float i = 1.0; i < uGridDivisions.y; i++) {
|
||||
float y = i / uGridDivisions.y;
|
||||
hDiv = max(hDiv, step(abs(uv.y - y), lineH));
|
||||
}
|
||||
|
||||
float line = max(vDiv, hDiv);
|
||||
vec4 result = mix(bg, uLineColor, line * uLineColor.a);
|
||||
fragColor = TDOutputSwizzle(result);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Element Library Pattern
|
||||
|
||||
Each visual element lives in its own `baseCOMP` as a reusable `.tox`:
|
||||
|
||||
### Standard Interface
|
||||
```
|
||||
inputs:
|
||||
- in_audio (CHOP) — audio envelope / beat data
|
||||
- in_data (CHOP) — optional data stream
|
||||
- in_control (CHOP) — intensity, color, speed params
|
||||
|
||||
outputs:
|
||||
- out_top (TOP) — rendered element
|
||||
```
|
||||
|
||||
### Network Structure
|
||||
```
|
||||
/project1/
|
||||
audio_bus/ ← all audio analysis (see audio-reactive.md)
|
||||
elements/
|
||||
elem_radar/ ← baseCOMP with out_top
|
||||
elem_wave/
|
||||
elem_data/
|
||||
compositor/
|
||||
layout1 ← layoutTOP or overTOP chain
|
||||
dividers1 ← GLSL divider lines
|
||||
postfx/ ← bloom → chrom → CRT stack (see postfx.md)
|
||||
null_out ← final output
|
||||
output/
|
||||
windowCOMP ← full-screen output
|
||||
```
|
||||
|
||||
**Key principle:** Elements don't know about each other. The compositor assembles them. Audio bus is referenced by all elements but lives separately.
|
||||
@@ -0,0 +1,106 @@
|
||||
# Operator Tips
|
||||
|
||||
## Wireframe Rendering Pattern
|
||||
|
||||
Reusable setup for wireframe geometry on black background:
|
||||
|
||||
```python
|
||||
# 1. Material
|
||||
mat = root.create(wireframeMAT, 'wire_mat')
|
||||
mat.par.colorr = 1.0; mat.par.colorg = 0.0; mat.par.colorb = 0.0
|
||||
mat.par.linewidth = 3
|
||||
|
||||
# 2. Geometry COMP
|
||||
geo = root.create(geometryCOMP, 'my_geo')
|
||||
geo.par.rx.expr = 'absTime.seconds * 30'
|
||||
geo.par.ry.expr = 'absTime.seconds * 45'
|
||||
geo.par.material = mat.path # NOTE: 'material' not 'mat'
|
||||
|
||||
# 3. Shape inside the geo
|
||||
box = geo.create(boxSOP, 'cube')
|
||||
box.par.sizex = 1.5; box.par.sizey = 1.5; box.par.sizez = 1.5
|
||||
|
||||
# 4. Camera
|
||||
cam = root.create(cameraCOMP, 'cam1')
|
||||
cam.par.tx = 0; cam.par.ty = 0; cam.par.tz = 4; cam.par.fov = 45
|
||||
|
||||
# 5. Render TOP
|
||||
render = root.create(renderTOP, 'render1')
|
||||
render.par.outputresolution = 'custom'
|
||||
render.par.resolutionw = 1280; render.par.resolutionh = 720
|
||||
render.par.bgcolorr = 0; render.par.bgcolorg = 0; render.par.bgcolorb = 0
|
||||
render.par.camera = cam.path
|
||||
render.par.geometry = geo.path
|
||||
|
||||
# 6. Output null
|
||||
out = root.create(nullTOP, 'out1')
|
||||
out.inputConnectors[0].connect(render.outputConnectors[0])
|
||||
```
|
||||
|
||||
**Key rules:**
|
||||
- Class names: `wireframeMAT` not `wireframeMat` (all-caps suffix)
|
||||
- Geometry SOPs/POPs go INSIDE the geo comp
|
||||
- Material: `geo.par.material` not `geo.par.mat`
|
||||
- Render geometry: `render.par.geometry = geo.path` (string path)
|
||||
- `wireframeMAT.par.wireframemode = 'topology'` for clean wireframe (vs `'tesselated'` for triangle edges)
|
||||
- Alternative: Use `renderTOP.par.overridemat` instead of per-geo material
|
||||
|
||||
## Feedback TOP
|
||||
|
||||
### Basic Structure
|
||||
|
||||
```
|
||||
input (initial state) ──┐
|
||||
├──→ feedback_top ──→ processing ──→ null_out
|
||||
│ ↑
|
||||
└── par.top = 'null_out' ────────────────┘
|
||||
```
|
||||
|
||||
### Setup Pattern
|
||||
|
||||
```python
|
||||
# 1. Processing chain
|
||||
glsl = root.create(glslTOP, 'sim')
|
||||
null_out = root.create(nullTOP, 'null_out')
|
||||
glsl.outputConnectors[0].connect(null_out.inputConnectors[0])
|
||||
|
||||
# 2. Feedback referencing null_out
|
||||
feedback = root.create(feedbackTOP, 'feedback')
|
||||
feedback.par.top = 'null_out'
|
||||
|
||||
# 3. Black initial state
|
||||
const_init = root.create(constantTOP, 'const_init')
|
||||
const_init.par.colorr = 0; const_init.par.colorg = 0; const_init.par.colorb = 0
|
||||
|
||||
# 4. Wire: initial → feedback, feedback → processing
|
||||
feedback.inputConnectors[0].connect(const_init)
|
||||
glsl.inputConnectors[0].connect(feedback)
|
||||
|
||||
# 5. Reset to apply initial state
|
||||
feedback.par.resetpulse.pulse()
|
||||
```
|
||||
|
||||
### Common Errors
|
||||
|
||||
| Error | Cause | Solution |
|
||||
|-------|-------|----------|
|
||||
| "Not enough sources specified" | No input connected | Connect initial state TOP |
|
||||
| Unexpected initial pattern | Wrong initial state | Use Constant TOP (black) |
|
||||
|
||||
### Tips
|
||||
|
||||
1. Use float format for simulations: `glsl.par.format = 'rgba32float'`
|
||||
2. Reset after setup: `feedback.par.resetpulse.pulse()`
|
||||
3. Match resolutions — feedback, processing, and initial state must match
|
||||
4. Soft boundary prevents edge artifacts:
|
||||
```glsl
|
||||
float edge = 3.0 * texel.x;
|
||||
float bx = smoothstep(0.0, edge, uv.x) * smoothstep(0.0, edge, 1.0 - uv.x);
|
||||
float by = smoothstep(0.0, edge, uv.y) * smoothstep(0.0, edge, 1.0 - uv.y);
|
||||
value *= bx * by;
|
||||
```
|
||||
|
||||
### Use Cases
|
||||
- **Wave Simulation** — R=height, G=velocity, black initial state
|
||||
- **Cellular Automata** — white=alive, black=dead, random noise initial state
|
||||
- **Trail / Motion Blur** — blend current frame with feedback, black initial
|
||||
@@ -143,20 +143,20 @@ Creating nodes with the same names you just destroyed in the SAME script causes
|
||||
```python
|
||||
# td_execute_python:
|
||||
for c in list(root.children):
|
||||
if c.valid and c.name.startswith('promo_'):
|
||||
if c.valid and c.name.startswith('my_'):
|
||||
c.destroy()
|
||||
# ... then create promo_audio, promo_shader etc. in same script → CRASHES
|
||||
# ... then create my_audio, my_shader etc. in same script → CRASHES
|
||||
```
|
||||
|
||||
**CORRECT (two separate calls):**
|
||||
```python
|
||||
# Call 1: td_execute_python — clean only
|
||||
for c in list(root.children):
|
||||
if c.valid and c.name.startswith('promo_'):
|
||||
if c.valid and c.name.startswith('my_'):
|
||||
c.destroy()
|
||||
|
||||
# Call 2: td_execute_python — build (separate MCP call)
|
||||
audio = root.create(audiofileinCHOP, 'promo_audio')
|
||||
audio = root.create(audiofileinCHOP, 'my_audio')
|
||||
# ... rest of build
|
||||
```
|
||||
|
||||
@@ -361,21 +361,13 @@ win.par.winopen.pulse()
|
||||
|
||||
`out.sample(x, y)` returns pixels from a single cook snapshot. Compare samples with 2+ second delays, or use screencapture on the display window.
|
||||
|
||||
### 32. Audio-reactive GLSL: dual-layer sync pipeline
|
||||
### 32. Audio-reactive GLSL: TD-side pipeline
|
||||
|
||||
For audio-synced visuals, use BOTH layers for maximum effect:
|
||||
|
||||
**Layer 1 (TD-side, real-time):** AudioFileIn → AudioSpectrum(timeslice=True, fftsize='256') → Math(gain=5) → choptoTOP(par.chop=math, layout='rowscropped') → GLSL input. The shader samples `sTD2DInputs[1]` at different x positions for bass/mid/hi. Record the TD output with MovieFileOut.
|
||||
|
||||
**Layer 2 (Python-side, post-hoc):** scipy FFT on the SAME audio file → per-frame features (rms, bass, mid, hi, beat detection) → drive ASCII brightness, chromatic aberration, beat flashes during the render pass.
|
||||
|
||||
Both layers locked to the same audio file = visuals genuinely sync to the beat at two independent stages.
|
||||
For audio-synced visuals: AudioFileIn → AudioSpectrum(timeslice=True, fftsize='256') → Math(gain=5) → choptoTOP(par.chop=math, layout='rowscropped') → GLSL input. The shader samples `sTD2DInputs[1]` at different x positions for bass/mid/hi. Record the TD output with MovieFileOut.
|
||||
|
||||
**Key gotcha:** AudioFileIn must be cued (`par.cue=True` → `par.cuepulse.pulse()`) then uncued (`par.cue=False`, `par.play=True`) before recording starts. Otherwise the spectrum is silent for the first few seconds.
|
||||
|
||||
### 33. twozero MCP: benchmark and prefer native tools
|
||||
|
||||
Benchmarked April 2026: twozero MCP with 36 native tools. The old curl/REST method (port 9981) had zero native tools.
|
||||
### 33. twozero MCP: prefer native tools
|
||||
|
||||
**Always prefer native MCP tools over td_execute_python:**
|
||||
- `td_create_operator` over `root.create()` scripts (handles viewport positioning)
|
||||
@@ -425,13 +417,16 @@ TD can show `fps:0` in `td_get_perf` while ops still cook and `TOP.save()` still
|
||||
|
||||
**a) Project is paused (playbar stopped).** TD's playbar can be toggled with spacebar. The `root` at `/` has no `.playbar` attribute (it's on the perform COMP). The easiest fix is sending a spacebar keypress via `td_input_execute`, though this tool can sometimes error. As a workaround, `TOP.save()` always works regardless of play state — use it to verify rendering is actually happening before spending time debugging FPS.
|
||||
|
||||
**b) Audio device CHOP blocking the main thread.** An `audiooutCHOP` with an active audio device can consume 300-400ms/s (2000%+ of frame budget), stalling the cook loop at FPS=0. Fix: keep the CHOP active but set `volume=0` to prevent the audio driver from blocking. Disabling it entirely (`active=False`) may also work but can prevent downstream audio processing CHOPs from cooking.
|
||||
**b) Audio device CHOP blocking the main thread (MOST COMMON).** An `audiodeviceoutCHOP` with `active=True` can consume 300-400ms/s (2000%+ of frame budget), stalling the cook loop at FPS=0. **`volume=0` is NOT sufficient** — the audio driver still blocks. Fix: `par.active = False`. This completely stops the CHOP from interacting with the audio driver. If you need audio monitoring, enable it only during short playback checks, then disable before recording.
|
||||
|
||||
Verified April 2026: disabling `audiodeviceoutCHOP` (`active=False`) restored FPS from 0 to 60 instantly, recovering from 2348% budget usage to 0.1%.
|
||||
|
||||
Diagnostic sequence when FPS=0:
|
||||
1. `td_get_perf` — check if any op has extreme CPU/s
|
||||
2. `TOP.save()` on the output — if it produces a valid image, the pipeline works, just not at real-time rate
|
||||
3. Check for blocking CHOPs (audioout, audiodevin, etc.)
|
||||
4. Toggle play state (spacebar, or check if absTime.seconds is advancing)
|
||||
1. `td_get_perf` — check if any op has extreme CPU/s (audiodeviceoutCHOP is the usual suspect)
|
||||
2. If audiodeviceoutCHOP shows >100ms/s: set `par.active = False` immediately
|
||||
3. `TOP.save()` on the output — if it produces a valid image, the pipeline works, just not at real-time rate
|
||||
4. Check for other blocking CHOPs (audiodevin, etc.)
|
||||
5. Toggle play state (spacebar, or check if absTime.seconds is advancing)
|
||||
|
||||
### 39. Recording while FPS=0 produces empty or near-empty files
|
||||
|
||||
@@ -484,9 +479,20 @@ If `td_write_dat` fails, fall back to `td_execute_python`:
|
||||
op("/project1/shader_code").text = shader_string
|
||||
```
|
||||
|
||||
### 42. td_execute_python does NOT return stdout or print() output
|
||||
### 42. td_execute_python DOES return print() output — use it for debugging
|
||||
|
||||
Despite what earlier versions of pitfall #33 stated, `print()` and `debug()` output from `td_execute_python` scripts does NOT appear in the MCP response. The response is always just `(ok)` + FPS/error summary. To read values back, use dedicated inspection tools (`td_get_operator_info`, `td_read_dat`, `td_read_chop`) instead of trying to print from within a script.
|
||||
`print()` statements in `td_execute_python` scripts appear in the MCP response text. This is the correct way to read values back from scripts. The response format is: printed output first, then `[fps X.X/X] [N err/N warn]` on a separate line.
|
||||
|
||||
However, the `result` variable (if you set one) does NOT appear verbatim — use `print()` for anything you need to read back:
|
||||
```python
|
||||
# CORRECT — appears in response:
|
||||
print('value:', some_value)
|
||||
|
||||
# WRONG — not reliably in response:
|
||||
result = some_value
|
||||
```
|
||||
|
||||
For structured data, use dedicated inspection tools (`td_get_operator_info`, `td_read_chop`) which return clean JSON.
|
||||
|
||||
### 43. td_get_operator_info JSON is appended with `[fps X.X/X]` — breaks json.loads()
|
||||
|
||||
@@ -496,13 +502,203 @@ clean = response_text.rsplit('[fps', 1)[0]
|
||||
data = json.loads(clean)
|
||||
```
|
||||
|
||||
### 44. td_get_screenshot is asynchronous — returns `{"status": "pending"}`
|
||||
### 44. td_get_screenshot is unreliable — returns `{"status": "pending"}` and may never deliver
|
||||
|
||||
Screenshots don't complete instantly. The tool returns `{"status": "pending", "requestId": "..."}` and the actual file appears later. Wait a few seconds before checking for the file. There is no callback or completion notification — poll the filesystem.
|
||||
Screenshots don't complete instantly. The tool returns `{"status": "pending", "requestId": "..."}` and the actual file may appear later — or may NEVER appear at all. In testing (April 2026), screenshots stayed "pending" indefinitely with no file written to disk, even though the shader was cooking at 8-30fps.
|
||||
|
||||
### 45. Recording duration is manual — no auto-stop at audio end
|
||||
**Do NOT rely on `td_get_screenshot` for frame capture.** For reliable frame capture, use MovieFileOut recording + ffmpeg frame extraction:
|
||||
```bash
|
||||
# Record in TD first, then extract frames:
|
||||
ffmpeg -y -i /tmp/td_output.mov -t 25 -vf 'fps=24' /tmp/td_frames/frame_%06d.png
|
||||
```
|
||||
|
||||
If you need a quick visual check, `td_get_screenshot` is worth trying (it sometimes works), but always have the recording fallback. There is no callback or completion notification — if the file doesn't appear after 5-10 seconds, it's not coming.
|
||||
|
||||
### 45. Heavy shaders cook below record FPS — many duplicate frames in output
|
||||
|
||||
A raymarched GLSL shader may only cook at 8-15fps even though MovieFileOut records at 60fps. The recording still works (TD writes the last-cooked frame each time), but the resulting file has many duplicate frames. When extracting frames for post-processing, use a lower fps filter to avoid redundant frames:
|
||||
```bash
|
||||
# Extract at 24fps from a 60fps recording of an 8fps shader:
|
||||
ffmpeg -y -i /tmp/td_output.mov -t 25 -vf 'fps=24' /tmp/td_frames/frame_%06d.png
|
||||
```
|
||||
Check actual cook FPS with `td_get_perf` before committing to a long recording. If FPS < 15, the output will be a slideshow regardless of the recording codec.
|
||||
|
||||
### 46. Recording duration is manual — no auto-stop at audio end
|
||||
|
||||
MovieFileOut records until `par.record = False` is set. If audio ends before you stop recording, the file keeps growing with repeated frames. Always stop recording promptly after the audio duration. For precision: set a timer on the agent side matching the audio length, then send `par.record = False`. Trim excess with ffmpeg as a safety net:
|
||||
```bash
|
||||
ffmpeg -i raw.mov -t 25 -c copy trimmed.mov
|
||||
```
|
||||
|
||||
### 47. AudioFileIn par.index stays at 0 in sequential mode — not a reliable progress indicator
|
||||
|
||||
When `audiofileinCHOP` is in `playmode=2` (sequential), `par.index.eval()` returns 0.0 even while audio IS actively playing and the spectrum IS receiving data. Do NOT use `par.index` to check playback progress in sequential mode.
|
||||
|
||||
**How to verify audio is actually playing:**
|
||||
- Read the spectrum CHOP values via `td_read_chop` — if values are non-zero and CHANGE between reads 1-2s apart, audio is flowing
|
||||
- Read the audio CHOP itself: non-zero waveform samples confirm the file is loaded and playing
|
||||
- `par.play.eval()` returning True is necessary but NOT sufficient — it can be True with no audio flowing if cue is stuck
|
||||
|
||||
### 48. GLSL shader whiteout — clamp audio spectrum values in the shader
|
||||
|
||||
Raw spectrum values multiplied by Math CHOP gain can produce very large numbers (5-20+) that blow out the shader's lighting, producing flat white/grey. The shader MUST clamp audio inputs:
|
||||
|
||||
```glsl
|
||||
float bass = texture(sTD2DInputs[1], vec2(0.05, 0.25)).r;
|
||||
bass = clamp(bass, 0.0, 3.0); // prevent whiteout
|
||||
mids = clamp(mids, 0.0, 3.0);
|
||||
hi = clamp(hi, 0.0, 3.0);
|
||||
```
|
||||
|
||||
Discovered when gain=10 produced ~0.13 (too dark) during quiet passages but gain=50 produced ~9.4 (total whiteout). Fix: keep gain=10, use `highfreqboost=3.0` on AudioSpectrum, clamp in shader.
|
||||
|
||||
### 49. Non-Commercial TD records at 1280x1280 (square) — always crop in post
|
||||
|
||||
Even with `resolutionw=1280, resolutionh=720` on the GLSL TOP, Non-Commercial TD may output 1280x1280 to MovieFileOut. Always check dimensions with ffprobe and crop during extraction:
|
||||
|
||||
```bash
|
||||
# Center-crop from 1280x1280 to 1280x720:
|
||||
ffmpeg -y -i /tmp/td_output.mov -t 25 -r 24 -vf "crop=1280:720:0:280" /tmp/frames/frame_%06d.png
|
||||
```
|
||||
|
||||
Large ProRes files (1-2GB) at 1280x1280 decode at ~3fps, so 25s of footage takes ~3 minutes to extract.
|
||||
|
||||
## Advanced Patterns (pitfalls 51+)
|
||||
|
||||
### 51. Connection syntax: use `outputConnectors`/`inputConnectors`, NOT `outputs`/`inputs`
|
||||
|
||||
```python
|
||||
# CORRECT
|
||||
src.outputConnectors[0].connect(dst.inputConnectors[0])
|
||||
# WRONG — raises IndexError or AttributeError
|
||||
src.outputs[0].connect(dst.inputs[0])
|
||||
```
|
||||
|
||||
For feedback TOP, BOTH are required:
|
||||
```python
|
||||
fb.par.top = target.path
|
||||
target.outputConnectors[0].connect(fb.inputConnectors[0])
|
||||
```
|
||||
|
||||
### 52. moviefileoutTOP `par.input` doesn't resolve via Python in TD 2025.32460
|
||||
|
||||
Setting `moviefileoutTOP.par.input` programmatically does NOT work. All forms fail silently with "Not enough sources specified."
|
||||
|
||||
**Workaround — frame capture + ffmpeg:**
|
||||
```python
|
||||
out = op('/project1/out')
|
||||
for i in range(300):
|
||||
delay = i * 5
|
||||
run(f"op('/project1/out').save('/tmp/frames/f_{i:04d}.png')", delayFrames=delay)
|
||||
# Then: ffmpeg -y -framerate 30 -i /tmp/frames/f_%04d.png -c:v prores -pix_fmt yuv420p /tmp/output.mov
|
||||
```
|
||||
|
||||
### 53. Batch frame capture — use `me.fetch`/`me.store` for state across calls
|
||||
|
||||
```python
|
||||
start = me.fetch('cap_frame', 0)
|
||||
for i in range(60):
|
||||
frame = start + i
|
||||
op('/project1/out').save(f'/tmp/frames/frame_{str(frame).zfill(4)}.png')
|
||||
me.store('cap_frame', start + 60)
|
||||
```
|
||||
Call 5 times for 300 frames. Each picks up where the last left off.
|
||||
|
||||
### 54. GLSL TOP pixel shader requirements in TD 2025
|
||||
|
||||
```glsl
|
||||
// REQUIRED — declare output
|
||||
layout(location = 0) out vec4 fragColor;
|
||||
|
||||
void main() {
|
||||
vec3 col = vec3(1.0, 0.0, 0.0);
|
||||
fragColor = TDOutputSwizzle(vec4(col, 1.0));
|
||||
}
|
||||
```
|
||||
**Built-in uniforms available:** `uTDOutputInfo.res` (vec4), `uTDTimeInfo.seconds`, `sTD2DInputs[N]`.
|
||||
**Auto-created DATs:** `name_pixel`, `name_vertex`, `name_compute` textDATs with example code.
|
||||
|
||||
### 55. TOP.save() doesn't advance time — identical frames in tight loops
|
||||
|
||||
`.save()` captures the current cooked frame without advancing TD's timeline:
|
||||
```python
|
||||
# WRONG — all frames identical
|
||||
for i in range(300):
|
||||
op('/project1/out').save(f'frames/f_{i:04d}.png')
|
||||
|
||||
# CORRECT — use run() with delayFrames
|
||||
for i in range(300):
|
||||
delay = i * 5
|
||||
run(f"op('/project1/out').save('frames/f_{i:04d}.png')", delayFrames=delay)
|
||||
```
|
||||
**NEVER use `time.sleep()` in TD** — it blocks the main thread and freezes the UI.
|
||||
|
||||
### 56. Feedback loop masks input changes — force switch during capture
|
||||
|
||||
With feedback TOP opacity 0.7+, the buffer dominates output. Switching input produces nearly identical frames.
|
||||
|
||||
**Fix — force switch index per capture:**
|
||||
```python
|
||||
for i in range(300):
|
||||
idx = (i // 8) % num_inputs
|
||||
delay = i * 5
|
||||
run(f"op('/project1/vswitch').par.index={idx}; op('/project1/out').save('f_{i:04d}.png')", delayFrames=delay)
|
||||
```
|
||||
|
||||
### 57. Large td_execute_python scripts fail — split into incremental calls
|
||||
|
||||
10+ operator creations in one script cause timing issues. Split into 2-4 calls of 2-4 operators each. Within one call, `create()` handles work immediately. Across calls, `op('name')` may return `None` if the previous call hasn't committed.
|
||||
|
||||
### 58. MCP instance reconnection after project.load()
|
||||
|
||||
`project.load(path)` changes the PID. After loading, call `td_list_instances()` and use the new `target_instance`. For TOX files: import as child comp instead (doesn't disconnect).
|
||||
|
||||
### 59. TOX reverse-engineering workflow
|
||||
|
||||
```python
|
||||
comp = root.loadTox(r'/path/to/file.tox')
|
||||
comp.name = '_study_comp'
|
||||
for child in comp.children:
|
||||
print(f'{child.name} ({child.OPType})')
|
||||
# Use td_get_operators_info, td_read_dat, check custom params
|
||||
```
|
||||
|
||||
### 60. sliderCOMP naming — TD appends suffix
|
||||
|
||||
TD auto-renames: `slider_brightness` → `slider_brightness1`. Always check names after creation.
|
||||
|
||||
### 61. create() requires full operator type suffix
|
||||
|
||||
```python
|
||||
# CORRECT
|
||||
proj.create('audiofileinCHOP', 'audio_in')
|
||||
proj.create('glslTOP', 'render')
|
||||
|
||||
# WRONG — raises "Unknown operator type"
|
||||
proj.create('audiofilein', 'audio_in')
|
||||
proj.create('glsl', 'render')
|
||||
```
|
||||
|
||||
### 62. Reparenting COMPs — use copyOPs, not connect()
|
||||
|
||||
Moving COMPs with `inputCOMPConnectors[0].connect()` fails. Use copy + destroy:
|
||||
```python
|
||||
copied = target.copyOPs([source]) # preserves internal wiring
|
||||
source.destroy()
|
||||
# Re-wire external connections manually after the move
|
||||
```
|
||||
|
||||
### 63. Slider wiring — expressionCHOP with op() expressions crashes TD
|
||||
|
||||
```python
|
||||
# CRASHES TD — don't do this
|
||||
echop = root.create(expressionCHOP, 'slider_ctrl')
|
||||
echop.par.chan0expr = 'op("/project1/controls/slider_brightness1").par.value0'
|
||||
|
||||
# WORKING — parameterCHOP as bridge
|
||||
pchop = root.create(parameterCHOP, 'slider_vals')
|
||||
pchop.par.ops = '/project1/controls'
|
||||
pchop.par.parameters = 'value0'
|
||||
pchop.par.custom = True
|
||||
pchop.par.builtin = False
|
||||
```
|
||||
@@ -0,0 +1,183 @@
|
||||
# Post-FX Reference
|
||||
|
||||
Bloom, CRT scanlines, chromatic aberration, and feedback glow patterns for live visual work.
|
||||
|
||||
---
|
||||
|
||||
## Bloom
|
||||
|
||||
### Built-in Bloom TOP
|
||||
|
||||
TD's `bloomTOP` is the fastest path — GPU-accelerated, no shader needed.
|
||||
|
||||
```python
|
||||
bloom = root.create(bloomTOP, 'bloom1')
|
||||
bloom.par.threshold = 0.6 # Luminance threshold (0-1)
|
||||
bloom.par.size = 0.03 # Spread radius (0-1)
|
||||
bloom.par.strength = 1.5 # Bloom intensity
|
||||
bloom.par.blendmode = 'add' # 'add' or 'screen'
|
||||
```
|
||||
|
||||
**Audio reactive bloom:**
|
||||
```python
|
||||
bloom.par.strength.mode = ParMode.EXPRESSION
|
||||
bloom.par.strength.expr = "op('audio_env')['envelope'][0] * 3.0 + 0.5"
|
||||
```
|
||||
|
||||
### GLSL Bloom (More Control)
|
||||
|
||||
For multi-pass bloom with color tinting:
|
||||
|
||||
```glsl
|
||||
// bloom_pixel.glsl — pass1: threshold + tint
|
||||
out vec4 fragColor;
|
||||
uniform float uThreshold;
|
||||
uniform vec3 uBloomColor;
|
||||
|
||||
void main() {
|
||||
vec4 col = texture(sTD2DInputs[0], vUV.st);
|
||||
float luma = dot(col.rgb, vec3(0.299, 0.587, 0.114));
|
||||
float bloom = max(0.0, luma - uThreshold);
|
||||
fragColor = TDOutputSwizzle(vec4(col.rgb * bloom * uBloomColor, col.a));
|
||||
}
|
||||
```
|
||||
|
||||
Then blur with `blurTOP` (size ~0.02-0.05), composite back over source with `addTOP` or `compositeTOP` in Add mode.
|
||||
|
||||
---
|
||||
|
||||
## CRT / Scanlines
|
||||
|
||||
Pure GLSL — create a `glslTOP` and paste into its `_pixel` DAT.
|
||||
|
||||
```glsl
|
||||
// crt_pixel.glsl
|
||||
out vec4 fragColor;
|
||||
uniform float uTime;
|
||||
uniform float uScanlineIntensity; // 0.0 - 1.0, default 0.4
|
||||
uniform float uCurvature; // 0.0 - 0.15, default 0.05
|
||||
uniform float uVignette; // 0.0 - 1.0, default 0.8
|
||||
|
||||
vec2 curveUV(vec2 uv, float amount) {
|
||||
uv = uv * 2.0 - 1.0;
|
||||
vec2 offset = abs(uv.yx) / vec2(6.0, 4.0);
|
||||
uv = uv + uv * offset * offset * amount;
|
||||
return uv * 0.5 + 0.5;
|
||||
}
|
||||
|
||||
void main() {
|
||||
vec2 res = uTDOutputInfo.res.zw;
|
||||
vec2 uv = vUV.st;
|
||||
|
||||
// CRT barrel distortion
|
||||
uv = curveUV(uv, uCurvature * 10.0);
|
||||
|
||||
// Kill pixels outside curved screen
|
||||
if (uv.x < 0.0 || uv.x > 1.0 || uv.y < 0.0 || uv.y > 1.0) {
|
||||
fragColor = vec4(0.0, 0.0, 0.0, 1.0);
|
||||
return;
|
||||
}
|
||||
|
||||
vec4 col = texture(sTD2DInputs[0], uv);
|
||||
|
||||
// Scanlines
|
||||
float scanline = sin(uv.y * res.y * 3.14159) * 0.5 + 0.5;
|
||||
col.rgb *= mix(1.0, scanline, uScanlineIntensity);
|
||||
|
||||
// Horizontal noise flicker
|
||||
float flicker = TDSimplexNoise(vec2(uv.y * 100.0, uTime * 8.0)) * 0.03;
|
||||
col.rgb += flicker;
|
||||
|
||||
// Vignette
|
||||
vec2 vig = uv * (1.0 - uv.yx);
|
||||
float v = pow(vig.x * vig.y * 15.0, uVignette);
|
||||
col.rgb *= v;
|
||||
|
||||
fragColor = TDOutputSwizzle(col);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Chromatic Aberration
|
||||
|
||||
Splits RGB channels and offsets them along screen axes.
|
||||
|
||||
```glsl
|
||||
out vec4 fragColor;
|
||||
uniform float uAmount; // 0.001 - 0.02, default 0.006
|
||||
|
||||
void main() {
|
||||
vec2 uv = vUV.st;
|
||||
vec2 dir = uv - 0.5;
|
||||
|
||||
float r = texture(sTD2DInputs[0], uv + dir * uAmount).r;
|
||||
float g = texture(sTD2DInputs[0], uv).g;
|
||||
float b = texture(sTD2DInputs[0], uv - dir * uAmount).b;
|
||||
float a = texture(sTD2DInputs[0], uv).a;
|
||||
|
||||
fragColor = TDOutputSwizzle(vec4(r, g, b, a));
|
||||
}
|
||||
```
|
||||
|
||||
**Audio-reactive variant** — spike aberration on beats:
|
||||
```glsl
|
||||
uniform float uBeat;
|
||||
void main() {
|
||||
vec2 uv = vUV.st;
|
||||
vec2 dir = uv - 0.5;
|
||||
float amount = uAmount + uBeat * 0.04;
|
||||
float r = texture(sTD2DInputs[0], uv + dir * amount * 1.2).r;
|
||||
float g = texture(sTD2DInputs[0], uv).g;
|
||||
float b = texture(sTD2DInputs[0], uv - dir * amount * 0.8).b;
|
||||
fragColor = TDOutputSwizzle(vec4(r, g, b, 1.0));
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Feedback Glow
|
||||
|
||||
Warm persistent trails for glow effects.
|
||||
|
||||
```glsl
|
||||
out vec4 fragColor;
|
||||
uniform float uDecay; // 0.92 - 0.98 for slow trails
|
||||
uniform vec3 uGlowColor; // tint accumulated feedback
|
||||
|
||||
void main() {
|
||||
vec2 uv = vUV.st;
|
||||
vec4 prev = texture(sTD2DInputs[0], uv); // feedback input
|
||||
vec4 curr = texture(sTD2DInputs[1], uv); // current frame
|
||||
|
||||
vec3 glow = prev.rgb * uDecay * uGlowColor;
|
||||
vec3 result = max(glow, curr.rgb);
|
||||
|
||||
fragColor = TDOutputSwizzle(vec4(result, 1.0));
|
||||
}
|
||||
```
|
||||
|
||||
**Tips:**
|
||||
- `uDecay = 0.95` → medium trail
|
||||
- `uDecay = 0.98` → long comet tail
|
||||
- Set `glslTOP` format to `rgba16float` for smooth gradients
|
||||
|
||||
---
|
||||
|
||||
## Full Post-FX Stack
|
||||
|
||||
Recommended order:
|
||||
|
||||
```
|
||||
[scene / composite]
|
||||
↓
|
||||
bloomTOP ← luminance threshold bloom
|
||||
↓
|
||||
glslTOP (chrom) ← chromatic aberration
|
||||
↓
|
||||
glslTOP (crt) ← scanlines + barrel distortion + vignette
|
||||
↓
|
||||
null_out ← final output
|
||||
```
|
||||
|
||||
**Performance note:** Each glslTOP is a full GPU pass. For 1920×1080 at 60fps this stack is comfortably real-time. For 4K, consider downsampling bloom input with `resolutionTOP` first.
|
||||
@@ -0,0 +1,131 @@
|
||||
# google_meet plugin
|
||||
|
||||
Let the hermes agent join a Google Meet call, transcribe it, optionally speak
|
||||
in it, and do the followup work afterwards.
|
||||
|
||||
## What ships
|
||||
|
||||
| Version | What | Status |
|
||||
|---|---|---|
|
||||
| v1 | Transcribe-only: Playwright joins Meet, scrapes captions to transcript file | ✓ ships by default |
|
||||
| v2 | Realtime duplex audio: bot speaks in-call via OpenAI Realtime + BlackHole/PulseAudio null-sink | ✓ opt in with `mode='realtime'` |
|
||||
| v3 | Remote node host: run the bot on a different machine than the gateway | ✓ opt in with `node='<name>'` |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─ gateway (Linux box, where hermes runs) ────────────────────────────┐
|
||||
│ │
|
||||
│ agent → meet_join(url, mode='realtime', node='my-mac') │
|
||||
│ │ │
|
||||
│ └─ NodeClient ─── ws ────┐ │
|
||||
│ │ │
|
||||
└──────────────────────────────────┼───────────────────────────────────┘
|
||||
│ wss (token auth)
|
||||
▼
|
||||
┌─ node host (user's Mac, signed-in Chrome lives here) ───────────────┐
|
||||
│ │
|
||||
│ NodeServer (from `hermes meet node run`) │
|
||||
│ │ │
|
||||
│ ├─ start_bot → process_manager.start() → spawns meet_bot │
|
||||
│ │ │
|
||||
│ └─ meet_bot (Playwright) │
|
||||
│ ├─ Chromium → meet.google.com │
|
||||
│ ├─ caption scraper → transcript.txt │
|
||||
│ └─ (realtime mode only) RealtimeSpeaker thread │
|
||||
│ ↓ │
|
||||
│ OpenAI Realtime WS → speaker.pcm │
|
||||
│ ↓ │
|
||||
│ paplay → null-sink ← Chrome fake mic │
|
||||
│ │
|
||||
└──────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Without v3: the whole right column runs on the gateway machine.
|
||||
Without v2: the "realtime" path is skipped; transcribe runs alone.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Purpose |
|
||||
|---|---|
|
||||
| `plugin.yaml` | manifest |
|
||||
| `__init__.py` | `register(ctx)` — registers 5 tools + `on_session_end` hook + `hermes meet` CLI |
|
||||
| `meet_bot.py` | Playwright bot subprocess (standalone, `python -m plugins.google_meet.meet_bot`) |
|
||||
| `process_manager.py` | local bot lifecycle + `enqueue_say` |
|
||||
| `tools.py` | agent-facing tools + node-routing helper |
|
||||
| `cli.py` | `hermes meet setup / auth / join / status / transcript / say / stop / node ...` |
|
||||
| `audio_bridge.py` | v2: PulseAudio null-sink (Linux) + BlackHole probe (macOS) |
|
||||
| `realtime/openai_client.py` | v2: `RealtimeSession` + `RealtimeSpeaker` (file-queue → OpenAI Realtime WS → PCM) |
|
||||
| `node/protocol.py` | v3: message envelope + validation |
|
||||
| `node/registry.py` | v3: `$HERMES_HOME/workspace/meetings/nodes.json` |
|
||||
| `node/server.py` | v3: `NodeServer` (runs on host machine) |
|
||||
| `node/client.py` | v3: `NodeClient` (used by tool handlers + CLI on gateway) |
|
||||
| `node/cli.py` | v3: `hermes meet node {run,list,approve,remove,status,ping}` |
|
||||
| `SKILL.md` | agent usage guide |
|
||||
|
||||
## Local quick start
|
||||
|
||||
```bash
|
||||
hermes plugins enable google_meet
|
||||
hermes meet install # pip + Chromium
|
||||
hermes meet setup # preflight
|
||||
hermes meet auth # optional
|
||||
hermes meet join https://meet.google.com/abc-defg-hij # transcribe
|
||||
```
|
||||
|
||||
## Realtime mode
|
||||
|
||||
Linux (preferred, most automated):
|
||||
```bash
|
||||
hermes meet install --realtime # installs pulseaudio-utils
|
||||
echo 'OPENAI_API_KEY=sk-...' >> ~/.hermes/.env
|
||||
hermes meet join https://meet.google.com/abc-defg-hij --mode realtime
|
||||
# then from the agent or CLI:
|
||||
hermes meet say "Good morning everyone, I'm the note-taker bot."
|
||||
```
|
||||
|
||||
macOS:
|
||||
```bash
|
||||
hermes meet install --realtime # runs: brew install blackhole-2ch ffmpeg
|
||||
# then — manually! — open System Settings → Sound → Input → BlackHole 2ch
|
||||
echo 'OPENAI_API_KEY=sk-...' >> ~/.hermes/.env
|
||||
hermes meet join https://meet.google.com/abc-defg-hij --mode realtime
|
||||
```
|
||||
|
||||
On macOS, hermes will **not** switch your system audio input automatically — the
|
||||
user has to do it. This is deliberate: switching default input on a whim would
|
||||
be a surprising side effect.
|
||||
|
||||
## Remote node host
|
||||
|
||||
On the node machine (e.g. user's Mac with a signed-in Chrome):
|
||||
```bash
|
||||
pip install playwright websockets
|
||||
python -m playwright install chromium
|
||||
hermes plugins enable google_meet
|
||||
hermes meet node run --display-name my-mac --host 0.0.0.0 --port 18789
|
||||
# prints the bearer token on first run; copy it
|
||||
```
|
||||
|
||||
On the gateway:
|
||||
```bash
|
||||
hermes meet node approve my-mac ws://<mac-ip>:18789 <token>
|
||||
hermes meet node ping my-mac
|
||||
# now any meet_* tool call accepts node='my-mac' (or 'auto')
|
||||
```
|
||||
|
||||
## Safety
|
||||
|
||||
- URL gate: only `https://meet.google.com/abc-defg-hij`, `/new`, `/lookup/<id>`.
|
||||
- No calendar scanning, no auto-dial, no auto-consent announcement.
|
||||
- Node server uses bearer-token auth; no key exchange, no TLS termination
|
||||
built in — run it on a LAN or behind a reverse proxy you trust.
|
||||
- One active meeting per (gateway, node) pair. A second `meet_join` leaves the first.
|
||||
- `meet_say` refuses unless the active meeting was started with `mode='realtime'`.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- **Calendar scanning** — deliberately not implemented. Join URLs must be explicit.
|
||||
- **Multi-tenant node sharing** — a node serves one gateway at a time.
|
||||
- **Windows** — audio bridging isn't tested; `register()` no-ops on Windows.
|
||||
- **System audio input switching on macOS** — user responsibility, not the bot's.
|
||||
@@ -0,0 +1,148 @@
|
||||
---
|
||||
name: google_meet
|
||||
description: Join a Google Meet call, transcribe live captions, optionally speak in realtime, and do the followup work afterwards. Use when the user asks the agent to sit in on a meeting, take notes, summarize, respond in-call, or action items from it.
|
||||
version: 0.2.0
|
||||
platforms:
|
||||
- linux
|
||||
- macos
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [meetings, google-meet, transcription, realtime-voice]
|
||||
---
|
||||
|
||||
# google_meet
|
||||
|
||||
## When to use
|
||||
|
||||
The user says any of:
|
||||
|
||||
- "join my Meet at <url>"
|
||||
- "take notes on this meeting"
|
||||
- "summarize the meeting and send followups"
|
||||
- "sit in on my standup"
|
||||
- "be a bot in this call and speak up when X"
|
||||
|
||||
## Two modes
|
||||
|
||||
| Mode | What the bot does |
|
||||
|---|---|
|
||||
| `transcribe` (default) | Joins, enables captions, scrapes a transcript. Listen-only. |
|
||||
| `realtime` | Same as transcribe PLUS speaks into the meeting via OpenAI Realtime. The agent calls `meet_say(text)` and the bot's voice comes out of the call. |
|
||||
|
||||
Pick `realtime` only when the user actually wants the agent to speak. It costs real money (OpenAI Realtime is pay-per-audio-minute) and requires a virtual audio device set up on the machine running the bot.
|
||||
|
||||
## Two locations
|
||||
|
||||
| Location | When |
|
||||
|---|---|
|
||||
| Local (default) | Gateway machine runs the Playwright bot directly. |
|
||||
| Remote node (`node="<name>"`) | Bot runs on a different machine that has a signed-in Chrome and (for realtime) a configured audio bridge. Useful when the gateway runs on a headless Linux box but the user's real signed-in Chrome lives on their Mac. |
|
||||
|
||||
## Prerequisites the user must handle once
|
||||
|
||||
Easiest path — run the built-in installer:
|
||||
|
||||
```bash
|
||||
hermes plugins enable google_meet
|
||||
hermes meet install # pip deps + Chromium (transcribe only)
|
||||
hermes meet install --realtime # + pulseaudio-utils / brew blackhole+ffmpeg
|
||||
hermes meet auth # optional; skips guest-lobby wait
|
||||
hermes meet setup # preflight checks
|
||||
```
|
||||
|
||||
`hermes meet install --realtime` prompts before running `sudo apt-get` (Linux)
|
||||
or `brew install` (macOS). Pass `--yes` to skip the prompt. It will NOT touch
|
||||
your macOS default-input setting — you have to select BlackHole 2ch in
|
||||
System Settings yourself before starting a realtime meeting.
|
||||
|
||||
Or do it manually:
|
||||
```bash
|
||||
pip install playwright websockets && python -m playwright install chromium
|
||||
|
||||
# For realtime mode, additionally:
|
||||
# Linux: sudo apt install pulseaudio-utils
|
||||
# macOS: brew install blackhole-2ch ffmpeg
|
||||
# → System Settings → Sound → Input → BlackHole 2ch
|
||||
# Then set OPENAI_API_KEY or HERMES_MEET_REALTIME_KEY in ~/.hermes/.env
|
||||
```
|
||||
|
||||
For a remote node:
|
||||
```bash
|
||||
# on the user's Mac (where Chrome is signed in):
|
||||
pip install playwright websockets && python -m playwright install chromium
|
||||
hermes plugins enable google_meet
|
||||
hermes meet node run --display-name my-mac # persistent server
|
||||
# copy the printed token
|
||||
|
||||
# on the gateway:
|
||||
hermes meet node approve my-mac ws://<mac-ip>:18789 <token>
|
||||
hermes meet node ping my-mac # confirm reachable
|
||||
```
|
||||
|
||||
Run `hermes meet setup` to preflight local prereqs.
|
||||
|
||||
## Flow
|
||||
|
||||
1. **Join** — call `meet_join(url=..., mode=..., node=...)`. Returns immediately.
|
||||
2. **Announce yourself** — no auto-consent. Say (in whatever channel the user is watching): "A Hermes agent bot is in this call taking notes."
|
||||
3. **Poll** — `meet_status()` for liveness, `meet_transcript(last=20)` for recent captions. Don't re-read the whole transcript every turn.
|
||||
4. **Speak (realtime only)** — `meet_say(text="...")` queues text for TTS. The speech lags by ~2s. Don't spam it.
|
||||
5. **Leave** — `meet_leave()` when done, or set `duration="30m"` on `meet_join` for auto-leave.
|
||||
6. **Follow up** — read `meet_transcript()` in full, summarize, and use regular tools to send the recap, file issues, schedule followups.
|
||||
|
||||
## Tool reference
|
||||
|
||||
| Tool | Parameters | Use |
|
||||
|---|---|---|
|
||||
| `meet_join` | `url`, `mode?`, `guest_name?`, `duration?`, `headed?`, `node?` | Start bot |
|
||||
| `meet_status` | `node?` | Liveness + progress |
|
||||
| `meet_transcript` | `last?`, `node?` | Read captions |
|
||||
| `meet_leave` | `node?` | Close bot |
|
||||
| `meet_say` | `text`, `node?` | Speak in realtime meeting |
|
||||
|
||||
`node?` on all tools: pass a registered node name (or `"auto"` for the sole node) to operate a remote bot instead of a local one. Omit for local.
|
||||
|
||||
## Important limits
|
||||
|
||||
- Captions are only as good as Google Meet's live captions. English-biased, lossy on overlapping speakers.
|
||||
- Guest mode sits in the lobby until a host admits. Warn the user; `hermes meet auth` avoids this.
|
||||
- **Lobby timeout**: if the host doesn't admit the bot within 5 minutes (configurable via `HERMES_MEET_LOBBY_TIMEOUT` env), the bot leaves and `meet_status` reports `leaveReason: "lobby_timeout"`.
|
||||
- **One active meeting per install per location.** A second `meet_join` leaves the first.
|
||||
- **Windows not supported.**
|
||||
- Realtime mode needs a virtual audio device. If the audio bridge setup fails, the bot falls back to transcribe mode and flags it in `meet_status().error`.
|
||||
- `meet_say` requires `mode='realtime'` on the originating `meet_join`. Calling it against a transcribe-mode meeting returns a clear error.
|
||||
- **Barge-in is best-effort.** When a caption arrives attributed to a real participant while the bot is generating audio, the bot sends `response.cancel` to OpenAI Realtime. Captions take ~500ms to show up, so the bot will talk over the first second or so of a human interruption.
|
||||
|
||||
## Status dict reference
|
||||
|
||||
`meet_status()` returns (subset shown, there are more):
|
||||
|
||||
| Key | Meaning |
|
||||
|---|---|
|
||||
| `inCall` | Past the lobby. False while waiting for admission. |
|
||||
| `lobbyWaiting` | Clicked "Ask to join", waiting on host. |
|
||||
| `joinAttemptedAt` / `joinedAt` | Timestamps for lobby-click and actual admission. |
|
||||
| `captioning` | Caption observer is installed. |
|
||||
| `transcriptLines` / `lastCaptionAt` | Transcript progress. |
|
||||
| `realtime` / `realtimeReady` | Realtime mode provisioned / WS connected. |
|
||||
| `realtimeDevice` | Audio device name the bot is feeding (e.g. `hermes_meet_src`). |
|
||||
| `audioBytesOut` / `lastAudioOutAt` | How much PCM the OpenAI session has produced. |
|
||||
| `lastBargeInAt` | Timestamp of the most recent `response.cancel` sent. |
|
||||
| `leaveReason` | `duration_expired`, `lobby_timeout`, `denied`, `page_closed`, or null. |
|
||||
| `error` | Last error (soft — bot may still be running). |
|
||||
|
||||
## Transcript location
|
||||
|
||||
Local:
|
||||
```
|
||||
$HERMES_HOME/workspace/meetings/<meeting-id>/transcript.txt
|
||||
```
|
||||
|
||||
Remote node: transcript lives on the node host's disk. Use `meet_transcript(node=...)` to read it over RPC.
|
||||
|
||||
## Safety
|
||||
|
||||
- URL regex: only `https://meet.google.com/...` URLs pass.
|
||||
- No calendar scanning. No auto-dial.
|
||||
- Remote nodes use bearer-token auth; tokens are generated on the node (32 hex chars, persisted in `$HERMES_HOME/workspace/meetings/node_token.json`) and must be copied to the gateway via `hermes meet node approve`.
|
||||
- `meet_say` text is rate-limited by the OpenAI Realtime session; spam-protection is the bot's problem, not yours, but still — don't queue hundreds of lines.
|
||||
@@ -0,0 +1,103 @@
|
||||
"""google_meet plugin — let the agent join a Meet call, transcribe it, follow up.
|
||||
|
||||
v1: transcribe-only. Spawns a headless Chromium via Playwright, joins the Meet
|
||||
URL, enables live captions, scrapes them into a transcript file. The agent then
|
||||
has the transcript in its workspace and can do whatever followup work it needs
|
||||
using its regular tools.
|
||||
|
||||
v2 (not in this PR): realtime duplex audio so the agent can speak in the
|
||||
meeting, via OpenAI Realtime / Gemini Live + BlackHole / PulseAudio null-sink.
|
||||
``meet_say`` exists as a stub today so the tool surface is stable.
|
||||
|
||||
Explicit-by-design: only joins ``https://meet.google.com/`` URLs explicitly
|
||||
passed in. No calendar scanning, no auto-dial, no consent announcement.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import platform
|
||||
|
||||
from plugins.google_meet import process_manager as pm
|
||||
from plugins.google_meet.cli import register_cli as _register_meet_cli
|
||||
from plugins.google_meet.cli import meet_command as _meet_command
|
||||
from plugins.google_meet.tools import (
|
||||
MEET_JOIN_SCHEMA,
|
||||
MEET_LEAVE_SCHEMA,
|
||||
MEET_SAY_SCHEMA,
|
||||
MEET_STATUS_SCHEMA,
|
||||
MEET_TRANSCRIPT_SCHEMA,
|
||||
check_meet_requirements,
|
||||
handle_meet_join,
|
||||
handle_meet_leave,
|
||||
handle_meet_say,
|
||||
handle_meet_status,
|
||||
handle_meet_transcript,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_TOOLS = (
|
||||
("meet_join", MEET_JOIN_SCHEMA, handle_meet_join, "📞"),
|
||||
("meet_status", MEET_STATUS_SCHEMA, handle_meet_status, "🟢"),
|
||||
("meet_transcript", MEET_TRANSCRIPT_SCHEMA, handle_meet_transcript, "📝"),
|
||||
("meet_leave", MEET_LEAVE_SCHEMA, handle_meet_leave, "👋"),
|
||||
("meet_say", MEET_SAY_SCHEMA, handle_meet_say, "🗣️"),
|
||||
)
|
||||
|
||||
|
||||
def _on_session_end(**kwargs) -> None:
|
||||
"""Best-effort cleanup — if a meet bot is still running when the session
|
||||
ends, leave the call so we don't orphan a headless Chromium.
|
||||
|
||||
No-ops when nothing is active. Swallows all exceptions — session end must
|
||||
not fail because the bot cleanup hit an edge case.
|
||||
"""
|
||||
try:
|
||||
status = pm.status()
|
||||
if status.get("ok") and status.get("alive"):
|
||||
pm.stop(reason="session ended")
|
||||
except Exception as e: # pragma: no cover — defensive
|
||||
logger.debug("google_meet on_session_end cleanup failed: %s", e)
|
||||
|
||||
|
||||
def register(ctx) -> None:
|
||||
"""Register tools, CLI, and lifecycle hooks.
|
||||
|
||||
Called once by the plugin loader when the plugin is enabled via
|
||||
``plugins.enabled`` in config.yaml.
|
||||
"""
|
||||
# Windows is not supported in v1 — audio routing for v2 doesn't have a
|
||||
# tested path there and guest-join Chromium is flakier. Refuse to register
|
||||
# rather than half-working.
|
||||
system = platform.system().lower()
|
||||
if system not in ("linux", "darwin"):
|
||||
logger.info(
|
||||
"google_meet plugin: platform=%s not supported (linux/macos only)",
|
||||
system,
|
||||
)
|
||||
return
|
||||
|
||||
for name, schema, handler, emoji in _TOOLS:
|
||||
ctx.register_tool(
|
||||
name=name,
|
||||
toolset="google_meet",
|
||||
schema=schema,
|
||||
handler=handler,
|
||||
check_fn=check_meet_requirements,
|
||||
emoji=emoji,
|
||||
)
|
||||
|
||||
ctx.register_cli_command(
|
||||
name="meet",
|
||||
help="Google Meet bot (join, transcribe, follow up)",
|
||||
setup_fn=_register_meet_cli,
|
||||
handler_fn=_meet_command,
|
||||
description=(
|
||||
"Let the hermes agent join a Google Meet call and scrape live "
|
||||
"captions into a transcript. See: hermes meet setup"
|
||||
),
|
||||
)
|
||||
|
||||
ctx.register_hook("on_session_end", _on_session_end)
|
||||
@@ -0,0 +1,244 @@
|
||||
"""Virtual audio bridge for feeding generated speech into Chrome's mic.
|
||||
|
||||
v2 module. Provisions a platform-specific virtual audio device so the
|
||||
Meet bot's Chromium instance can be pointed at an input source we
|
||||
control. The OpenAI Realtime client writes PCM bytes into this device;
|
||||
Chrome reads them as if they were coming from a microphone.
|
||||
|
||||
Linux (primary): uses pactl (PulseAudio) to create a null-sink plus a
|
||||
virtual source whose master is the null-sink's monitor. Callers set
|
||||
PULSE_SOURCE=<source_name> in Chrome's env and pass the fake-mic flag.
|
||||
|
||||
macOS: requires BlackHole 2ch to be installed. This module only
|
||||
verifies its presence and returns the device name; routing OS default
|
||||
input is left to the user (or a future switchaudio-osx integration) to
|
||||
avoid surprising the user's system audio state.
|
||||
|
||||
Windows: not supported in v2.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
|
||||
_BLACKHOLE_DEVICE = "BlackHole 2ch"
|
||||
|
||||
|
||||
class AudioBridge:
|
||||
"""Manages a virtual audio device for Chrome fake-mic input.
|
||||
|
||||
Call ``setup()`` once before launching the Meet bot and
|
||||
``teardown()`` when the session ends. ``teardown()`` is idempotent.
|
||||
"""
|
||||
|
||||
def __init__(self, name_prefix: str = "hermes_meet") -> None:
|
||||
self._name_prefix = name_prefix
|
||||
self._platform: Optional[str] = None
|
||||
self._device_name: Optional[str] = None
|
||||
self._write_target: Optional[str] = None
|
||||
self._module_ids: list[int] = []
|
||||
self._torn_down = False
|
||||
|
||||
# ── public properties ─────────────────────────────────────────────────
|
||||
|
||||
@property
|
||||
def device_name(self) -> str:
|
||||
if not self._device_name:
|
||||
raise RuntimeError("AudioBridge not set up yet")
|
||||
return self._device_name
|
||||
|
||||
@property
|
||||
def write_target(self) -> str:
|
||||
if not self._write_target:
|
||||
raise RuntimeError("AudioBridge not set up yet")
|
||||
return self._write_target
|
||||
|
||||
# ── lifecycle ─────────────────────────────────────────────────────────
|
||||
|
||||
def setup(self) -> dict:
|
||||
"""Provision the virtual audio device.
|
||||
|
||||
Returns a dict describing the device. Raises RuntimeError on
|
||||
unsupported platforms or when required system tools are missing.
|
||||
"""
|
||||
system = platform.system()
|
||||
if system == "Linux":
|
||||
return self._setup_linux()
|
||||
if system == "Darwin":
|
||||
return self._setup_darwin()
|
||||
if system == "Windows":
|
||||
raise RuntimeError("windows not supported in v2")
|
||||
raise RuntimeError(f"unsupported platform: {system}")
|
||||
|
||||
def teardown(self) -> None:
|
||||
"""Release the virtual audio device. Idempotent."""
|
||||
if self._torn_down:
|
||||
return
|
||||
# Only Linux needs explicit unloading.
|
||||
if self._platform == "linux" and self._module_ids:
|
||||
# Unload in reverse order (virtual-source before null-sink).
|
||||
for mod_id in reversed(self._module_ids):
|
||||
try:
|
||||
subprocess.run(
|
||||
["pactl", "unload-module", str(mod_id)],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
)
|
||||
except Exception:
|
||||
# Best-effort teardown — never raise from here.
|
||||
pass
|
||||
self._module_ids = []
|
||||
self._torn_down = True
|
||||
|
||||
# ── platform impls ────────────────────────────────────────────────────
|
||||
|
||||
def _setup_linux(self) -> dict:
|
||||
sink_name = f"{self._name_prefix}_sink"
|
||||
src_name = f"{self._name_prefix}_src"
|
||||
|
||||
try:
|
||||
sink_out = subprocess.run(
|
||||
[
|
||||
"pactl",
|
||||
"load-module",
|
||||
"module-null-sink",
|
||||
f"sink_name={sink_name}",
|
||||
f"sink_properties=device.description=HermesMeetSink",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"pactl not found — install PulseAudio/pipewire-pulse"
|
||||
) from exc
|
||||
except subprocess.CalledProcessError as exc:
|
||||
raise RuntimeError(
|
||||
f"pactl load-module null-sink failed: {exc.stderr or exc}"
|
||||
) from exc
|
||||
|
||||
sink_mod_id = self._parse_module_id(sink_out.stdout)
|
||||
|
||||
try:
|
||||
src_out = subprocess.run(
|
||||
[
|
||||
"pactl",
|
||||
"load-module",
|
||||
"module-virtual-source",
|
||||
f"source_name={src_name}",
|
||||
f"master={sink_name}.monitor",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
# Roll back the null-sink we just created so we don't leak it.
|
||||
subprocess.run(
|
||||
["pactl", "unload-module", str(sink_mod_id)],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"pactl load-module virtual-source failed: {exc.stderr or exc}"
|
||||
) from exc
|
||||
|
||||
src_mod_id = self._parse_module_id(src_out.stdout)
|
||||
|
||||
self._platform = "linux"
|
||||
self._device_name = src_name
|
||||
self._write_target = sink_name
|
||||
self._module_ids = [sink_mod_id, src_mod_id]
|
||||
self._torn_down = False
|
||||
|
||||
return {
|
||||
"platform": "linux",
|
||||
"device_name": src_name,
|
||||
"sample_rate": 48000,
|
||||
"channels": 2,
|
||||
"module_ids": list(self._module_ids),
|
||||
"write_target": sink_name,
|
||||
}
|
||||
|
||||
def _setup_darwin(self) -> dict:
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
["system_profiler", "SPAudioDataType"],
|
||||
text=True,
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"system_profiler not found (macOS-only command)"
|
||||
) from exc
|
||||
except subprocess.CalledProcessError as exc:
|
||||
raise RuntimeError(
|
||||
f"system_profiler failed: {exc.output}"
|
||||
) from exc
|
||||
|
||||
if "BlackHole" not in out:
|
||||
raise RuntimeError(
|
||||
"BlackHole virtual audio device not installed. "
|
||||
"Install via: brew install blackhole-2ch"
|
||||
)
|
||||
|
||||
self._platform = "darwin"
|
||||
self._device_name = _BLACKHOLE_DEVICE
|
||||
self._write_target = _BLACKHOLE_DEVICE
|
||||
self._module_ids = []
|
||||
self._torn_down = False
|
||||
|
||||
return {
|
||||
"platform": "darwin",
|
||||
"device_name": _BLACKHOLE_DEVICE,
|
||||
"sample_rate": 48000,
|
||||
"channels": 2,
|
||||
"module_ids": [],
|
||||
"write_target": _BLACKHOLE_DEVICE,
|
||||
}
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
@staticmethod
|
||||
def _parse_module_id(stdout: str) -> int:
|
||||
"""pactl load-module prints the new module ID to stdout."""
|
||||
text = (stdout or "").strip()
|
||||
if not text:
|
||||
raise RuntimeError("pactl load-module returned empty stdout")
|
||||
# Take the last whitespace-separated token on the first non-empty line.
|
||||
first = text.splitlines()[0].strip()
|
||||
token = first.split()[-1]
|
||||
try:
|
||||
return int(token)
|
||||
except ValueError as exc:
|
||||
raise RuntimeError(
|
||||
f"could not parse pactl module id from: {stdout!r}"
|
||||
) from exc
|
||||
|
||||
|
||||
def chrome_fake_audio_flags(bridge_info: dict) -> list[str]:
|
||||
"""Return Chrome flags for using the fake audio input.
|
||||
|
||||
The PulseAudio source is selected via the ``PULSE_SOURCE`` env var,
|
||||
which callers must set in Chrome's environment before launch:
|
||||
|
||||
env["PULSE_SOURCE"] = bridge_info["device_name"]
|
||||
|
||||
On macOS the caller must ensure the system default audio input is
|
||||
set to the returned BlackHole device (we do not flip that switch).
|
||||
"""
|
||||
system = platform.system()
|
||||
if system == "Linux":
|
||||
# Chromium on Linux picks up the PulseAudio source selected via
|
||||
# PULSE_SOURCE env var; the fake-ui flag skips the permission
|
||||
# prompt so the bot can pick "use my mic" without user input.
|
||||
return ["--use-fake-ui-for-media-stream"]
|
||||
if system == "Darwin":
|
||||
return ["--use-fake-ui-for-media-stream"]
|
||||
if system == "Windows":
|
||||
raise RuntimeError("windows not supported in v2")
|
||||
raise RuntimeError(f"unsupported platform: {system}")
|
||||
@@ -0,0 +1,478 @@
|
||||
"""CLI commands for the google_meet plugin.
|
||||
|
||||
Wires ``hermes meet <subcommand>``:
|
||||
setup — preflight playwright, chromium, auth file, print fixes
|
||||
auth — open a browser to sign into Google, save storage state
|
||||
join <url> — join a Meet URL synchronously (also callable from the agent)
|
||||
status — print current bot state
|
||||
transcript — print the transcript
|
||||
stop — leave the current meeting
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
from plugins.google_meet import process_manager as pm
|
||||
from plugins.google_meet.meet_bot import _is_safe_meet_url
|
||||
|
||||
|
||||
def _auth_state_path() -> Path:
|
||||
return Path(get_hermes_home()) / "workspace" / "meetings" / "auth.json"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# argparse wiring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def register_cli(subparser: argparse.ArgumentParser) -> None:
|
||||
"""Build the ``hermes meet`` argparse tree.
|
||||
|
||||
Called by :func:`_register_cli_commands` at plugin load time.
|
||||
"""
|
||||
subs = subparser.add_subparsers(dest="meet_command")
|
||||
|
||||
subs.add_parser("setup", help="Preflight: playwright, chromium, auth")
|
||||
|
||||
inst_p = subs.add_parser(
|
||||
"install",
|
||||
help="Install prerequisites (pip deps, Chromium, platform audio tools)",
|
||||
)
|
||||
inst_p.add_argument(
|
||||
"--realtime", action="store_true",
|
||||
help="Also install realtime audio tools (pulseaudio-utils on Linux, BlackHole+ffmpeg on macOS). Uses sudo/brew, prompts before invoking either.",
|
||||
)
|
||||
inst_p.add_argument(
|
||||
"--yes", "-y", action="store_true",
|
||||
help="Answer yes to all prompts (use with care; will run sudo apt-get or brew without asking).",
|
||||
)
|
||||
|
||||
subs.add_parser("auth", help="Sign in to Google and save session state")
|
||||
|
||||
join_p = subs.add_parser("join", help="Join a Meet URL")
|
||||
join_p.add_argument("url", help="https://meet.google.com/...")
|
||||
join_p.add_argument("--guest-name", default="Hermes Agent")
|
||||
join_p.add_argument("--duration", default=None, help="e.g. 30m, 2h, 90s")
|
||||
join_p.add_argument("--headed", action="store_true", help="show browser")
|
||||
join_p.add_argument(
|
||||
"--mode", choices=("transcribe", "realtime"), default="transcribe",
|
||||
help="transcribe (default, listen-only) or realtime (speak via OpenAI Realtime)"
|
||||
)
|
||||
join_p.add_argument(
|
||||
"--node", default=None,
|
||||
help="remote node name, or 'auto' to use the sole registered node"
|
||||
)
|
||||
|
||||
subs.add_parser("status", help="Print current Meet bot state")
|
||||
|
||||
tr_p = subs.add_parser("transcript", help="Print the scraped transcript")
|
||||
tr_p.add_argument("--last", type=int, default=None)
|
||||
|
||||
say_p = subs.add_parser("say", help="Speak text in an active realtime meeting")
|
||||
say_p.add_argument("text", help="what to say")
|
||||
say_p.add_argument("--node", default=None)
|
||||
|
||||
subs.add_parser("stop", help="Leave the current meeting")
|
||||
|
||||
# v3: remote node host management.
|
||||
node_p = subs.add_parser(
|
||||
"node",
|
||||
help="Manage remote meet node hosts (run/list/approve/remove/status/ping)",
|
||||
)
|
||||
try:
|
||||
from plugins.google_meet.node.cli import register_cli as _register_node_cli
|
||||
_register_node_cli(node_p)
|
||||
except Exception as e: # pragma: no cover — defensive
|
||||
# If the node module fails to import for any reason (optional dep
|
||||
# missing at import time etc.), leave the subparser present but
|
||||
# flag it. The argparse dispatch will surface a clear error.
|
||||
def _node_unavailable(args):
|
||||
print(f"hermes meet node: module unavailable ({e})")
|
||||
return 1
|
||||
node_p.set_defaults(func=_node_unavailable)
|
||||
|
||||
subparser.set_defaults(func=meet_command)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dispatch
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def meet_command(args: argparse.Namespace) -> int:
|
||||
sub = getattr(args, "meet_command", None)
|
||||
if not sub:
|
||||
print("usage: hermes meet {setup,auth,join,status,transcript,say,stop,node}")
|
||||
return 2
|
||||
if sub == "setup":
|
||||
return _cmd_setup()
|
||||
if sub == "install":
|
||||
return _cmd_install(
|
||||
realtime=bool(getattr(args, "realtime", False)),
|
||||
assume_yes=bool(getattr(args, "yes", False)),
|
||||
)
|
||||
if sub == "auth":
|
||||
return _cmd_auth()
|
||||
if sub == "join":
|
||||
return _cmd_join(
|
||||
url=args.url,
|
||||
guest_name=args.guest_name,
|
||||
duration=args.duration,
|
||||
headed=args.headed,
|
||||
mode=getattr(args, "mode", "transcribe"),
|
||||
node=getattr(args, "node", None),
|
||||
)
|
||||
if sub == "status":
|
||||
return _cmd_status()
|
||||
if sub == "transcript":
|
||||
return _cmd_transcript(last=args.last)
|
||||
if sub == "say":
|
||||
return _cmd_say(text=args.text, node=getattr(args, "node", None))
|
||||
if sub == "stop":
|
||||
return _cmd_stop()
|
||||
if sub == "node":
|
||||
# Dispatch was set by the node cli's register_cli; fall through to
|
||||
# whatever its subparsers wired.
|
||||
fn = getattr(args, "func", None)
|
||||
if fn is None or fn is meet_command:
|
||||
print("usage: hermes meet node {run,list,approve,remove,status,ping}")
|
||||
return 2
|
||||
return fn(args)
|
||||
print(f"unknown subcommand: {sub}")
|
||||
return 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subcommand handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cmd_setup() -> int:
|
||||
import platform as _p
|
||||
|
||||
print("google_meet preflight")
|
||||
print("---------------------")
|
||||
|
||||
system = _p.system()
|
||||
system_ok = system in ("Linux", "Darwin")
|
||||
print(f" platform : {system} [{'ok' if system_ok else 'unsupported'}]")
|
||||
|
||||
try:
|
||||
import playwright # noqa: F401
|
||||
pw_ok = True
|
||||
pw_msg = "installed"
|
||||
except ImportError:
|
||||
pw_ok = False
|
||||
pw_msg = "NOT installed — run: pip install playwright"
|
||||
print(f" playwright : {pw_msg}")
|
||||
|
||||
chromium_ok = False
|
||||
chromium_msg = "unknown"
|
||||
if pw_ok:
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
exe = p.chromium.executable_path
|
||||
if exe and Path(exe).exists():
|
||||
chromium_ok = True
|
||||
chromium_msg = f"ok ({exe})"
|
||||
else:
|
||||
chromium_msg = (
|
||||
"not installed — run: "
|
||||
"python -m playwright install chromium"
|
||||
)
|
||||
except Exception as e:
|
||||
chromium_msg = f"probe failed: {e}"
|
||||
except Exception as e:
|
||||
chromium_msg = f"probe failed: {e}"
|
||||
print(f" chromium : {chromium_msg}")
|
||||
|
||||
auth_path = _auth_state_path()
|
||||
auth_ok = auth_path.is_file()
|
||||
print(
|
||||
" google auth : "
|
||||
+ (f"ok ({auth_path})" if auth_ok else "not saved — run: hermes meet auth")
|
||||
)
|
||||
|
||||
print()
|
||||
all_ok = system_ok and pw_ok and chromium_ok
|
||||
if all_ok:
|
||||
print(
|
||||
"ready. Join a meeting: "
|
||||
"hermes meet join https://meet.google.com/abc-defg-hij"
|
||||
)
|
||||
else:
|
||||
print("not ready yet — fix the items above.")
|
||||
return 0 if all_ok else 1
|
||||
|
||||
|
||||
def _cmd_install(*, realtime: bool, assume_yes: bool) -> int:
|
||||
"""Install the plugin's prerequisites.
|
||||
|
||||
Always: pip install playwright + websockets, then
|
||||
``python -m playwright install chromium``.
|
||||
|
||||
With ``--realtime``: also install the platform audio bridge deps.
|
||||
Linux : ``sudo apt-get install -y pulseaudio-utils``
|
||||
macOS : ``brew install blackhole-2ch ffmpeg`` (+ remind the user
|
||||
to select BlackHole as the default input device manually)
|
||||
|
||||
Prompts before every package-manager invocation unless ``--yes``.
|
||||
Refuses to run on Windows.
|
||||
"""
|
||||
import platform as _p
|
||||
import shutil as _shutil
|
||||
import subprocess as _sp
|
||||
|
||||
system = _p.system()
|
||||
if system not in ("Linux", "Darwin"):
|
||||
print(f"google_meet install: {system} is not supported (linux/macos only)")
|
||||
return 1
|
||||
|
||||
def _confirm(prompt: str) -> bool:
|
||||
if assume_yes:
|
||||
return True
|
||||
try:
|
||||
ans = input(f"{prompt} [y/N] ").strip().lower()
|
||||
except EOFError:
|
||||
return False
|
||||
return ans in ("y", "yes")
|
||||
|
||||
print("google_meet install")
|
||||
print("-------------------")
|
||||
|
||||
# 1) pip deps — always safe, venv-scoped.
|
||||
pip_pkgs = ["playwright", "websockets"]
|
||||
print(f"\n[1/3] pip install: {' '.join(pip_pkgs)}")
|
||||
try:
|
||||
res = _sp.run(
|
||||
[sys.executable, "-m", "pip", "install", "--upgrade", *pip_pkgs],
|
||||
check=False,
|
||||
)
|
||||
if res.returncode != 0:
|
||||
print(" pip install failed")
|
||||
return 1
|
||||
except Exception as e:
|
||||
print(f" pip install failed: {e}")
|
||||
return 1
|
||||
|
||||
# 2) Playwright browsers — pulls chromium (~300MB first run).
|
||||
print("\n[2/3] python -m playwright install chromium")
|
||||
try:
|
||||
res = _sp.run(
|
||||
[sys.executable, "-m", "playwright", "install", "chromium"],
|
||||
check=False,
|
||||
)
|
||||
if res.returncode != 0:
|
||||
print(" playwright install failed (may already be installed)")
|
||||
except Exception as e:
|
||||
print(f" playwright install failed: {e}")
|
||||
return 1
|
||||
|
||||
# 3) Platform audio deps for realtime mode.
|
||||
if realtime:
|
||||
print("\n[3/3] realtime audio deps")
|
||||
if system == "Linux":
|
||||
if _shutil.which("paplay") and _shutil.which("pactl"):
|
||||
print(" pulseaudio-utils already installed.")
|
||||
else:
|
||||
if not _confirm(
|
||||
" install pulseaudio-utils? this runs `sudo apt-get install -y pulseaudio-utils`"
|
||||
):
|
||||
print(" skipped (you can run it manually later)")
|
||||
else:
|
||||
cmd = ["sudo", "apt-get", "install", "-y", "pulseaudio-utils"]
|
||||
print(f" $ {' '.join(cmd)}")
|
||||
res = _sp.run(cmd, check=False)
|
||||
if res.returncode != 0:
|
||||
print(" apt install failed — install pulseaudio-utils manually")
|
||||
elif system == "Darwin":
|
||||
have_bh = False
|
||||
try:
|
||||
out = _sp.check_output(["system_profiler", "SPAudioDataType"], text=True)
|
||||
have_bh = "BlackHole" in out
|
||||
except Exception:
|
||||
pass
|
||||
have_ffmpeg = bool(_shutil.which("ffmpeg"))
|
||||
needs = []
|
||||
if not have_bh:
|
||||
needs.append("blackhole-2ch")
|
||||
if not have_ffmpeg:
|
||||
needs.append("ffmpeg")
|
||||
if not needs:
|
||||
print(" BlackHole and ffmpeg already installed.")
|
||||
elif not _shutil.which("brew"):
|
||||
print(
|
||||
" missing: " + ", ".join(needs) + "\n"
|
||||
" install Homebrew first (https://brew.sh) or install the packages manually."
|
||||
)
|
||||
else:
|
||||
if not _confirm(f" install via brew: {' '.join(needs)}?"):
|
||||
print(" skipped (you can run it manually later)")
|
||||
else:
|
||||
cmd = ["brew", "install", *needs]
|
||||
print(f" $ {' '.join(cmd)}")
|
||||
res = _sp.run(cmd, check=False)
|
||||
if res.returncode != 0:
|
||||
print(" brew install failed — install them manually")
|
||||
print(
|
||||
"\n NOTE: macOS does not auto-route audio. Open\n"
|
||||
" System Settings → Sound → Input\n"
|
||||
" and select 'BlackHole 2ch' before starting a realtime meeting.\n"
|
||||
" hermes will not switch your default input for you."
|
||||
)
|
||||
else:
|
||||
print("\n[3/3] skipped (pass --realtime to install audio tooling too)")
|
||||
|
||||
print("\ndone. verify with: hermes meet setup")
|
||||
return 0
|
||||
|
||||
|
||||
def _cmd_auth() -> int:
|
||||
"""Open a headed Chromium, let the user sign in, save storage_state."""
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
except ImportError:
|
||||
print(
|
||||
"playwright is not installed. run:\n"
|
||||
" pip install playwright && python -m playwright install chromium"
|
||||
)
|
||||
return 1
|
||||
|
||||
path = _auth_state_path()
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print(f"opening Chromium — sign in to Google, then return here and press Enter.")
|
||||
print(f"saving storage state to: {path}")
|
||||
try:
|
||||
with sync_playwright() as pw:
|
||||
browser = pw.chromium.launch(headless=False)
|
||||
context = browser.new_context()
|
||||
page = context.new_page()
|
||||
page.goto("https://accounts.google.com/", wait_until="domcontentloaded")
|
||||
try:
|
||||
input("press Enter after you've signed in ... ")
|
||||
except EOFError:
|
||||
pass
|
||||
context.storage_state(path=str(path))
|
||||
browser.close()
|
||||
except Exception as e:
|
||||
print(f"auth failed: {e}")
|
||||
return 1
|
||||
print("saved. you can now run: hermes meet join <url>")
|
||||
return 0
|
||||
|
||||
|
||||
def _cmd_join(
|
||||
url: str,
|
||||
*,
|
||||
guest_name: str,
|
||||
duration: Optional[str],
|
||||
headed: bool,
|
||||
mode: str = "transcribe",
|
||||
node: Optional[str] = None,
|
||||
) -> int:
|
||||
if not _is_safe_meet_url(url):
|
||||
print(f"refusing: not a meet.google.com URL: {url}")
|
||||
return 2
|
||||
if node:
|
||||
# Remote: go through NodeClient.
|
||||
try:
|
||||
from plugins.google_meet.node.registry import NodeRegistry
|
||||
from plugins.google_meet.node.client import NodeClient
|
||||
except ImportError as e:
|
||||
print(f"node module unavailable: {e}")
|
||||
return 1
|
||||
reg = NodeRegistry()
|
||||
entry = reg.resolve(node if node != "auto" else None)
|
||||
if entry is None:
|
||||
print(f"no registered node matches {node!r}")
|
||||
return 1
|
||||
client = NodeClient(url=entry["url"], token=entry["token"])
|
||||
try:
|
||||
res = client.start_bot(
|
||||
url=url, guest_name=guest_name, duration=duration,
|
||||
headed=headed, mode=mode,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"remote start_bot failed: {e}")
|
||||
return 1
|
||||
print(json.dumps({"node": entry.get("name"), **res}, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
auth = _auth_state_path()
|
||||
res = pm.start(
|
||||
url=url,
|
||||
headed=headed,
|
||||
guest_name=guest_name,
|
||||
duration=duration,
|
||||
auth_state=str(auth) if auth.is_file() else None,
|
||||
mode=mode,
|
||||
)
|
||||
print(json.dumps(res, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
|
||||
def _cmd_say(text: str, node: Optional[str] = None) -> int:
|
||||
if not (text or "").strip():
|
||||
print("refusing: empty text")
|
||||
return 2
|
||||
if node:
|
||||
try:
|
||||
from plugins.google_meet.node.registry import NodeRegistry
|
||||
from plugins.google_meet.node.client import NodeClient
|
||||
except ImportError as e:
|
||||
print(f"node module unavailable: {e}")
|
||||
return 1
|
||||
reg = NodeRegistry()
|
||||
entry = reg.resolve(node if node != "auto" else None)
|
||||
if entry is None:
|
||||
print(f"no registered node matches {node!r}")
|
||||
return 1
|
||||
client = NodeClient(url=entry["url"], token=entry["token"])
|
||||
try:
|
||||
res = client.say(text)
|
||||
except Exception as e:
|
||||
print(f"remote say failed: {e}")
|
||||
return 1
|
||||
print(json.dumps({"node": entry.get("name"), **res}, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
res = pm.enqueue_say(text)
|
||||
print(json.dumps(res, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
|
||||
def _cmd_status() -> int:
|
||||
res = pm.status()
|
||||
print(json.dumps(res, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
|
||||
def _cmd_transcript(last: Optional[int]) -> int:
|
||||
res = pm.transcript(last=last)
|
||||
if not res.get("ok"):
|
||||
print(json.dumps(res, indent=2))
|
||||
return 1
|
||||
for ln in res.get("lines", []):
|
||||
print(ln)
|
||||
return 0
|
||||
|
||||
|
||||
def _cmd_stop() -> int:
|
||||
res = pm.stop(reason="hermes meet stop")
|
||||
print(json.dumps(res, indent=2))
|
||||
return 0 if res.get("ok") else 1
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
parser = argparse.ArgumentParser(prog="hermes meet")
|
||||
register_cli(parser)
|
||||
ns = parser.parse_args()
|
||||
sys.exit(meet_command(ns))
|
||||
@@ -0,0 +1,852 @@
|
||||
"""Headless Google Meet bot — Playwright + live-caption scraping.
|
||||
|
||||
Runs as a standalone subprocess spawned by ``process_manager.py``. Reads config
|
||||
from env vars, writes status + transcript to files under
|
||||
``$HERMES_HOME/workspace/meetings/<meeting-id>/``. The main hermes process
|
||||
reads those files via the ``meet_*`` tools — no IPC beyond filesystem.
|
||||
|
||||
The scraping strategy mirrors OpenUtter (sumansid/openutter): we don't parse
|
||||
WebRTC audio, we enable Google Meet's built-in live captions and observe the
|
||||
captions container in the DOM via a MutationObserver. This is lossy and
|
||||
English-biased but it is:
|
||||
|
||||
* deterministic (no API keys, no STT billing),
|
||||
* works behind Meet's normal login / admission,
|
||||
* survives Meet UI rewrites fairly well because the caption container has a
|
||||
stable ARIA role.
|
||||
|
||||
Run standalone for debugging::
|
||||
|
||||
HERMES_MEET_URL=https://meet.google.com/abc-defg-hij \\
|
||||
HERMES_MEET_OUT_DIR=/tmp/meet-debug \\
|
||||
HERMES_MEET_HEADED=1 \\
|
||||
python -m plugins.google_meet.meet_bot
|
||||
|
||||
No meet.google.com URL → exits non-zero. Any URL that doesn't start with
|
||||
``https://meet.google.com/`` is rejected (explicit-by-design).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Match ``https://meet.google.com/abc-defg-hij`` or ``.../lookup/...`` — the
|
||||
# short three-segment code or a lookup URL. Anything else is rejected.
|
||||
MEET_URL_RE = re.compile(
|
||||
r"^https://meet\.google\.com/("
|
||||
r"[a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,}"
|
||||
r"|lookup/[^/?#]+"
|
||||
r"|new"
|
||||
r")(?:[/?#].*)?$"
|
||||
)
|
||||
|
||||
|
||||
# Filenames the bot reads/writes in ``HERMES_MEET_OUT_DIR``.
|
||||
SAY_QUEUE_FILENAME = "say_queue.jsonl"
|
||||
SAY_PCM_FILENAME = "speaker.pcm"
|
||||
|
||||
|
||||
def _is_safe_meet_url(url: str) -> bool:
|
||||
"""Return True if *url* is a Google Meet URL we're willing to navigate to."""
|
||||
if not isinstance(url, str):
|
||||
return False
|
||||
return bool(MEET_URL_RE.match(url.strip()))
|
||||
|
||||
|
||||
def _meeting_id_from_url(url: str) -> str:
|
||||
"""Extract the 3-segment meeting code from a Meet URL.
|
||||
|
||||
For ``https://meet.google.com/abc-defg-hij`` → ``abc-defg-hij``.
|
||||
For ``.../lookup/<id>`` or ``/new`` we fall back to a timestamped id — the
|
||||
bot won't know the real code until after redirect, and callers pass this
|
||||
through to filename anyway.
|
||||
"""
|
||||
m = re.search(
|
||||
r"meet\.google\.com/([a-z0-9]{3,}-[a-z0-9]{3,}-[a-z0-9]{3,})",
|
||||
url or "",
|
||||
)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return f"meet-{int(time.time())}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Status + transcript file writers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _BotState:
|
||||
"""Single-process mutable state, flushed to ``status.json`` on each change."""
|
||||
|
||||
def __init__(self, out_dir: Path, meeting_id: str, url: str):
|
||||
self.out_dir = out_dir
|
||||
self.meeting_id = meeting_id
|
||||
self.url = url
|
||||
self.in_call = False
|
||||
self.captioning = False
|
||||
self.captions_enabled_attempted = False
|
||||
self.lobby_waiting = False
|
||||
self.join_attempted_at: Optional[float] = None
|
||||
self.joined_at: Optional[float] = None
|
||||
self.last_caption_at: Optional[float] = None
|
||||
self.transcript_lines = 0
|
||||
self.error: Optional[str] = None
|
||||
self.exited = False
|
||||
# v2 realtime fields.
|
||||
self.realtime = False
|
||||
self.realtime_ready = False
|
||||
self.realtime_device: Optional[str] = None
|
||||
self.audio_bytes_out: int = 0
|
||||
self.last_audio_out_at: Optional[float] = None
|
||||
self.last_barge_in_at: Optional[float] = None
|
||||
self.leave_reason: Optional[str] = None
|
||||
# Scraped captions, in order, deduped. Each entry is a dict of
|
||||
# {"ts": <epoch>, "speaker": str, "text": str}.
|
||||
self._seen: set = set()
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.transcript_path = out_dir / "transcript.txt"
|
||||
self.status_path = out_dir / "status.json"
|
||||
self._flush()
|
||||
|
||||
# -------- transcript ------------------------------------------------
|
||||
|
||||
def record_caption(self, speaker: str, text: str) -> None:
|
||||
"""Append a caption line if we haven't seen this exact (speaker, text)."""
|
||||
speaker = (speaker or "").strip() or "Unknown"
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return
|
||||
key = f"{speaker}|{text}"
|
||||
if key in self._seen:
|
||||
return
|
||||
self._seen.add(key)
|
||||
self.transcript_lines += 1
|
||||
self.last_caption_at = time.time()
|
||||
ts = time.strftime("%H:%M:%S", time.localtime(self.last_caption_at))
|
||||
line = f"[{ts}] {speaker}: {text}\n"
|
||||
# Atomic-ish append — good enough for a single-writer.
|
||||
with self.transcript_path.open("a", encoding="utf-8") as f:
|
||||
f.write(line)
|
||||
self._flush()
|
||||
|
||||
# -------- status file ----------------------------------------------
|
||||
|
||||
def _flush(self) -> None:
|
||||
data = {
|
||||
"meetingId": self.meeting_id,
|
||||
"url": self.url,
|
||||
"inCall": self.in_call,
|
||||
"captioning": self.captioning,
|
||||
"captionsEnabledAttempted": self.captions_enabled_attempted,
|
||||
"lobbyWaiting": self.lobby_waiting,
|
||||
"joinAttemptedAt": self.join_attempted_at,
|
||||
"joinedAt": self.joined_at,
|
||||
"lastCaptionAt": self.last_caption_at,
|
||||
"transcriptLines": self.transcript_lines,
|
||||
"transcriptPath": str(self.transcript_path),
|
||||
"error": self.error,
|
||||
"exited": self.exited,
|
||||
"pid": os.getpid(),
|
||||
# v2 realtime telemetry.
|
||||
"realtime": self.realtime,
|
||||
"realtimeReady": self.realtime_ready,
|
||||
"realtimeDevice": self.realtime_device,
|
||||
"audioBytesOut": self.audio_bytes_out,
|
||||
"lastAudioOutAt": self.last_audio_out_at,
|
||||
"lastBargeInAt": self.last_barge_in_at,
|
||||
"leaveReason": self.leave_reason,
|
||||
}
|
||||
tmp = self.status_path.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp.replace(self.status_path)
|
||||
|
||||
def set(self, **kwargs) -> None:
|
||||
for k, v in kwargs.items():
|
||||
setattr(self, k, v)
|
||||
self._flush()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Playwright bot entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# JavaScript injected into the Meet tab to observe captions. Captures
|
||||
# {speaker, text} tuples via a MutationObserver on the caption container,
|
||||
# and exposes ``window.__hermesMeetDrain()`` to pull new entries. This
|
||||
# mirrors the OpenUtter caption scraping approach.
|
||||
_CAPTION_OBSERVER_JS = r"""
|
||||
(() => {
|
||||
if (window.__hermesMeetInstalled) return;
|
||||
window.__hermesMeetInstalled = true;
|
||||
window.__hermesMeetQueue = [];
|
||||
|
||||
const captionSelector = '[role="region"][aria-label*="aption" i], ' +
|
||||
'div[jsname="YSxPC"], ' + // legacy
|
||||
'div[jsname="tgaKEf"]'; // current (Apr 2026)
|
||||
|
||||
function pushEntry(speaker, text) {
|
||||
if (!text || !text.trim()) return;
|
||||
window.__hermesMeetQueue.push({
|
||||
ts: Date.now(),
|
||||
speaker: (speaker || '').trim(),
|
||||
text: text.trim(),
|
||||
});
|
||||
}
|
||||
|
||||
function scan(root) {
|
||||
// Meet captions render as a list of rows; each row contains a speaker
|
||||
// label and a text block. Selectors vary across Meet rewrites; we try
|
||||
// a few shapes and fall back to raw text.
|
||||
const rows = root.querySelectorAll('div[jsname="dsyhDe"], div.CNusmb, div.TBMuR');
|
||||
if (rows.length) {
|
||||
rows.forEach((row) => {
|
||||
const spkEl = row.querySelector('div.KcIKyf, div.zs7s8d, span[jsname="YSxPC"]');
|
||||
const txtEl = row.querySelector('div.bh44bd, span[jsname="tgaKEf"], div.iTTPOb');
|
||||
const speaker = spkEl ? spkEl.innerText : '';
|
||||
const text = txtEl ? txtEl.innerText : row.innerText;
|
||||
pushEntry(speaker, text);
|
||||
});
|
||||
return;
|
||||
}
|
||||
// Fallback: treat the whole region's innerText as one anonymous line.
|
||||
const text = (root.innerText || '').split('\n').filter(Boolean).pop();
|
||||
pushEntry('', text);
|
||||
}
|
||||
|
||||
function attach() {
|
||||
const el = document.querySelector(captionSelector);
|
||||
if (!el) return false;
|
||||
const obs = new MutationObserver(() => scan(el));
|
||||
obs.observe(el, { childList: true, subtree: true, characterData: true });
|
||||
scan(el);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Try now and retry on interval — the caption region only appears after
|
||||
// captions are enabled and someone speaks.
|
||||
if (!attach()) {
|
||||
const iv = setInterval(() => { if (attach()) clearInterval(iv); }, 1500);
|
||||
}
|
||||
|
||||
window.__hermesMeetDrain = () => {
|
||||
const out = window.__hermesMeetQueue.slice();
|
||||
window.__hermesMeetQueue = [];
|
||||
return out;
|
||||
};
|
||||
})();
|
||||
"""
|
||||
|
||||
|
||||
def _enable_captions_js() -> str:
|
||||
"""Return a small JS snippet that tries to click the 'Turn on captions' button.
|
||||
|
||||
Best-effort — Meet's caption toggle is keyboard-accessible via ``c``. We
|
||||
dispatch that keystroke as a cheap fallback. Real click targeting is too
|
||||
brittle to rely on.
|
||||
"""
|
||||
return r"""
|
||||
(() => {
|
||||
const ev = new KeyboardEvent('keydown', {
|
||||
key: 'c', code: 'KeyC', keyCode: 67, which: 67, bubbles: true,
|
||||
});
|
||||
document.body.dispatchEvent(ev);
|
||||
return true;
|
||||
})();
|
||||
"""
|
||||
|
||||
|
||||
def _start_realtime_speaker(
|
||||
*,
|
||||
rt: dict,
|
||||
out_dir: Path,
|
||||
bridge_info: dict,
|
||||
api_key: str,
|
||||
model: str,
|
||||
voice: str,
|
||||
instructions: str,
|
||||
stop_flag: dict,
|
||||
state: "_BotState",
|
||||
) -> None:
|
||||
"""Wire up the OpenAI Realtime session + speaker thread + PCM pump.
|
||||
|
||||
The speaker thread reads text lines from ``say_queue.jsonl``, sends each
|
||||
to OpenAI Realtime, and writes PCM audio into ``speaker.pcm``. A
|
||||
separate *pump* thread forwards that PCM into the OS audio sink so
|
||||
Chrome's fake mic picks it up. On Linux we pipe to ``paplay`` against
|
||||
the null-sink; on macOS the caller is expected to have the BlackHole
|
||||
device selected as default input.
|
||||
"""
|
||||
try:
|
||||
from plugins.google_meet.realtime.openai_client import (
|
||||
RealtimeSession,
|
||||
RealtimeSpeaker,
|
||||
)
|
||||
except Exception as e:
|
||||
state.set(error=f"realtime import failed: {e}")
|
||||
return
|
||||
|
||||
pcm_path = out_dir / SAY_PCM_FILENAME
|
||||
queue_path = out_dir / SAY_QUEUE_FILENAME
|
||||
processed_path = out_dir / "say_processed.jsonl"
|
||||
# Reset the sink file so we start clean each session.
|
||||
pcm_path.write_bytes(b"")
|
||||
# Make sure the queue exists so the speaker poller doesn't error on
|
||||
# first iteration.
|
||||
queue_path.touch()
|
||||
|
||||
try:
|
||||
session = RealtimeSession(
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
voice=voice,
|
||||
instructions=instructions,
|
||||
audio_sink_path=pcm_path,
|
||||
sample_rate=24000,
|
||||
)
|
||||
session.connect()
|
||||
except Exception as e:
|
||||
state.set(error=f"realtime connect failed: {e}")
|
||||
return
|
||||
|
||||
rt["session"] = session
|
||||
|
||||
def _stop_fn():
|
||||
return stop_flag.get("stop", False)
|
||||
|
||||
rt["speaker_stop"] = lambda: stop_flag.__setitem__("stop", stop_flag.get("stop", False))
|
||||
|
||||
speaker = RealtimeSpeaker(
|
||||
session=session,
|
||||
queue_path=queue_path,
|
||||
processed_path=processed_path,
|
||||
)
|
||||
|
||||
def _speaker_loop():
|
||||
try:
|
||||
speaker.run_until_stopped(_stop_fn)
|
||||
except Exception as e:
|
||||
state.set(error=f"realtime speaker crashed: {e}")
|
||||
|
||||
t_speaker = threading.Thread(target=_speaker_loop, name="meet-speaker", daemon=True)
|
||||
t_speaker.start()
|
||||
rt["speaker_thread"] = t_speaker
|
||||
|
||||
# PCM pump: feeds speaker.pcm (24kHz s16le mono) into the OS audio
|
||||
# device that Chrome's fake mic reads from. Different tools per
|
||||
# platform, but the contract is the same — block-read the growing
|
||||
# PCM file and stream it to the device in near-real-time.
|
||||
platform_tag = (bridge_info or {}).get("platform")
|
||||
if platform_tag == "linux":
|
||||
import subprocess as _sp
|
||||
|
||||
sink = (bridge_info or {}).get("write_target") or "hermes_meet_sink"
|
||||
try:
|
||||
proc = _sp.Popen(
|
||||
[
|
||||
"paplay",
|
||||
"--raw",
|
||||
"--rate=24000",
|
||||
"--format=s16le",
|
||||
"--channels=1",
|
||||
f"--device={sink}",
|
||||
str(pcm_path),
|
||||
],
|
||||
stdin=_sp.DEVNULL,
|
||||
stdout=_sp.DEVNULL,
|
||||
stderr=_sp.DEVNULL,
|
||||
)
|
||||
rt["pcm_pump"] = proc
|
||||
except FileNotFoundError:
|
||||
state.set(error="paplay not found — install pulseaudio-utils for realtime on Linux")
|
||||
elif platform_tag == "darwin":
|
||||
# macOS: use ffmpeg to tail-read speaker.pcm and write it to the
|
||||
# BlackHole output device. The user must have BlackHole selected
|
||||
# as the default input in System Settings → Sound for Chrome to
|
||||
# pick it up. We prefer ffmpeg because it's scriptable and can
|
||||
# target AVFoundation devices by name; fall back to afplay-ing
|
||||
# the file in a tight loop if ffmpeg is absent.
|
||||
import shutil as _shutil
|
||||
import subprocess as _sp
|
||||
|
||||
device_name = (bridge_info or {}).get("write_target") or "BlackHole 2ch"
|
||||
if _shutil.which("ffmpeg"):
|
||||
try:
|
||||
# -re: read input at native frame rate.
|
||||
# -f avfoundation -i: speaker path as raw PCM.
|
||||
# -f s16le -ar 24000 -ac 1 -i <pcm>: interpret the file.
|
||||
# -f audiotoolbox -audio_device_index: write to BlackHole.
|
||||
# Simpler: output as raw via coreaudio using "-f audiotoolbox".
|
||||
# ffmpeg's audiotoolbox output picks the current default
|
||||
# output device, which isn't what we want. Instead we use
|
||||
# -f avfoundation with the named device as OUTPUT via
|
||||
# -vn and the device name.
|
||||
proc = _sp.Popen(
|
||||
[
|
||||
"ffmpeg",
|
||||
"-nostdin", "-hide_banner", "-loglevel", "error",
|
||||
"-re",
|
||||
"-f", "s16le", "-ar", "24000", "-ac", "1",
|
||||
"-i", str(pcm_path),
|
||||
"-f", "audiotoolbox",
|
||||
"-audio_device_index", _mac_audio_device_index(device_name),
|
||||
"-",
|
||||
],
|
||||
stdin=_sp.DEVNULL,
|
||||
stdout=_sp.DEVNULL,
|
||||
stderr=_sp.DEVNULL,
|
||||
)
|
||||
rt["pcm_pump"] = proc
|
||||
except FileNotFoundError:
|
||||
state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
|
||||
except Exception as e:
|
||||
state.set(error=f"macOS pcm pump failed to start: {e}")
|
||||
else:
|
||||
state.set(error="ffmpeg not found — install via `brew install ffmpeg` for realtime on macOS")
|
||||
|
||||
|
||||
def _mac_audio_device_index(device_name: str) -> str:
|
||||
"""Return the ffmpeg ``-audio_device_index`` for *device_name*, as a string.
|
||||
|
||||
Probes ``ffmpeg -f avfoundation -list_devices true -i ''`` (which prints
|
||||
the device table on stderr) and matches *device_name* case-insensitively.
|
||||
Defaults to ``"0"`` if the device can't be found — caller will get a
|
||||
misrouted stream but not a crash, and the error will be obvious.
|
||||
"""
|
||||
import subprocess as _sp
|
||||
|
||||
try:
|
||||
out = _sp.run(
|
||||
["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
except Exception:
|
||||
return "0"
|
||||
# ffmpeg prints the table on stderr. Lines look like:
|
||||
# [AVFoundation indev @ 0x...] [0] BlackHole 2ch
|
||||
import re as _re
|
||||
|
||||
needle = device_name.strip().lower()
|
||||
for line in (out.stderr or "").splitlines():
|
||||
m = _re.search(r"\[(\d+)\]\s+(.+)$", line)
|
||||
if not m:
|
||||
continue
|
||||
if m.group(2).strip().lower() == needle:
|
||||
return m.group(1)
|
||||
return "0"
|
||||
|
||||
|
||||
def run_bot() -> int: # noqa: C901 — orchestration, explicit branches
|
||||
url = os.environ.get("HERMES_MEET_URL", "").strip()
|
||||
out_dir_env = os.environ.get("HERMES_MEET_OUT_DIR", "").strip()
|
||||
headed = os.environ.get("HERMES_MEET_HEADED", "").lower() in ("1", "true", "yes")
|
||||
auth_state = os.environ.get("HERMES_MEET_AUTH_STATE", "").strip()
|
||||
guest_name = os.environ.get("HERMES_MEET_GUEST_NAME", "Hermes Agent")
|
||||
duration_s = _parse_duration(os.environ.get("HERMES_MEET_DURATION", ""))
|
||||
# v2: optional realtime mode. Enabled when HERMES_MEET_MODE=realtime.
|
||||
mode = os.environ.get("HERMES_MEET_MODE", "transcribe").strip().lower()
|
||||
realtime_model = os.environ.get("HERMES_MEET_REALTIME_MODEL", "gpt-realtime")
|
||||
realtime_voice = os.environ.get("HERMES_MEET_REALTIME_VOICE", "alloy")
|
||||
realtime_instructions = os.environ.get("HERMES_MEET_REALTIME_INSTRUCTIONS", "")
|
||||
realtime_api_key = os.environ.get("HERMES_MEET_REALTIME_KEY") or os.environ.get("OPENAI_API_KEY", "")
|
||||
|
||||
if not url or not _is_safe_meet_url(url):
|
||||
sys.stderr.write(
|
||||
"google_meet bot: refusing to launch — HERMES_MEET_URL must be a "
|
||||
"meet.google.com URL. got: %r\n" % url
|
||||
)
|
||||
return 2
|
||||
if not out_dir_env:
|
||||
sys.stderr.write("google_meet bot: HERMES_MEET_OUT_DIR is required\n")
|
||||
return 2
|
||||
|
||||
out_dir = Path(out_dir_env)
|
||||
meeting_id = _meeting_id_from_url(url)
|
||||
state = _BotState(out_dir=out_dir, meeting_id=meeting_id, url=url)
|
||||
|
||||
# SIGTERM → exit cleanly so the parent ``meet_leave`` gets a finalized
|
||||
# transcript. We set a flag instead of raising so the Playwright context
|
||||
# teardown runs in the finally block below.
|
||||
stop_flag = {"stop": False}
|
||||
|
||||
def _on_signal(_sig, _frame):
|
||||
stop_flag["stop"] = True
|
||||
|
||||
signal.signal(signal.SIGTERM, _on_signal)
|
||||
signal.signal(signal.SIGINT, _on_signal)
|
||||
|
||||
# v2 realtime: provision virtual audio device + start speaker thread.
|
||||
# We track these in a dict so the finally block can tear them down
|
||||
# regardless of how we exit. If anything in the realtime setup fails we
|
||||
# fall back to transcribe mode with a status flag.
|
||||
rt = {
|
||||
"enabled": mode == "realtime",
|
||||
"bridge": None, # AudioBridge | None
|
||||
"bridge_info": None, # dict | None
|
||||
"session": None, # RealtimeSession | None
|
||||
"speaker_thread": None, # threading.Thread | None
|
||||
"speaker_stop": None, # callable | None
|
||||
}
|
||||
if rt["enabled"]:
|
||||
if not realtime_api_key:
|
||||
state.set(error="realtime mode requested but no API key in HERMES_MEET_REALTIME_KEY/OPENAI_API_KEY — falling back to transcribe")
|
||||
rt["enabled"] = False
|
||||
else:
|
||||
try:
|
||||
from plugins.google_meet.audio_bridge import AudioBridge
|
||||
bridge = AudioBridge()
|
||||
rt["bridge_info"] = bridge.setup()
|
||||
rt["bridge"] = bridge
|
||||
state.set(realtime=True, realtime_device=rt["bridge_info"].get("device_name"))
|
||||
except Exception as e:
|
||||
state.set(error=f"audio bridge setup failed: {e} — falling back to transcribe")
|
||||
rt["enabled"] = False
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
except ImportError as e:
|
||||
state.set(error=f"playwright not installed: {e}", exited=True)
|
||||
sys.stderr.write(
|
||||
"google_meet bot: playwright is not installed. Run "
|
||||
"`pip install playwright && python -m playwright install chromium`\n"
|
||||
)
|
||||
if rt["bridge"]:
|
||||
rt["bridge"].teardown()
|
||||
return 3
|
||||
|
||||
# Chrome env: if realtime is live on Linux, point PULSE_SOURCE at the
|
||||
# virtual source so Chrome's fake mic reads the audio we generate.
|
||||
chrome_env = os.environ.copy()
|
||||
chrome_args = [
|
||||
"--use-fake-ui-for-media-stream",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
]
|
||||
if not rt["enabled"]:
|
||||
# v1-style fake device (silence) — we don't care about mic content
|
||||
# when we're not speaking.
|
||||
chrome_args.insert(1, "--use-fake-device-for-media-stream")
|
||||
elif rt["bridge_info"] and rt["bridge_info"].get("platform") == "linux":
|
||||
chrome_env["PULSE_SOURCE"] = rt["bridge_info"].get("device_name", "")
|
||||
|
||||
try:
|
||||
with sync_playwright() as pw:
|
||||
# Playwright's launch() doesn't take env; we set PULSE_SOURCE
|
||||
# via the process env before launch so the child Chrome inherits it.
|
||||
for k, v in chrome_env.items():
|
||||
os.environ[k] = v
|
||||
browser = pw.chromium.launch(
|
||||
headless=not headed,
|
||||
args=chrome_args,
|
||||
)
|
||||
context_args = {
|
||||
"viewport": {"width": 1280, "height": 800},
|
||||
"user_agent": (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
),
|
||||
"permissions": ["microphone", "camera"],
|
||||
}
|
||||
if auth_state and Path(auth_state).is_file():
|
||||
context_args["storage_state"] = auth_state
|
||||
context = browser.new_context(**context_args)
|
||||
page = context.new_page()
|
||||
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
||||
except Exception as e:
|
||||
state.set(error=f"navigate failed: {e}", exited=True)
|
||||
return 4
|
||||
|
||||
# Guest-mode: Meet shows a name field before "Ask to join". When
|
||||
# we're authed, we instead see "Join now".
|
||||
_try_guest_name(page, guest_name)
|
||||
_click_join(page, state)
|
||||
|
||||
# Install caption observer and attempt to enable captions.
|
||||
try:
|
||||
page.evaluate(_enable_captions_js())
|
||||
state.set(captions_enabled_attempted=True)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
page.evaluate(_CAPTION_OBSERVER_JS)
|
||||
except Exception as e:
|
||||
state.set(error=f"caption observer install failed: {e}")
|
||||
|
||||
# Note: in_call=False until admission is confirmed (we detect
|
||||
# either the Leave button or the caption region, signalling we
|
||||
# made it past the lobby).
|
||||
state.set(captioning=True, join_attempted_at=time.time())
|
||||
|
||||
# v2 realtime: start the speaker thread reading from the
|
||||
# plugin-side say queue. The thread reads JSONL lines written by
|
||||
# meet_say, calls OpenAI Realtime, and streams the audio PCM to
|
||||
# the virtual sink that Chrome's fake-mic is pointed at.
|
||||
if rt["enabled"]:
|
||||
_start_realtime_speaker(
|
||||
rt=rt,
|
||||
out_dir=out_dir,
|
||||
bridge_info=rt["bridge_info"],
|
||||
api_key=realtime_api_key,
|
||||
model=realtime_model,
|
||||
voice=realtime_voice,
|
||||
instructions=realtime_instructions,
|
||||
stop_flag=stop_flag,
|
||||
state=state,
|
||||
)
|
||||
if rt["session"] is not None:
|
||||
state.set(realtime_ready=True)
|
||||
|
||||
# Admission + drain loop. Runs until SIGTERM, duration expiry,
|
||||
# or the page detects "You were removed / you left the
|
||||
# meeting". Responsible for:
|
||||
# * detecting admission (Leave button visible → in_call=True)
|
||||
# * timing out stuck-in-lobby (default 5 minutes)
|
||||
# * draining scraped captions into the transcript
|
||||
# * triggering realtime barge-in when a human speaks while
|
||||
# the bot is generating audio
|
||||
# * periodically flushing realtime counters into status.json
|
||||
deadline = (time.time() + duration_s) if duration_s else None
|
||||
lobby_deadline = time.time() + float(
|
||||
os.environ.get("HERMES_MEET_LOBBY_TIMEOUT", "300")
|
||||
)
|
||||
last_admission_check = 0.0
|
||||
while not stop_flag["stop"]:
|
||||
now = time.time()
|
||||
if deadline and now > deadline:
|
||||
state.set(leave_reason="duration_expired")
|
||||
break
|
||||
|
||||
# Admission detection every ~3s until admitted.
|
||||
if not state.in_call and (now - last_admission_check) > 3.0:
|
||||
last_admission_check = now
|
||||
admitted = _detect_admission(page)
|
||||
if admitted:
|
||||
state.set(
|
||||
in_call=True,
|
||||
lobby_waiting=False,
|
||||
joined_at=now,
|
||||
)
|
||||
elif now > lobby_deadline:
|
||||
state.set(
|
||||
error=(
|
||||
"lobby timeout — host never admitted the bot "
|
||||
f"within {int(lobby_deadline - state.join_attempted_at) if state.join_attempted_at else 0}s"
|
||||
),
|
||||
leave_reason="lobby_timeout",
|
||||
)
|
||||
break
|
||||
elif _detect_denied(page):
|
||||
state.set(
|
||||
error="host denied admission",
|
||||
leave_reason="denied",
|
||||
)
|
||||
break
|
||||
|
||||
try:
|
||||
queued = page.evaluate("window.__hermesMeetDrain && window.__hermesMeetDrain()")
|
||||
if isinstance(queued, list):
|
||||
for entry in queued:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
speaker = str(entry.get("speaker", ""))
|
||||
text = str(entry.get("text", ""))
|
||||
state.record_caption(speaker=speaker, text=text)
|
||||
# Barge-in: if the bot is currently generating
|
||||
# audio AND a real human just spoke, cancel the
|
||||
# in-flight response so we don't talk over them.
|
||||
if rt["enabled"] and rt["session"] is not None:
|
||||
if _looks_like_human_speaker(speaker, guest_name):
|
||||
try:
|
||||
cancelled = rt["session"].cancel_response()
|
||||
if cancelled:
|
||||
state.set(last_barge_in_at=now)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
# Meet reloaded or we got booted — try to detect and
|
||||
# exit gracefully rather than spinning.
|
||||
if page.is_closed():
|
||||
state.set(leave_reason="page_closed")
|
||||
break
|
||||
|
||||
# Fold the realtime session's byte/timestamp counters into
|
||||
# the status file so meet_status can surface them.
|
||||
if rt["session"] is not None:
|
||||
state.set(
|
||||
audio_bytes_out=getattr(rt["session"], "audio_bytes_out", 0),
|
||||
last_audio_out_at=getattr(rt["session"], "last_audio_out_at", None),
|
||||
)
|
||||
|
||||
time.sleep(1.0)
|
||||
|
||||
# Try to leave cleanly — click "Leave call" button if present.
|
||||
try:
|
||||
page.evaluate(
|
||||
"() => { const b = document.querySelector('button[aria-label*=\"eave call\"]');"
|
||||
" if (b) b.click(); }"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
context.close()
|
||||
browser.close()
|
||||
# v2: teardown realtime speaker + audio bridge.
|
||||
if rt["speaker_stop"]:
|
||||
try:
|
||||
rt["speaker_stop"]()
|
||||
except Exception:
|
||||
pass
|
||||
if rt["speaker_thread"] is not None:
|
||||
try:
|
||||
rt["speaker_thread"].join(timeout=5.0)
|
||||
except Exception:
|
||||
pass
|
||||
if rt["session"]:
|
||||
try:
|
||||
rt["session"].close()
|
||||
except Exception:
|
||||
pass
|
||||
if rt["bridge"]:
|
||||
try:
|
||||
rt["bridge"].teardown()
|
||||
except Exception:
|
||||
pass
|
||||
state.set(in_call=False, captioning=False, exited=True)
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
state.set(error=f"unhandled: {e}", exited=True)
|
||||
return 1
|
||||
|
||||
|
||||
def _try_guest_name(page, guest_name: str) -> None:
|
||||
"""If Meet is showing a guest-name input, type *guest_name* into it."""
|
||||
try:
|
||||
# Meet's guest name input has placeholder "Your name".
|
||||
locator = page.locator('input[aria-label*="name" i]').first
|
||||
if locator.count() and locator.is_visible():
|
||||
locator.fill(guest_name, timeout=2_000)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _detect_admission(page) -> bool:
|
||||
"""True if we're clearly past the lobby and in the call itself.
|
||||
|
||||
Uses a JS-side probe because Meet's DOM structure varies by client
|
||||
version. We check several high-signal indicators and declare admission
|
||||
on the first hit:
|
||||
|
||||
1. Leave-call button is present (``aria-label`` contains "eave call").
|
||||
2. Caption region has appeared (we installed the observer and it attached).
|
||||
3. The participant list container is visible.
|
||||
|
||||
Conservative by default — returns False on any error.
|
||||
"""
|
||||
probe = r"""
|
||||
(() => {
|
||||
const leave = document.querySelector('button[aria-label*="eave call" i]');
|
||||
if (leave) return true;
|
||||
if (window.__hermesMeetInstalled) {
|
||||
const caps = document.querySelector(
|
||||
'[role="region"][aria-label*="aption" i], ' +
|
||||
'div[jsname="YSxPC"], div[jsname="tgaKEf"]'
|
||||
);
|
||||
if (caps) return true;
|
||||
}
|
||||
const parts = document.querySelector('[aria-label*="articipants" i]');
|
||||
if (parts) return true;
|
||||
return false;
|
||||
})();
|
||||
"""
|
||||
try:
|
||||
return bool(page.evaluate(probe))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _detect_denied(page) -> bool:
|
||||
"""True when Meet is showing a 'you were denied' / 'no one admitted' page."""
|
||||
probe = r"""
|
||||
(() => {
|
||||
const text = document.body ? document.body.innerText || '' : '';
|
||||
// English only — matches what shows up when the host denies or
|
||||
// removes a guest.
|
||||
if (/You can't join this video call/i.test(text)) return true;
|
||||
if (/You were removed from the meeting/i.test(text)) return true;
|
||||
if (/No one responded to your request to join/i.test(text)) return true;
|
||||
return false;
|
||||
})();
|
||||
"""
|
||||
try:
|
||||
return bool(page.evaluate(probe))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _looks_like_human_speaker(speaker: str, bot_guest_name: str) -> bool:
|
||||
"""Whether a caption line's speaker is probably a human, not our bot echo.
|
||||
|
||||
Meet attributes captions to the speaker's display name. When Chrome is
|
||||
reading our fake mic, Meet still attributes captions to *our* bot name
|
||||
(because the bot is the one "speaking"). We don't want those to trigger
|
||||
barge-in. Anything else — real participant names — does.
|
||||
|
||||
Conservative: unknown / blank speakers (common when caption scraping
|
||||
falls back to raw text) do NOT trigger barge-in, because we can't tell
|
||||
whether it was a human or us.
|
||||
"""
|
||||
if not speaker or not speaker.strip():
|
||||
return False
|
||||
spk = speaker.strip().lower()
|
||||
if spk in ("unknown", "you", bot_guest_name.strip().lower()):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _click_join(page, state: _BotState) -> None:
|
||||
"""Click 'Join now' or 'Ask to join' if either button is visible.
|
||||
|
||||
Flags ``lobby_waiting`` when we hit the "waiting for host to admit you"
|
||||
state so the agent can surface that in status.
|
||||
"""
|
||||
for label in ("Join now", "Ask to join"):
|
||||
try:
|
||||
btn = page.get_by_role("button", name=label, exact=False).first
|
||||
if btn.count() and btn.is_visible():
|
||||
btn.click(timeout=3_000)
|
||||
if label == "Ask to join":
|
||||
state.set(lobby_waiting=True)
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
|
||||
def _parse_duration(raw: str) -> Optional[float]:
|
||||
"""Parse ``30m`` / ``2h`` / ``90`` (seconds) → float seconds, or None."""
|
||||
if not raw:
|
||||
return None
|
||||
raw = raw.strip().lower()
|
||||
try:
|
||||
if raw.endswith("h"):
|
||||
return float(raw[:-1]) * 3600
|
||||
if raw.endswith("m"):
|
||||
return float(raw[:-1]) * 60
|
||||
if raw.endswith("s"):
|
||||
return float(raw[:-1])
|
||||
return float(raw)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover — subprocess entry point
|
||||
sys.exit(run_bot())
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Remote 'node host' primitive for the google_meet plugin.
|
||||
|
||||
Lets the Meet bot (Playwright + Chrome) run on a different machine than
|
||||
the hermes-agent gateway. The gateway speaks a small JSON-over-WebSocket
|
||||
RPC protocol to the remote node; the node wraps the existing
|
||||
``plugins.google_meet.process_manager`` API.
|
||||
|
||||
Topology
|
||||
--------
|
||||
gateway (Linux) ── ws://mac.local:18789 ──▶ node server (Mac)
|
||||
└─ process_manager
|
||||
└─ meet_bot (Playwright)
|
||||
|
||||
Why: Google sign-in + Chrome profile live on the user's laptop. Running
|
||||
the bot there reuses that profile without shipping credentials to the
|
||||
server.
|
||||
|
||||
Public surface
|
||||
--------------
|
||||
NodeClient — gateway-side RPC client (short-lived sync WS per call)
|
||||
NodeServer — long-running server that hosts the bot
|
||||
NodeRegistry — local JSON registry of approved nodes (name → url+token)
|
||||
protocol — message envelope helpers (make_request, encode, decode, ...)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from plugins.google_meet.node import protocol
|
||||
from plugins.google_meet.node.client import NodeClient
|
||||
from plugins.google_meet.node.protocol import (
|
||||
VALID_REQUEST_TYPES,
|
||||
decode,
|
||||
encode,
|
||||
make_error,
|
||||
make_request,
|
||||
make_response,
|
||||
validate_request,
|
||||
)
|
||||
from plugins.google_meet.node.registry import NodeRegistry
|
||||
from plugins.google_meet.node.server import NodeServer
|
||||
|
||||
__all__ = [
|
||||
"NodeClient",
|
||||
"NodeServer",
|
||||
"NodeRegistry",
|
||||
"protocol",
|
||||
"make_request",
|
||||
"make_response",
|
||||
"make_error",
|
||||
"encode",
|
||||
"decode",
|
||||
"validate_request",
|
||||
"VALID_REQUEST_TYPES",
|
||||
]
|
||||
@@ -0,0 +1,125 @@
|
||||
"""`hermes meet node ...` subcommand tree.
|
||||
|
||||
Wired into the existing ``hermes meet`` parser by the plugin's top-level
|
||||
CLI. This module only defines the subparsers and their dispatch — it
|
||||
does not mutate the existing cli.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
from plugins.google_meet.node.client import NodeClient
|
||||
from plugins.google_meet.node.registry import NodeRegistry
|
||||
from plugins.google_meet.node.server import NodeServer
|
||||
|
||||
|
||||
def register_cli(subparser: argparse.ArgumentParser) -> None:
|
||||
"""Add ``run / list / approve / remove / status / ping`` subparsers.
|
||||
|
||||
*subparser* is the ``hermes meet node`` argparse object — typically
|
||||
the result of ``meet_parser.add_parser('node', ...)``.
|
||||
"""
|
||||
sp = subparser.add_subparsers(dest="node_cmd", required=True)
|
||||
|
||||
run = sp.add_parser("run", help="Start a node server on this machine.")
|
||||
run.add_argument("--host", default="0.0.0.0")
|
||||
run.add_argument("--port", type=int, default=18789)
|
||||
run.add_argument("--display-name", default="hermes-meet-node")
|
||||
run.set_defaults(func=node_command)
|
||||
|
||||
lst = sp.add_parser("list", help="List approved remote nodes.")
|
||||
lst.set_defaults(func=node_command)
|
||||
|
||||
app = sp.add_parser("approve", help="Register a remote node on the gateway.")
|
||||
app.add_argument("name")
|
||||
app.add_argument("url")
|
||||
app.add_argument("token")
|
||||
app.set_defaults(func=node_command)
|
||||
|
||||
rm = sp.add_parser("remove", help="Forget a registered node.")
|
||||
rm.add_argument("name")
|
||||
rm.set_defaults(func=node_command)
|
||||
|
||||
st = sp.add_parser("status", help="Ping a registered node.")
|
||||
st.add_argument("name")
|
||||
st.set_defaults(func=node_command)
|
||||
|
||||
pg = sp.add_parser("ping", help="Alias for status.")
|
||||
pg.add_argument("name")
|
||||
pg.set_defaults(func=node_command)
|
||||
|
||||
|
||||
def node_command(args: argparse.Namespace) -> int:
|
||||
"""Dispatch for ``hermes meet node ...``.
|
||||
|
||||
Returns a process exit code. Side-effects print to stdout/stderr.
|
||||
"""
|
||||
cmd = getattr(args, "node_cmd", None)
|
||||
|
||||
if cmd == "run":
|
||||
server = NodeServer(
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
display_name=args.display_name,
|
||||
)
|
||||
token = server.ensure_token()
|
||||
print(f"[meet-node] display_name={server.display_name}")
|
||||
print(f"[meet-node] listening on ws://{args.host}:{args.port}")
|
||||
print(f"[meet-node] token (copy to gateway): {token}")
|
||||
print(f"[meet-node] approve with:")
|
||||
print(f" hermes meet node approve <name> ws://<host>:{args.port} {token}")
|
||||
try:
|
||||
asyncio.run(server.serve())
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except RuntimeError as exc:
|
||||
print(f"[meet-node] error: {exc}", file=sys.stderr)
|
||||
return 2
|
||||
return 0
|
||||
|
||||
reg = NodeRegistry()
|
||||
|
||||
if cmd == "list":
|
||||
nodes = reg.list_all()
|
||||
if not nodes:
|
||||
print("no nodes registered")
|
||||
return 0
|
||||
for n in nodes:
|
||||
print(f"{n['name']}\t{n['url']}\ttoken={n['token'][:6]}…")
|
||||
return 0
|
||||
|
||||
if cmd == "approve":
|
||||
reg.add(args.name, args.url, args.token)
|
||||
print(f"approved node {args.name!r} at {args.url}")
|
||||
return 0
|
||||
|
||||
if cmd == "remove":
|
||||
ok = reg.remove(args.name)
|
||||
print(f"removed {args.name!r}" if ok else f"no such node: {args.name!r}")
|
||||
return 0 if ok else 1
|
||||
|
||||
if cmd in ("status", "ping"):
|
||||
entry = reg.get(args.name)
|
||||
if entry is None:
|
||||
print(f"no such node: {args.name!r}", file=sys.stderr)
|
||||
return 1
|
||||
client = NodeClient(entry["url"], entry["token"])
|
||||
try:
|
||||
result = client.ping()
|
||||
except Exception as exc: # noqa: BLE001 — surface any connection error
|
||||
print(json.dumps({"ok": False, "error": str(exc)}))
|
||||
return 1
|
||||
print(json.dumps({"ok": True, "node": args.name, **_coerce_dict(result)}))
|
||||
return 0
|
||||
|
||||
print(f"unknown node command: {cmd!r}", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
|
||||
def _coerce_dict(value: Any) -> dict:
|
||||
return value if isinstance(value, dict) else {"result": value}
|
||||
@@ -0,0 +1,107 @@
|
||||
"""Gateway-side RPC client for a remote meet node.
|
||||
|
||||
Each call opens a short-lived synchronous WebSocket to the node, sends
|
||||
exactly one request, reads exactly one response, and closes. This keeps
|
||||
the client trivial to use from non-async tool handlers and avoids
|
||||
maintaining persistent connection state across agent turns.
|
||||
|
||||
The ``websockets`` package is an optional dep — we import it lazily so
|
||||
plugin load doesn't require it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from plugins.google_meet.node import protocol as _proto
|
||||
|
||||
|
||||
class NodeClient:
|
||||
"""Thin synchronous WS client matching the server's request surface."""
|
||||
|
||||
def __init__(self, url: str, token: str, timeout: float = 10.0) -> None:
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError("url must be a non-empty string")
|
||||
if not isinstance(token, str) or not token:
|
||||
raise ValueError("token must be a non-empty string")
|
||||
self.url = url
|
||||
self.token = token
|
||||
self.timeout = float(timeout)
|
||||
|
||||
# ----- core RPC -----------------------------------------------------
|
||||
|
||||
def _rpc(self, type: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send one request, return the response payload dict.
|
||||
|
||||
Raises RuntimeError when the server sends an ``error`` envelope
|
||||
or the response id doesn't match.
|
||||
"""
|
||||
try:
|
||||
from websockets.sync.client import connect # type: ignore
|
||||
except ImportError as exc:
|
||||
raise RuntimeError(
|
||||
"NodeClient requires the 'websockets' package. "
|
||||
"Install it with: pip install websockets"
|
||||
) from exc
|
||||
|
||||
req = _proto.make_request(type, self.token, payload)
|
||||
raw_out = _proto.encode(req)
|
||||
|
||||
with connect(self.url, open_timeout=self.timeout,
|
||||
close_timeout=self.timeout) as ws:
|
||||
ws.send(raw_out)
|
||||
raw_in = ws.recv(timeout=self.timeout)
|
||||
|
||||
if isinstance(raw_in, (bytes, bytearray)):
|
||||
raw_in = raw_in.decode("utf-8")
|
||||
resp = _proto.decode(raw_in)
|
||||
|
||||
if resp.get("type") == "error":
|
||||
raise RuntimeError(f"node error: {resp.get('error', '<unknown>')}")
|
||||
if resp.get("id") != req["id"]:
|
||||
raise RuntimeError(
|
||||
f"response id mismatch: sent {req['id']}, got {resp.get('id')!r}"
|
||||
)
|
||||
payload_out = resp.get("payload")
|
||||
if not isinstance(payload_out, dict):
|
||||
# Ping returns {"type": "pong", "payload": {...}} — still a dict.
|
||||
raise RuntimeError("response missing payload dict")
|
||||
return payload_out
|
||||
|
||||
# ----- convenience methods -----------------------------------------
|
||||
|
||||
def start_bot(
|
||||
self,
|
||||
url: str,
|
||||
guest_name: str = "Hermes Agent",
|
||||
duration: Optional[str] = None,
|
||||
headed: bool = False,
|
||||
mode: str = "transcribe",
|
||||
) -> Dict[str, Any]:
|
||||
payload: Dict[str, Any] = {
|
||||
"url": url,
|
||||
"guest_name": guest_name,
|
||||
"headed": bool(headed),
|
||||
"mode": mode,
|
||||
}
|
||||
if duration is not None:
|
||||
payload["duration"] = duration
|
||||
return self._rpc("start_bot", payload)
|
||||
|
||||
def stop(self) -> Dict[str, Any]:
|
||||
return self._rpc("stop", {})
|
||||
|
||||
def status(self) -> Dict[str, Any]:
|
||||
return self._rpc("status", {})
|
||||
|
||||
def transcript(self, last: Optional[int] = None) -> Dict[str, Any]:
|
||||
payload: Dict[str, Any] = {}
|
||||
if last is not None:
|
||||
payload["last"] = int(last)
|
||||
return self._rpc("transcript", payload)
|
||||
|
||||
def say(self, text: str) -> Dict[str, Any]:
|
||||
return self._rpc("say", {"text": str(text)})
|
||||
|
||||
def ping(self) -> Dict[str, Any]:
|
||||
return self._rpc("ping", {})
|
||||
@@ -0,0 +1,124 @@
|
||||
"""Wire protocol for gateway ↔ node RPC.
|
||||
|
||||
Everything is a JSON object with the same envelope shape:
|
||||
|
||||
Request: {"type": <str>, "id": <str>, "token": <str>, "payload": <dict>}
|
||||
Response: {"type": "<req-type>_res", "id": <req-id>, "payload": <dict>}
|
||||
Error: {"type": "error", "id": <req-id>, "error": <str>}
|
||||
|
||||
Requests must carry the shared bearer token (set up via
|
||||
``hermes meet node approve`` on the gateway and read off disk on the
|
||||
server). Mismatched tokens are rejected before dispatch.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
|
||||
VALID_REQUEST_TYPES = frozenset({
|
||||
"start_bot",
|
||||
"stop",
|
||||
"status",
|
||||
"transcript",
|
||||
"say",
|
||||
"ping",
|
||||
})
|
||||
|
||||
|
||||
def make_request(
|
||||
type: str,
|
||||
token: str,
|
||||
payload: Dict[str, Any],
|
||||
req_id: str | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Construct a request envelope.
|
||||
|
||||
``req_id`` is auto-generated (uuid4 hex) when not supplied so callers
|
||||
can correlate async responses.
|
||||
"""
|
||||
if not isinstance(type, str) or not type:
|
||||
raise ValueError("type must be a non-empty string")
|
||||
if type not in VALID_REQUEST_TYPES:
|
||||
raise ValueError(f"unknown request type: {type!r}")
|
||||
if not isinstance(token, str):
|
||||
raise ValueError("token must be a string")
|
||||
if not isinstance(payload, dict):
|
||||
raise ValueError("payload must be a dict")
|
||||
return {
|
||||
"type": type,
|
||||
"id": req_id or uuid.uuid4().hex,
|
||||
"token": token,
|
||||
"payload": payload,
|
||||
}
|
||||
|
||||
|
||||
def make_response(req_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Build a success response. The caller supplies the *request* type;
|
||||
we suffix it with ``_res`` so clients can assert they got the right
|
||||
reply.
|
||||
|
||||
For simplicity we don't require the type here — clients usually just
|
||||
key off ``id``. But we still emit a generic ``*_res`` envelope.
|
||||
"""
|
||||
if not isinstance(payload, dict):
|
||||
raise ValueError("payload must be a dict")
|
||||
return {"type": "response", "id": req_id, "payload": payload}
|
||||
|
||||
|
||||
def make_error(req_id: str, error: str) -> Dict[str, Any]:
|
||||
return {"type": "error", "id": req_id, "error": str(error)}
|
||||
|
||||
|
||||
def encode(msg: Dict[str, Any]) -> str:
|
||||
"""Serialize a message envelope to a JSON string."""
|
||||
return json.dumps(msg, separators=(",", ":"), ensure_ascii=False)
|
||||
|
||||
|
||||
def decode(raw: str) -> Dict[str, Any]:
|
||||
"""Parse a JSON envelope, raising ValueError on anything malformed.
|
||||
|
||||
Minimal type validation: must be an object, must contain ``type`` and
|
||||
``id``. Heavier validation (token match, payload shape) happens in
|
||||
:func:`validate_request` on the server side.
|
||||
"""
|
||||
try:
|
||||
obj = json.loads(raw)
|
||||
except (TypeError, json.JSONDecodeError) as exc:
|
||||
raise ValueError(f"malformed JSON: {exc}") from exc
|
||||
if not isinstance(obj, dict):
|
||||
raise ValueError("envelope must be a JSON object")
|
||||
if "type" not in obj or not isinstance(obj["type"], str):
|
||||
raise ValueError("envelope missing string 'type'")
|
||||
if "id" not in obj or not isinstance(obj["id"], str):
|
||||
raise ValueError("envelope missing string 'id'")
|
||||
return obj
|
||||
|
||||
|
||||
def validate_request(msg: Dict[str, Any], expected_token: str) -> Tuple[bool, str]:
|
||||
"""Check a decoded request against the server's shared token.
|
||||
|
||||
Returns ``(True, "")`` when the envelope is acceptable or
|
||||
``(False, <reason>)`` otherwise. Reason strings are safe to surface
|
||||
back to the client in an error envelope.
|
||||
"""
|
||||
if not isinstance(msg, dict):
|
||||
return False, "envelope must be a dict"
|
||||
t = msg.get("type")
|
||||
if not isinstance(t, str) or not t:
|
||||
return False, "missing or non-string 'type'"
|
||||
if t not in VALID_REQUEST_TYPES:
|
||||
return False, f"unknown request type: {t!r}"
|
||||
if not isinstance(msg.get("id"), str) or not msg.get("id"):
|
||||
return False, "missing or non-string 'id'"
|
||||
token = msg.get("token")
|
||||
if not isinstance(token, str) or not token:
|
||||
return False, "missing token"
|
||||
if token != expected_token:
|
||||
return False, "token mismatch"
|
||||
payload = msg.get("payload")
|
||||
if not isinstance(payload, dict):
|
||||
return False, "payload must be a dict"
|
||||
return True, ""
|
||||
@@ -0,0 +1,112 @@
|
||||
"""Local JSON registry of approved remote meet nodes.
|
||||
|
||||
Lives at ``$HERMES_HOME/workspace/meetings/nodes.json``. The gateway
|
||||
consults it to resolve a ``chrome_node`` name to a ``(url, token)`` pair
|
||||
before opening a WebSocket to the remote bot host.
|
||||
|
||||
Schema
|
||||
------
|
||||
{
|
||||
"nodes": {
|
||||
"<name>": {
|
||||
"url": "ws://host:port",
|
||||
"token": "...",
|
||||
"added_at": <epoch_float>
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
|
||||
def _default_path() -> Path:
|
||||
return Path(get_hermes_home()) / "workspace" / "meetings" / "nodes.json"
|
||||
|
||||
|
||||
class NodeRegistry:
|
||||
"""Simple file-backed registry. Not concurrent-safe across processes
|
||||
— single writer assumed (the gateway CLI)."""
|
||||
|
||||
def __init__(self, path: Optional[Path] = None) -> None:
|
||||
self.path = Path(path) if path is not None else _default_path()
|
||||
|
||||
# ----- storage ------------------------------------------------------
|
||||
|
||||
def _load(self) -> Dict[str, Any]:
|
||||
if not self.path.is_file():
|
||||
return {"nodes": {}}
|
||||
try:
|
||||
data = json.loads(self.path.read_text(encoding="utf-8"))
|
||||
except (OSError, json.JSONDecodeError):
|
||||
return {"nodes": {}}
|
||||
if not isinstance(data, dict) or not isinstance(data.get("nodes"), dict):
|
||||
return {"nodes": {}}
|
||||
return data
|
||||
|
||||
def _save(self, data: Dict[str, Any]) -> None:
|
||||
self.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = self.path.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp.replace(self.path)
|
||||
|
||||
# ----- public API ---------------------------------------------------
|
||||
|
||||
def get(self, name: str) -> Optional[Dict[str, Any]]:
|
||||
data = self._load()
|
||||
entry = data["nodes"].get(name)
|
||||
if entry is None:
|
||||
return None
|
||||
return {"name": name, **entry}
|
||||
|
||||
def add(self, name: str, url: str, token: str) -> None:
|
||||
if not isinstance(name, str) or not name:
|
||||
raise ValueError("node name must be a non-empty string")
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError("url must be a non-empty string")
|
||||
if not isinstance(token, str) or not token:
|
||||
raise ValueError("token must be a non-empty string")
|
||||
data = self._load()
|
||||
data["nodes"][name] = {
|
||||
"url": url,
|
||||
"token": token,
|
||||
"added_at": time.time(),
|
||||
}
|
||||
self._save(data)
|
||||
|
||||
def remove(self, name: str) -> bool:
|
||||
data = self._load()
|
||||
if name in data["nodes"]:
|
||||
del data["nodes"][name]
|
||||
self._save(data)
|
||||
return True
|
||||
return False
|
||||
|
||||
def list_all(self) -> List[Dict[str, Any]]:
|
||||
data = self._load()
|
||||
out: List[Dict[str, Any]] = []
|
||||
for name, entry in sorted(data["nodes"].items()):
|
||||
out.append({"name": name, **entry})
|
||||
return out
|
||||
|
||||
def resolve(self, chrome_node: Optional[str]) -> Optional[Dict[str, Any]]:
|
||||
"""Resolve a node name to its entry.
|
||||
|
||||
If ``chrome_node`` is provided, return that named node (or None).
|
||||
If ``chrome_node`` is None, return the sole registered node when
|
||||
exactly one is registered; otherwise return None (ambiguous or
|
||||
empty).
|
||||
"""
|
||||
if chrome_node:
|
||||
return self.get(chrome_node)
|
||||
nodes = self.list_all()
|
||||
if len(nodes) == 1:
|
||||
return nodes[0]
|
||||
return None
|
||||
@@ -0,0 +1,193 @@
|
||||
"""Remote node server.
|
||||
|
||||
Runs on the machine that will host the Meet bot (typically the user's
|
||||
Mac laptop with a signed-in Chrome). Exposes a WebSocket endpoint that
|
||||
accepts signed RPC requests and dispatches them to the existing
|
||||
``plugins.google_meet.process_manager`` module.
|
||||
|
||||
Launched by ``hermes meet node run``.
|
||||
|
||||
Token handling
|
||||
--------------
|
||||
On first boot we mint 32 hex chars of entropy and persist them at
|
||||
``$HERMES_HOME/workspace/meetings/node_token.json``. Subsequent boots
|
||||
reuse the same token so previously-approved gateways don't need to be
|
||||
re-paired. The operator copies this token out-of-band to the gateway
|
||||
via ``hermes meet node approve <name> <url> <token>``.
|
||||
|
||||
Dependencies
|
||||
------------
|
||||
``websockets`` is an optional dep. We import it lazily inside
|
||||
:meth:`serve` so installing the plugin doesn't require it unless you
|
||||
actually host a node.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import secrets
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
from plugins.google_meet.node import protocol as _proto
|
||||
|
||||
|
||||
def _default_token_path() -> Path:
|
||||
return Path(get_hermes_home()) / "workspace" / "meetings" / "node_token.json"
|
||||
|
||||
|
||||
class NodeServer:
|
||||
"""WebSocket server that executes meet bot RPCs locally."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host: str = "0.0.0.0",
|
||||
port: int = 18789,
|
||||
token_path: Optional[Path] = None,
|
||||
display_name: str = "hermes-meet-node",
|
||||
) -> None:
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.display_name = display_name
|
||||
self.token_path = Path(token_path) if token_path is not None else _default_token_path()
|
||||
self._token: Optional[str] = None
|
||||
|
||||
# ----- token management --------------------------------------------
|
||||
|
||||
def ensure_token(self) -> str:
|
||||
"""Return the persisted shared secret, generating one on first use."""
|
||||
if self._token:
|
||||
return self._token
|
||||
if self.token_path.is_file():
|
||||
try:
|
||||
data = json.loads(self.token_path.read_text(encoding="utf-8"))
|
||||
tok = data.get("token")
|
||||
if isinstance(tok, str) and tok:
|
||||
self._token = tok
|
||||
return tok
|
||||
except (OSError, json.JSONDecodeError):
|
||||
pass
|
||||
tok = secrets.token_hex(16) # 32 hex chars
|
||||
self.token_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = self.token_path.with_suffix(".json.tmp")
|
||||
tmp.write_text(
|
||||
json.dumps({"token": tok, "generated_at": time.time()}, indent=2),
|
||||
encoding="utf-8",
|
||||
)
|
||||
tmp.replace(self.token_path)
|
||||
self._token = tok
|
||||
return tok
|
||||
|
||||
def get_token(self) -> str:
|
||||
"""Alias for :meth:`ensure_token`; does not mutate on subsequent calls."""
|
||||
return self.ensure_token()
|
||||
|
||||
# ----- dispatch -----------------------------------------------------
|
||||
|
||||
async def _handle_request(self, msg: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate + dispatch a single decoded request envelope.
|
||||
|
||||
Always returns a response envelope (success or error); never
|
||||
raises. Errors from inside the process_manager are wrapped into
|
||||
the response payload's ``ok``/``error`` keys (which pm already
|
||||
does) rather than being re-encoded as error envelopes — the
|
||||
envelope-level error channel is reserved for auth / protocol
|
||||
failures.
|
||||
"""
|
||||
expected = self.ensure_token()
|
||||
ok, reason = _proto.validate_request(msg, expected)
|
||||
if not ok:
|
||||
return _proto.make_error(str(msg.get("id") or ""), reason)
|
||||
|
||||
req_id = msg["id"]
|
||||
t = msg["type"]
|
||||
payload = msg["payload"]
|
||||
|
||||
# Import lazily so test mocks can monkeypatch freely.
|
||||
from plugins.google_meet import process_manager as pm
|
||||
|
||||
try:
|
||||
if t == "ping":
|
||||
return {"type": "pong", "id": req_id,
|
||||
"payload": {"display_name": self.display_name,
|
||||
"ts": time.time()}}
|
||||
if t == "start_bot":
|
||||
# Whitelist kwargs we pass through to pm.start.
|
||||
kwargs = {
|
||||
k: payload[k]
|
||||
for k in ("url", "guest_name", "duration", "headed",
|
||||
"auth_state", "session_id", "out_dir")
|
||||
if k in payload
|
||||
}
|
||||
if "url" not in kwargs:
|
||||
return _proto.make_error(req_id, "missing 'url' in payload")
|
||||
result = pm.start(**kwargs)
|
||||
return _proto.make_response(req_id, result)
|
||||
if t == "stop":
|
||||
reason_arg = payload.get("reason", "requested")
|
||||
result = pm.stop(reason=reason_arg)
|
||||
return _proto.make_response(req_id, result)
|
||||
if t == "status":
|
||||
return _proto.make_response(req_id, pm.status())
|
||||
if t == "transcript":
|
||||
last = payload.get("last")
|
||||
result = pm.transcript(last=last)
|
||||
return _proto.make_response(req_id, result)
|
||||
if t == "say":
|
||||
# v2 wiring: enqueue into say_queue.jsonl inside the
|
||||
# active meeting's out_dir when present. The bot-side
|
||||
# consumer is v3+ (for v1 this is a stub returning ok).
|
||||
text = payload.get("text", "")
|
||||
active = pm._read_active() # type: ignore[attr-defined]
|
||||
enqueued = False
|
||||
if active and active.get("out_dir"):
|
||||
queue = Path(active["out_dir"]) / "say_queue.jsonl"
|
||||
try:
|
||||
queue.parent.mkdir(parents=True, exist_ok=True)
|
||||
with queue.open("a", encoding="utf-8") as fh:
|
||||
fh.write(json.dumps({"text": text, "ts": time.time()}) + "\n")
|
||||
enqueued = True
|
||||
except OSError:
|
||||
enqueued = False
|
||||
return _proto.make_response(
|
||||
req_id,
|
||||
{"ok": True, "enqueued": enqueued, "text": text},
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001 — surface any pm crash to client
|
||||
return _proto.make_error(req_id, f"{type(exc).__name__}: {exc}")
|
||||
|
||||
return _proto.make_error(req_id, f"unhandled type: {t!r}")
|
||||
|
||||
# ----- server loop --------------------------------------------------
|
||||
|
||||
async def serve(self) -> None:
|
||||
"""Run the WebSocket server until cancelled.
|
||||
|
||||
Blocks forever. Callers typically wrap this in ``asyncio.run``.
|
||||
"""
|
||||
try:
|
||||
import websockets # type: ignore
|
||||
except ImportError as exc:
|
||||
raise RuntimeError(
|
||||
"NodeServer.serve requires the 'websockets' package. "
|
||||
"Install it with: pip install websockets"
|
||||
) from exc
|
||||
|
||||
self.ensure_token()
|
||||
|
||||
async def _handler(ws):
|
||||
async for raw in ws:
|
||||
try:
|
||||
msg = _proto.decode(raw if isinstance(raw, str) else raw.decode("utf-8"))
|
||||
except ValueError as exc:
|
||||
await ws.send(_proto.encode(_proto.make_error("", f"decode: {exc}")))
|
||||
continue
|
||||
reply = await self._handle_request(msg)
|
||||
await ws.send(_proto.encode(reply))
|
||||
|
||||
async with websockets.serve(_handler, self.host, self.port):
|
||||
# Run until cancelled.
|
||||
import asyncio
|
||||
await asyncio.Future()
|
||||
@@ -0,0 +1,16 @@
|
||||
name: google_meet
|
||||
version: 0.2.0
|
||||
description: "Join a Google Meet call, transcribe live captions, speak in realtime, and follow up afterwards. v1 transcribe-only is the default; v2 realtime duplex audio via OpenAI Realtime + BlackHole/PulseAudio ships with mode='realtime'; v3 remote node host lets the bot run on a different machine than the gateway (gateway on Linux, Chrome+signed-in profile on the user's Mac). Explicit-by-design: only joins meet.google.com URLs passed in \u2014 no calendar scanning, no auto-dial."
|
||||
author: NousResearch
|
||||
kind: standalone
|
||||
platforms:
|
||||
- linux
|
||||
- macos
|
||||
provides_tools:
|
||||
- meet_join
|
||||
- meet_leave
|
||||
- meet_status
|
||||
- meet_transcript
|
||||
- meet_say
|
||||
hooks:
|
||||
- on_session_end
|
||||
@@ -0,0 +1,326 @@
|
||||
"""Subprocess lifecycle manager for the google_meet bot.
|
||||
|
||||
Single active meeting at a time. Stores the running pid + out_dir in a
|
||||
session-scoped state file under ``$HERMES_HOME/workspace/meetings/.active.json``
|
||||
so tool calls across turns can find the bot, and ``on_session_end`` can clean
|
||||
it up.
|
||||
|
||||
The bot runs as a detached subprocess — we don't hold file descriptors open,
|
||||
so the parent agent loop can't block on it. We communicate via files only.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
# File + directory layout (under $HERMES_HOME):
|
||||
#
|
||||
# workspace/meetings/
|
||||
# .active.json # pointer to current session's bot
|
||||
# <meeting-id>/
|
||||
# status.json # live bot state (written by bot each tick)
|
||||
# transcript.txt # scraped captions
|
||||
#
|
||||
# .active.json holds:
|
||||
# {"pid": 12345, "meeting_id": "abc-defg-hij", "out_dir": "...",
|
||||
# "url": "https://meet.google.com/...", "started_at": 1714159200.0,
|
||||
# "session_id": "optional"}
|
||||
|
||||
|
||||
def _root() -> Path:
|
||||
return Path(get_hermes_home()) / "workspace" / "meetings"
|
||||
|
||||
|
||||
def _active_file() -> Path:
|
||||
return _root() / ".active.json"
|
||||
|
||||
|
||||
def _read_active() -> Optional[Dict[str, Any]]:
|
||||
p = _active_file()
|
||||
if not p.is_file():
|
||||
return None
|
||||
try:
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _write_active(data: Dict[str, Any]) -> None:
|
||||
p = _active_file()
|
||||
p.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = p.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp.replace(p)
|
||||
|
||||
|
||||
def _clear_active() -> None:
|
||||
try:
|
||||
_active_file().unlink()
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
|
||||
def _pid_alive(pid: int) -> bool:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
# Process exists but we can't signal it — treat as alive.
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API — used by tool handlers + CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def start(
|
||||
url: str,
|
||||
*,
|
||||
out_dir: Optional[Path] = None,
|
||||
headed: bool = False,
|
||||
auth_state: Optional[str] = None,
|
||||
guest_name: str = "Hermes Agent",
|
||||
duration: Optional[str] = None,
|
||||
session_id: Optional[str] = None,
|
||||
mode: str = "transcribe",
|
||||
realtime_model: Optional[str] = None,
|
||||
realtime_voice: Optional[str] = None,
|
||||
realtime_instructions: Optional[str] = None,
|
||||
realtime_api_key: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Spawn the meet_bot subprocess for *url*.
|
||||
|
||||
If a bot is already running for this hermes install, leave it first —
|
||||
we enforce single-active-meeting semantics.
|
||||
|
||||
Returns a dict summarizing the started bot.
|
||||
"""
|
||||
from plugins.google_meet.meet_bot import _is_safe_meet_url, _meeting_id_from_url
|
||||
|
||||
if not _is_safe_meet_url(url):
|
||||
return {
|
||||
"ok": False,
|
||||
"error": (
|
||||
"refusing: only https://meet.google.com/ URLs are allowed. "
|
||||
"got: " + repr(url)
|
||||
),
|
||||
}
|
||||
|
||||
existing = _read_active()
|
||||
if existing and _pid_alive(int(existing.get("pid", 0))):
|
||||
stop(reason="replaced by new meet_join")
|
||||
|
||||
meeting_id = _meeting_id_from_url(url)
|
||||
out = out_dir or (_root() / meeting_id)
|
||||
out.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Wipe any stale transcript/status files from a previous run of this
|
||||
# meeting id so polling isn't confused.
|
||||
for name in ("transcript.txt", "status.json"):
|
||||
f = out / name
|
||||
if f.exists():
|
||||
try:
|
||||
f.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
env = os.environ.copy()
|
||||
env["HERMES_MEET_URL"] = url
|
||||
env["HERMES_MEET_OUT_DIR"] = str(out)
|
||||
env["HERMES_MEET_GUEST_NAME"] = guest_name
|
||||
if headed:
|
||||
env["HERMES_MEET_HEADED"] = "1"
|
||||
if auth_state:
|
||||
env["HERMES_MEET_AUTH_STATE"] = auth_state
|
||||
if duration:
|
||||
env["HERMES_MEET_DURATION"] = duration
|
||||
# v2: realtime mode + passthroughs. The bot defaults to transcribe
|
||||
# mode if HERMES_MEET_MODE isn't set, matching v1 behavior.
|
||||
if mode:
|
||||
env["HERMES_MEET_MODE"] = mode
|
||||
if realtime_model:
|
||||
env["HERMES_MEET_REALTIME_MODEL"] = realtime_model
|
||||
if realtime_voice:
|
||||
env["HERMES_MEET_REALTIME_VOICE"] = realtime_voice
|
||||
if realtime_instructions:
|
||||
env["HERMES_MEET_REALTIME_INSTRUCTIONS"] = realtime_instructions
|
||||
if realtime_api_key:
|
||||
env["HERMES_MEET_REALTIME_KEY"] = realtime_api_key
|
||||
|
||||
log_path = out / "bot.log"
|
||||
# Detach: stdin=devnull, stdout/stderr → log file, new session so parent
|
||||
# signals don't propagate.
|
||||
log_fh = open(log_path, "ab", buffering=0)
|
||||
try:
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "-m", "plugins.google_meet.meet_bot"],
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=log_fh,
|
||||
stderr=subprocess.STDOUT,
|
||||
env=env,
|
||||
start_new_session=True,
|
||||
close_fds=True,
|
||||
)
|
||||
finally:
|
||||
# The subprocess now owns the log fd; we can close ours.
|
||||
log_fh.close()
|
||||
|
||||
record = {
|
||||
"pid": proc.pid,
|
||||
"meeting_id": meeting_id,
|
||||
"out_dir": str(out),
|
||||
"url": url,
|
||||
"started_at": time.time(),
|
||||
"session_id": session_id,
|
||||
"log_path": str(log_path),
|
||||
"mode": mode,
|
||||
}
|
||||
_write_active(record)
|
||||
return {"ok": True, **record}
|
||||
|
||||
|
||||
def status() -> Dict[str, Any]:
|
||||
"""Return the current meeting state, or ``{"ok": False, "reason": ...}``."""
|
||||
active = _read_active()
|
||||
if not active:
|
||||
return {"ok": False, "reason": "no active meeting"}
|
||||
|
||||
pid = int(active.get("pid", 0))
|
||||
alive = _pid_alive(pid) if pid else False
|
||||
|
||||
status_path = Path(active.get("out_dir", "")) / "status.json"
|
||||
bot_status: Dict[str, Any] = {}
|
||||
if status_path.is_file():
|
||||
try:
|
||||
bot_status = json.loads(status_path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"alive": alive,
|
||||
"pid": pid,
|
||||
"meetingId": active.get("meeting_id"),
|
||||
"url": active.get("url"),
|
||||
"startedAt": active.get("started_at"),
|
||||
"outDir": active.get("out_dir"),
|
||||
**bot_status,
|
||||
}
|
||||
|
||||
|
||||
def transcript(last: Optional[int] = None) -> Dict[str, Any]:
|
||||
"""Read the current transcript file. Returns ok=False if none exists."""
|
||||
active = _read_active()
|
||||
if not active:
|
||||
return {"ok": False, "reason": "no active meeting"}
|
||||
|
||||
tp = Path(active.get("out_dir", "")) / "transcript.txt"
|
||||
if not tp.is_file():
|
||||
return {
|
||||
"ok": True,
|
||||
"meetingId": active.get("meeting_id"),
|
||||
"lines": [],
|
||||
"total": 0,
|
||||
"path": str(tp),
|
||||
}
|
||||
text = tp.read_text(encoding="utf-8", errors="replace")
|
||||
all_lines = [ln for ln in text.splitlines() if ln.strip()]
|
||||
lines = all_lines[-last:] if last else all_lines
|
||||
return {
|
||||
"ok": True,
|
||||
"meetingId": active.get("meeting_id"),
|
||||
"lines": lines,
|
||||
"total": len(all_lines),
|
||||
"path": str(tp),
|
||||
}
|
||||
|
||||
|
||||
def enqueue_say(text: str) -> Dict[str, Any]:
|
||||
"""Append a ``say`` request to the active bot's JSONL queue.
|
||||
|
||||
Returns ``{"ok": False, "reason": ...}`` when no meeting is active or
|
||||
the active bot is in transcribe-only mode. Otherwise writes a line to
|
||||
``<out_dir>/say_queue.jsonl`` that the bot's realtime speaker thread
|
||||
will consume.
|
||||
"""
|
||||
import uuid
|
||||
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return {"ok": False, "reason": "text is required"}
|
||||
|
||||
active = _read_active()
|
||||
if not active:
|
||||
return {"ok": False, "reason": "no active meeting"}
|
||||
if active.get("mode") != "realtime":
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": (
|
||||
"active meeting is in transcribe mode — pass mode='realtime' "
|
||||
"to meet_join to enable agent speech"
|
||||
),
|
||||
}
|
||||
|
||||
out_dir = Path(active.get("out_dir", ""))
|
||||
if not out_dir.is_dir():
|
||||
return {"ok": False, "reason": f"out_dir missing: {out_dir}"}
|
||||
|
||||
queue_path = out_dir / "say_queue.jsonl"
|
||||
entry = {"id": uuid.uuid4().hex[:12], "text": text}
|
||||
with queue_path.open("a", encoding="utf-8") as f:
|
||||
f.write(json.dumps(entry) + "\n")
|
||||
return {
|
||||
"ok": True,
|
||||
"meetingId": active.get("meeting_id"),
|
||||
"enqueued_id": entry["id"],
|
||||
"queue_path": str(queue_path),
|
||||
}
|
||||
|
||||
|
||||
def stop(*, reason: str = "requested") -> Dict[str, Any]:
|
||||
"""Signal the active bot to leave cleanly, then clear the active pointer.
|
||||
|
||||
Sends SIGTERM and waits up to 10s for the bot to exit. Falls back to
|
||||
SIGKILL if the bot doesn't respond.
|
||||
"""
|
||||
active = _read_active()
|
||||
if not active:
|
||||
return {"ok": False, "reason": "no active meeting"}
|
||||
|
||||
pid = int(active.get("pid", 0))
|
||||
out_dir = active.get("out_dir")
|
||||
transcript_path = Path(out_dir) / "transcript.txt" if out_dir else None
|
||||
|
||||
if pid and _pid_alive(pid):
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
for _ in range(20):
|
||||
if not _pid_alive(pid):
|
||||
break
|
||||
time.sleep(0.5)
|
||||
if _pid_alive(pid):
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
_clear_active()
|
||||
return {
|
||||
"ok": True,
|
||||
"reason": reason,
|
||||
"meetingId": active.get("meeting_id"),
|
||||
"transcriptPath": str(transcript_path) if transcript_path else None,
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
"""Realtime speech subpackage for the google_meet plugin (v2).
|
||||
|
||||
Provides a thin OpenAI Realtime API client and a file-queue speaker
|
||||
wrapper so the Meet bot can play synthesized speech through the
|
||||
virtual audio bridge.
|
||||
"""
|
||||
|
||||
from .openai_client import RealtimeSession, RealtimeSpeaker # noqa: F401
|
||||
|
||||
__all__ = ["RealtimeSession", "RealtimeSpeaker"]
|
||||
@@ -0,0 +1,332 @@
|
||||
"""OpenAI Realtime API WebSocket client + file-queue speaker.
|
||||
|
||||
This module is the "output" side of the v2 voice bridge: it takes text,
|
||||
sends it to the OpenAI Realtime API, receives audio deltas back, and
|
||||
appends the PCM bytes to a file. A separate consumer (the audio
|
||||
bridge) streams that file into Chrome's fake microphone.
|
||||
|
||||
Designed for simplicity: a single synchronous WebSocket connection per
|
||||
speaker, per session. The ``websockets`` package is imported lazily so
|
||||
that importing this module never fails just because the optional dep
|
||||
is missing.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
|
||||
REALTIME_URL = "wss://api.openai.com/v1/realtime"
|
||||
|
||||
|
||||
def _require_websockets():
|
||||
"""Import ``websockets.sync.client.connect`` or raise with hint."""
|
||||
try:
|
||||
from websockets.sync.client import connect as _connect # type: ignore
|
||||
except ImportError as exc: # pragma: no cover - exercised via test
|
||||
raise RuntimeError(
|
||||
"websockets package is required for OpenAI Realtime; "
|
||||
"install with: pip install websockets"
|
||||
) from exc
|
||||
return _connect
|
||||
|
||||
|
||||
class RealtimeSession:
|
||||
"""Minimal sync client for the OpenAI Realtime WebSocket API.
|
||||
|
||||
Usage:
|
||||
sess = RealtimeSession(api_key=..., audio_sink_path=Path("out.pcm"))
|
||||
sess.connect()
|
||||
sess.speak("Hello team.")
|
||||
sess.close()
|
||||
|
||||
Thread safety: ``speak`` and ``cancel_response`` may be called from
|
||||
different threads; a lock serializes WebSocket writes.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str,
|
||||
model: str = "gpt-realtime",
|
||||
voice: str = "alloy",
|
||||
instructions: str = "",
|
||||
audio_sink_path: Optional[Path] = None,
|
||||
sample_rate: int = 24000,
|
||||
) -> None:
|
||||
import threading as _threading
|
||||
self.api_key = api_key
|
||||
self.model = model
|
||||
self.voice = voice
|
||||
self.instructions = instructions
|
||||
self.audio_sink_path = Path(audio_sink_path) if audio_sink_path else None
|
||||
self.sample_rate = sample_rate
|
||||
self._ws: Any = None
|
||||
self._send_lock = _threading.Lock()
|
||||
self._last_response_id: Optional[str] = None
|
||||
# Public counters for status reporting.
|
||||
self.audio_bytes_out: int = 0
|
||||
self.last_audio_out_at: Optional[float] = None
|
||||
|
||||
# ── lifecycle ─────────────────────────────────────────────────────────
|
||||
|
||||
def connect(self) -> None:
|
||||
"""Open WS and send session.update with voice+instructions."""
|
||||
connect = _require_websockets()
|
||||
url = f"{REALTIME_URL}?model={self.model}"
|
||||
headers = [
|
||||
("Authorization", f"Bearer {self.api_key}"),
|
||||
("OpenAI-Beta", "realtime=v1"),
|
||||
]
|
||||
# websockets.sync.client.connect accepts either additional_headers=
|
||||
# (newer) or extra_headers= depending on version; try the newer
|
||||
# name first and fall back.
|
||||
try:
|
||||
self._ws = connect(url, additional_headers=headers)
|
||||
except TypeError:
|
||||
self._ws = connect(url, extra_headers=headers)
|
||||
|
||||
self._send_json(
|
||||
{
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"voice": self.voice,
|
||||
"instructions": self.instructions,
|
||||
"modalities": ["audio", "text"],
|
||||
"output_audio_format": "pcm16",
|
||||
"input_audio_format": "pcm16",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
def close(self) -> None:
|
||||
if self._ws is not None:
|
||||
try:
|
||||
self._ws.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._ws = None
|
||||
|
||||
# ── speaking ──────────────────────────────────────────────────────────
|
||||
|
||||
def speak(self, text: str, timeout: float = 30.0) -> dict:
|
||||
"""Send ``text`` and accumulate the audio response.
|
||||
|
||||
Audio deltas are base64-decoded and appended to
|
||||
``audio_sink_path`` (opened 'ab' and closed per call, so a
|
||||
separate streaming reader can consume whatever is there).
|
||||
"""
|
||||
if self._ws is None:
|
||||
raise RuntimeError("RealtimeSession.connect() must be called first")
|
||||
|
||||
start = time.monotonic()
|
||||
|
||||
self._send_json(
|
||||
{
|
||||
"type": "conversation.item.create",
|
||||
"item": {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": text}],
|
||||
},
|
||||
}
|
||||
)
|
||||
self._send_json(
|
||||
{
|
||||
"type": "response.create",
|
||||
"response": {"modalities": ["audio"]},
|
||||
}
|
||||
)
|
||||
|
||||
bytes_written = 0
|
||||
sink_fp = None
|
||||
if self.audio_sink_path is not None:
|
||||
self.audio_sink_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
sink_fp = open(self.audio_sink_path, "ab")
|
||||
|
||||
try:
|
||||
while True:
|
||||
remaining = timeout - (time.monotonic() - start)
|
||||
if remaining <= 0:
|
||||
raise TimeoutError(
|
||||
f"realtime response did not complete within {timeout}s"
|
||||
)
|
||||
raw = self._recv(timeout=remaining)
|
||||
if raw is None:
|
||||
# Connection closed by peer.
|
||||
break
|
||||
try:
|
||||
frame = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if not isinstance(frame, dict):
|
||||
continue
|
||||
ftype = frame.get("type")
|
||||
if ftype == "response.audio.delta":
|
||||
b64 = frame.get("delta") or frame.get("audio") or ""
|
||||
if b64 and sink_fp is not None:
|
||||
try:
|
||||
chunk = base64.b64decode(b64)
|
||||
except (ValueError, TypeError):
|
||||
chunk = b""
|
||||
if chunk:
|
||||
sink_fp.write(chunk)
|
||||
sink_fp.flush()
|
||||
bytes_written += len(chunk)
|
||||
self.audio_bytes_out += len(chunk)
|
||||
self.last_audio_out_at = time.time()
|
||||
elif ftype == "response.created":
|
||||
rid = (frame.get("response") or {}).get("id")
|
||||
if rid:
|
||||
self._last_response_id = rid
|
||||
elif ftype in ("response.done", "response.completed", "response.cancelled"):
|
||||
break
|
||||
elif ftype == "error":
|
||||
err = frame.get("error") or frame
|
||||
raise RuntimeError(f"realtime error: {err}")
|
||||
# All other frames (response.created, response.output_item.*,
|
||||
# response.audio_transcript.delta, rate_limits.updated, ...)
|
||||
# are ignored for v2.
|
||||
finally:
|
||||
if sink_fp is not None:
|
||||
sink_fp.close()
|
||||
|
||||
duration_ms = (time.monotonic() - start) * 1000.0
|
||||
return {
|
||||
"ok": True,
|
||||
"bytes_written": bytes_written,
|
||||
"duration_ms": duration_ms,
|
||||
}
|
||||
|
||||
# ── ws plumbing ───────────────────────────────────────────────────────
|
||||
|
||||
def cancel_response(self) -> bool:
|
||||
"""Interrupt the in-flight response (barge-in).
|
||||
|
||||
Sends ``response.cancel`` on the current WebSocket so the model
|
||||
stops generating audio immediately. Safe to call at any time;
|
||||
returns True if a cancel was actually sent, False when there's
|
||||
nothing to cancel or the socket isn't open.
|
||||
"""
|
||||
if self._ws is None:
|
||||
return False
|
||||
try:
|
||||
self._send_json({"type": "response.cancel"})
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _send_json(self, payload: dict) -> None:
|
||||
assert self._ws is not None
|
||||
with self._send_lock:
|
||||
self._ws.send(json.dumps(payload))
|
||||
|
||||
def _recv(self, timeout: Optional[float] = None):
|
||||
assert self._ws is not None
|
||||
try:
|
||||
if timeout is None:
|
||||
return self._ws.recv()
|
||||
return self._ws.recv(timeout=timeout)
|
||||
except TypeError:
|
||||
# Older websockets may not accept timeout kwarg.
|
||||
return self._ws.recv()
|
||||
|
||||
|
||||
class RealtimeSpeaker:
|
||||
"""File-based JSONL queue wrapper around :class:`RealtimeSession`.
|
||||
|
||||
Each line in ``queue_path`` is a JSON object of the form
|
||||
``{"id": "<uuid>", "text": "..."}``. Processed lines are appended
|
||||
to ``processed_path`` (if set) and then removed from the queue;
|
||||
if ``processed_path`` is ``None``, processed lines are simply
|
||||
dropped.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
session: RealtimeSession,
|
||||
queue_path: Path,
|
||||
processed_path: Optional[Path] = None,
|
||||
) -> None:
|
||||
self.session = session
|
||||
self.queue_path = Path(queue_path)
|
||||
self.processed_path = Path(processed_path) if processed_path else None
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _read_queue(self) -> list[dict]:
|
||||
if not self.queue_path.exists():
|
||||
return []
|
||||
out: list[dict] = []
|
||||
for line in self.queue_path.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
except ValueError:
|
||||
continue
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
if "id" not in entry:
|
||||
entry["id"] = str(uuid.uuid4())
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
def _rewrite_queue(self, remaining: list[dict]) -> None:
|
||||
if not remaining:
|
||||
# Keep the file but empty — consumers may be watching for
|
||||
# new writes via mtime, and delete-then-recreate is a race.
|
||||
self.queue_path.write_text("")
|
||||
return
|
||||
self.queue_path.write_text(
|
||||
"\n".join(json.dumps(e) for e in remaining) + "\n"
|
||||
)
|
||||
|
||||
def _append_processed(self, entry: dict, result: dict) -> None:
|
||||
if self.processed_path is None:
|
||||
return
|
||||
self.processed_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
record = {"id": entry.get("id"), "text": entry.get("text", ""), "result": result}
|
||||
with open(self.processed_path, "a") as fp:
|
||||
fp.write(json.dumps(record) + "\n")
|
||||
|
||||
# ── main loop ────────────────────────────────────────────────────────
|
||||
|
||||
def run_until_stopped(
|
||||
self,
|
||||
stop_fn: Callable[[], bool],
|
||||
poll_interval: float = 0.5,
|
||||
) -> None:
|
||||
while not stop_fn():
|
||||
entries = self._read_queue()
|
||||
if not entries:
|
||||
time.sleep(poll_interval)
|
||||
continue
|
||||
# Process one at a time; re-check the queue file after each
|
||||
# speak() call because new entries may have arrived.
|
||||
head = entries[0]
|
||||
text = (head.get("text") or "").strip()
|
||||
if text:
|
||||
try:
|
||||
result = self.session.speak(text)
|
||||
except Exception as exc:
|
||||
result = {"ok": False, "error": str(exc)}
|
||||
else:
|
||||
result = {"ok": True, "bytes_written": 0, "duration_ms": 0.0}
|
||||
self._append_processed(head, result)
|
||||
|
||||
# Re-read the queue from disk in case it was appended to
|
||||
# while we were speaking, then drop the head.
|
||||
latest = self._read_queue()
|
||||
if latest and latest[0].get("id") == head.get("id"):
|
||||
self._rewrite_queue(latest[1:])
|
||||
else:
|
||||
# Fallback: drop-by-id anywhere in the queue.
|
||||
self._rewrite_queue(
|
||||
[e for e in latest if e.get("id") != head.get("id")]
|
||||
)
|
||||
@@ -0,0 +1,348 @@
|
||||
"""Agent-facing tools for the google_meet plugin.
|
||||
|
||||
Tools:
|
||||
meet_join — join a Google Meet URL (spawns Playwright bot locally
|
||||
OR on a remote node host via node=<name>)
|
||||
meet_status — report bot liveness + transcript progress
|
||||
meet_transcript — read the current transcript (optional last-N)
|
||||
meet_leave — signal the bot to leave cleanly
|
||||
meet_say — (v2) speak text through the realtime audio bridge.
|
||||
Requires the active meeting to have been joined with
|
||||
mode='realtime'.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from plugins.google_meet import process_manager as pm
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Runtime gate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_meet_requirements() -> bool:
|
||||
"""Return True when the plugin can actually run LOCALLY.
|
||||
|
||||
Gates on:
|
||||
* Python ``playwright`` package importable
|
||||
* the plugin being on a supported platform (Linux or macOS)
|
||||
|
||||
Note: remote-node operation (``node=<name>``) only needs the
|
||||
``websockets`` dep on the gateway side — Chromium lives on the node.
|
||||
But the plugin-level gate keeps the v1 semantics; individual tool
|
||||
handlers relax the requirement when a node is addressed.
|
||||
"""
|
||||
import platform as _p
|
||||
if _p.system().lower() not in ("linux", "darwin"):
|
||||
return False
|
||||
try:
|
||||
import playwright # noqa: F401
|
||||
except ImportError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Node client helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _resolve_node_client(node: Optional[str]):
|
||||
"""Return (NodeClient, node_name) for *node*, or (None, None) to run local.
|
||||
|
||||
Raises RuntimeError with a readable message if the node is named but
|
||||
unresolvable, so the handler can surface a clear error to the agent.
|
||||
"""
|
||||
if node is None or node == "":
|
||||
return None, None
|
||||
from plugins.google_meet.node.registry import NodeRegistry
|
||||
from plugins.google_meet.node.client import NodeClient
|
||||
|
||||
reg = NodeRegistry()
|
||||
entry = reg.resolve(node if node != "auto" else None)
|
||||
if entry is None:
|
||||
raise RuntimeError(
|
||||
f"no registered meet node matches {node!r} — "
|
||||
"run `hermes meet node approve <name> <url> <token>` first"
|
||||
)
|
||||
client = NodeClient(url=entry["url"], token=entry["token"])
|
||||
return client, entry.get("name")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
MEET_JOIN_SCHEMA: Dict[str, Any] = {
|
||||
"name": "meet_join",
|
||||
"description": (
|
||||
"Join a Google Meet call and start scraping live captions into a "
|
||||
"transcript file. Only meet.google.com URLs are accepted; no calendar "
|
||||
"scanning, no auto-dial. Spawns a headless Chromium subprocess that "
|
||||
"runs in parallel with the agent loop — returns immediately. Poll "
|
||||
"with meet_status and read captions with meet_transcript. Reminder "
|
||||
"to the agent: you should announce yourself in the meeting (there is "
|
||||
"no automatic consent announcement)."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Full https://meet.google.com/... URL. Required."
|
||||
),
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["transcribe", "realtime"],
|
||||
"description": (
|
||||
"transcribe (default): listen-only, scrape captions. "
|
||||
"realtime: also enable agent speech via meet_say "
|
||||
"(requires OpenAI Realtime key + platform audio bridge)."
|
||||
),
|
||||
},
|
||||
"guest_name": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Display name to use when joining as guest. Defaults to "
|
||||
"'Hermes Agent'."
|
||||
),
|
||||
},
|
||||
"duration": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Optional max duration before auto-leave (e.g. '30m', "
|
||||
"'2h', '90s'). Omit to stay until meet_leave is called."
|
||||
),
|
||||
},
|
||||
"headed": {
|
||||
"type": "boolean",
|
||||
"description": (
|
||||
"Run Chromium headed instead of headless (debug only). "
|
||||
"Default false."
|
||||
),
|
||||
},
|
||||
"node": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Name of a registered remote node to run the bot on "
|
||||
"(useful when the gateway runs on a headless Linux box "
|
||||
"but the user's Chrome with a signed-in Google profile "
|
||||
"lives on their Mac). Pass 'auto' to use the single "
|
||||
"registered node. Default: run locally. Nodes are "
|
||||
"approved via `hermes meet node approve`."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["url"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
|
||||
|
||||
MEET_STATUS_SCHEMA: Dict[str, Any] = {
|
||||
"name": "meet_status",
|
||||
"description": (
|
||||
"Report the current Meet session state — whether the bot is alive, "
|
||||
"has joined, is sitting in the lobby, number of transcript lines "
|
||||
"captured, and last-caption timestamp."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {"type": "string"},
|
||||
},
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
|
||||
|
||||
MEET_TRANSCRIPT_SCHEMA: Dict[str, Any] = {
|
||||
"name": "meet_transcript",
|
||||
"description": (
|
||||
"Read the scraped transcript for the active Meet session. Returns "
|
||||
"full transcript unless 'last' is set, in which case returns the last "
|
||||
"N lines only."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"last": {
|
||||
"type": "integer",
|
||||
"description": (
|
||||
"Optional: return only the last N caption lines. Useful "
|
||||
"for polling during a meeting without re-reading the "
|
||||
"whole transcript."
|
||||
),
|
||||
"minimum": 1,
|
||||
},
|
||||
"node": {"type": "string"},
|
||||
},
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
|
||||
|
||||
MEET_LEAVE_SCHEMA: Dict[str, Any] = {
|
||||
"name": "meet_leave",
|
||||
"description": (
|
||||
"Leave the active Meet call cleanly, stop caption scraping, and "
|
||||
"finalize the transcript file. Safe to call when no meeting is "
|
||||
"active — returns ok=false with a reason."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"node": {"type": "string"},
|
||||
},
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
|
||||
|
||||
MEET_SAY_SCHEMA: Dict[str, Any] = {
|
||||
"name": "meet_say",
|
||||
"description": (
|
||||
"Speak text into the active Meet call. Requires the active meeting "
|
||||
"to have been joined with mode='realtime'. The text is queued to "
|
||||
"the bot's OpenAI Realtime session; the generated audio is streamed "
|
||||
"into Chrome's fake microphone via a virtual audio device "
|
||||
"(PulseAudio null-sink on Linux, BlackHole on macOS). Returns "
|
||||
"immediately — the actual speech lags by a couple of seconds."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {"type": "string", "description": "Text to speak."},
|
||||
"node": {"type": "string"},
|
||||
},
|
||||
"required": ["text"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _json(obj: Any) -> str:
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
|
||||
|
||||
def _err(msg: str, **extra) -> str:
|
||||
return _json({"success": False, "error": msg, **extra})
|
||||
|
||||
|
||||
def handle_meet_join(args: Dict[str, Any], **_kw) -> str:
|
||||
url = (args.get("url") or "").strip()
|
||||
if not url:
|
||||
return _err("url is required")
|
||||
mode = (args.get("mode") or "transcribe").strip().lower()
|
||||
if mode not in ("transcribe", "realtime"):
|
||||
return _err(f"mode must be 'transcribe' or 'realtime' (got {mode!r})")
|
||||
|
||||
node = args.get("node")
|
||||
try:
|
||||
client, node_name = _resolve_node_client(node)
|
||||
except RuntimeError as e:
|
||||
return _err(str(e))
|
||||
|
||||
if client is not None:
|
||||
# Remote path — delegate to the node host.
|
||||
try:
|
||||
res = client.start_bot(
|
||||
url=url,
|
||||
guest_name=str(args.get("guest_name") or "Hermes Agent"),
|
||||
duration=str(args.get("duration")) if args.get("duration") else None,
|
||||
headed=bool(args.get("headed", False)),
|
||||
mode=mode,
|
||||
)
|
||||
return _json({"success": bool(res.get("ok")), "node": node_name, **res})
|
||||
except Exception as e:
|
||||
return _err(f"remote node start_bot failed: {e}", node=node_name)
|
||||
|
||||
# Local path — same as v1, with v2 params.
|
||||
if not check_meet_requirements():
|
||||
return _err(
|
||||
"google_meet plugin prerequisites missing — install with "
|
||||
"`pip install playwright && python -m playwright install "
|
||||
"chromium`. Plugin is supported on Linux and macOS only."
|
||||
)
|
||||
res = pm.start(
|
||||
url=url,
|
||||
headed=bool(args.get("headed", False)),
|
||||
guest_name=str(args.get("guest_name") or "Hermes Agent"),
|
||||
duration=str(args.get("duration")) if args.get("duration") else None,
|
||||
mode=mode,
|
||||
)
|
||||
return _json({"success": bool(res.get("ok")), **res})
|
||||
|
||||
|
||||
def handle_meet_status(args: Dict[str, Any], **_kw) -> str:
|
||||
try:
|
||||
client, node_name = _resolve_node_client(args.get("node"))
|
||||
except RuntimeError as e:
|
||||
return _err(str(e))
|
||||
if client is not None:
|
||||
try:
|
||||
res = client.status()
|
||||
return _json({"success": bool(res.get("ok")), "node": node_name, **res})
|
||||
except Exception as e:
|
||||
return _err(f"remote node status failed: {e}", node=node_name)
|
||||
res = pm.status()
|
||||
return _json({"success": bool(res.get("ok")), **res})
|
||||
|
||||
|
||||
def handle_meet_transcript(args: Dict[str, Any], **_kw) -> str:
|
||||
last = args.get("last")
|
||||
try:
|
||||
last_i = int(last) if last is not None else None
|
||||
if last_i is not None and last_i < 1:
|
||||
last_i = None
|
||||
except (TypeError, ValueError):
|
||||
last_i = None
|
||||
try:
|
||||
client, node_name = _resolve_node_client(args.get("node"))
|
||||
except RuntimeError as e:
|
||||
return _err(str(e))
|
||||
if client is not None:
|
||||
try:
|
||||
res = client.transcript(last=last_i)
|
||||
return _json({"success": bool(res.get("ok")), "node": node_name, **res})
|
||||
except Exception as e:
|
||||
return _err(f"remote node transcript failed: {e}", node=node_name)
|
||||
res = pm.transcript(last=last_i)
|
||||
return _json({"success": bool(res.get("ok")), **res})
|
||||
|
||||
|
||||
def handle_meet_leave(args: Dict[str, Any], **_kw) -> str:
|
||||
try:
|
||||
client, node_name = _resolve_node_client(args.get("node"))
|
||||
except RuntimeError as e:
|
||||
return _err(str(e))
|
||||
if client is not None:
|
||||
try:
|
||||
res = client.stop()
|
||||
return _json({"success": bool(res.get("ok")), "node": node_name, **res})
|
||||
except Exception as e:
|
||||
return _err(f"remote node stop failed: {e}", node=node_name)
|
||||
res = pm.stop(reason="agent called meet_leave")
|
||||
return _json({"success": bool(res.get("ok")), **res})
|
||||
|
||||
|
||||
def handle_meet_say(args: Dict[str, Any], **_kw) -> str:
|
||||
text = (args.get("text") or "").strip()
|
||||
if not text:
|
||||
return _err("text is required")
|
||||
try:
|
||||
client, node_name = _resolve_node_client(args.get("node"))
|
||||
except RuntimeError as e:
|
||||
return _err(str(e))
|
||||
if client is not None:
|
||||
try:
|
||||
res = client.say(text)
|
||||
return _json({"success": bool(res.get("ok")), "node": node_name, **res})
|
||||
except Exception as e:
|
||||
return _err(f"remote node say failed: {e}", node=node_name)
|
||||
res = pm.enqueue_say(text)
|
||||
return _json({"success": bool(res.get("ok")), **res})
|
||||
@@ -526,16 +526,24 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
|
||||
print("\n Configuring Hindsight memory:\n")
|
||||
|
||||
existing_config = self._config if isinstance(self._config, dict) else _load_config()
|
||||
if not isinstance(existing_config, dict):
|
||||
existing_config = {}
|
||||
|
||||
# Step 1: Mode selection
|
||||
mode_values = ["cloud", "local_embedded", "local_external"]
|
||||
mode_items = [
|
||||
("Cloud", "Hindsight Cloud API (lightweight, just needs an API key)"),
|
||||
("Local Embedded", "Run Hindsight locally (downloads ~200MB, needs LLM key)"),
|
||||
("Local External", "Connect to an existing Hindsight instance"),
|
||||
]
|
||||
mode_idx = _curses_select(" Select mode", mode_items, default=0)
|
||||
mode = ["cloud", "local_embedded", "local_external"][mode_idx]
|
||||
existing_mode = existing_config.get("mode")
|
||||
mode_default_idx = mode_values.index(existing_mode) if existing_mode in mode_values else 0
|
||||
mode_idx = _curses_select(" Select mode", mode_items, default=mode_default_idx)
|
||||
mode = mode_values[mode_idx]
|
||||
|
||||
provider_config: dict = {"mode": mode}
|
||||
provider_config: dict = dict(existing_config)
|
||||
provider_config["mode"] = mode
|
||||
env_writes: dict = {}
|
||||
|
||||
# Step 2: Install/upgrade deps for selected mode
|
||||
@@ -601,21 +609,29 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
(p, f"default model: {_PROVIDER_DEFAULT_MODELS[p]}")
|
||||
for p in providers_list
|
||||
]
|
||||
llm_idx = _curses_select(" Select LLM provider", llm_items, default=0)
|
||||
existing_llm_provider = provider_config.get("llm_provider")
|
||||
llm_default_idx = providers_list.index(existing_llm_provider) if existing_llm_provider in providers_list else 0
|
||||
llm_idx = _curses_select(" Select LLM provider", llm_items, default=llm_default_idx)
|
||||
llm_provider = providers_list[llm_idx]
|
||||
|
||||
provider_config["llm_provider"] = llm_provider
|
||||
|
||||
if llm_provider == "openai_compatible":
|
||||
val = input(" LLM endpoint URL (e.g. http://192.168.1.10:8080/v1): ").strip()
|
||||
existing_base_url = provider_config.get("llm_base_url", "")
|
||||
prompt = " LLM endpoint URL (e.g. http://192.168.1.10:8080/v1)"
|
||||
if existing_base_url:
|
||||
prompt += f" [{existing_base_url}]"
|
||||
prompt += ": "
|
||||
val = input(prompt).strip()
|
||||
if val:
|
||||
provider_config["llm_base_url"] = val
|
||||
elif llm_provider == "openrouter":
|
||||
provider_config["llm_base_url"] = "https://openrouter.ai/api/v1"
|
||||
|
||||
default_model = _PROVIDER_DEFAULT_MODELS.get(llm_provider, "gpt-4o-mini")
|
||||
val = input(f" LLM model [{default_model}]: ").strip()
|
||||
provider_config["llm_model"] = val or default_model
|
||||
provider_default_model = _PROVIDER_DEFAULT_MODELS.get(llm_provider, "gpt-4o-mini")
|
||||
current_model = provider_config.get("llm_model") or provider_default_model
|
||||
val = input(f" LLM model [{current_model}]: ").strip()
|
||||
provider_config["llm_model"] = val or current_model
|
||||
|
||||
sys.stdout.write(" LLM API key: ")
|
||||
sys.stdout.flush()
|
||||
@@ -633,15 +649,16 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
env_writes["HINDSIGHT_LLM_API_KEY"] = existing_llm_key
|
||||
|
||||
# Step 4: Save everything
|
||||
provider_config["bank_id"] = "hermes"
|
||||
provider_config["recall_budget"] = "mid"
|
||||
# Read existing timeout from config if present, otherwise use default
|
||||
existing_timeout = self._config.get("timeout") if self._config else None
|
||||
timeout_val = existing_timeout if existing_timeout else _DEFAULT_TIMEOUT
|
||||
provider_config.setdefault("bank_id", "hermes")
|
||||
provider_config.setdefault("recall_budget", "mid")
|
||||
# Read existing timeout from config if present, otherwise use default.
|
||||
# Preserve explicit 0 values instead of treating them as blank.
|
||||
existing_timeout = provider_config.get("timeout")
|
||||
timeout_val = existing_timeout if existing_timeout is not None else _DEFAULT_TIMEOUT
|
||||
provider_config["timeout"] = timeout_val
|
||||
env_writes["HINDSIGHT_TIMEOUT"] = str(timeout_val)
|
||||
if mode == "local_embedded":
|
||||
existing_idle_timeout = self._config.get("idle_timeout") if self._config else None
|
||||
existing_idle_timeout = provider_config.get("idle_timeout")
|
||||
idle_timeout_val = existing_idle_timeout if existing_idle_timeout is not None else _DEFAULT_IDLE_TIMEOUT
|
||||
provider_config["idle_timeout"] = idle_timeout_val
|
||||
env_writes["HINDSIGHT_IDLE_TIMEOUT"] = str(idle_timeout_val)
|
||||
@@ -1204,7 +1221,6 @@ class HindsightMemoryProvider(MemoryProvider):
|
||||
|
||||
def _sync():
|
||||
try:
|
||||
client = self._get_client()
|
||||
item = self._build_retain_kwargs(
|
||||
content,
|
||||
context=self._retain_context,
|
||||
|
||||
+385
-33
@@ -74,6 +74,12 @@ from model_tools import (
|
||||
check_toolset_requirements,
|
||||
)
|
||||
from tools.terminal_tool import cleanup_vm, get_active_env, is_persistent_env
|
||||
from tools.terminal_tool import (
|
||||
set_approval_callback as _set_approval_callback,
|
||||
set_sudo_password_callback as _set_sudo_password_callback,
|
||||
_get_approval_callback,
|
||||
_get_sudo_password_callback,
|
||||
)
|
||||
from tools.tool_result_storage import maybe_persist_tool_result, enforce_turn_budget
|
||||
from tools.interrupt import set_interrupt as _set_interrupt
|
||||
from tools.browser_tool import cleanup_browser
|
||||
@@ -86,6 +92,7 @@ from agent.error_classifier import classify_api_error, FailoverReason
|
||||
from agent.prompt_builder import (
|
||||
DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
|
||||
MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
|
||||
HERMES_AGENT_HELP_GUIDANCE,
|
||||
build_nous_subscription_prompt,
|
||||
)
|
||||
from agent.model_metadata import (
|
||||
@@ -2416,7 +2423,10 @@ class AIAgent:
|
||||
if not self.compression_enabled:
|
||||
return
|
||||
try:
|
||||
from agent.auxiliary_client import get_text_auxiliary_client
|
||||
from agent.auxiliary_client import (
|
||||
_resolve_task_provider_model,
|
||||
get_text_auxiliary_client,
|
||||
)
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_model_context_length,
|
||||
@@ -2426,6 +2436,14 @@ class AIAgent:
|
||||
"compression",
|
||||
main_runtime=self._current_main_runtime(),
|
||||
)
|
||||
# Best-effort aux provider label for the warning message. The
|
||||
# configured provider may be "auto", in which case we fall back
|
||||
# to the client's base_url hostname so the user can still tell
|
||||
# where the compression model is actually being called.
|
||||
try:
|
||||
_aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
|
||||
except Exception:
|
||||
_aux_cfg_provider = ""
|
||||
if client is None or not aux_model:
|
||||
msg = (
|
||||
"⚠ No auxiliary LLM provider configured — context "
|
||||
@@ -2492,10 +2510,37 @@ class AIAgent:
|
||||
new_threshold / main_ctx
|
||||
)
|
||||
safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
|
||||
# Build human-readable "model (provider)" labels for both
|
||||
# the main model and the compression model so users can
|
||||
# tell at a glance which provider each side is actually
|
||||
# using. When the configured provider is empty or "auto",
|
||||
# fall back to the client's base_url hostname.
|
||||
_main_model = getattr(self, "model", "") or "?"
|
||||
_main_provider = getattr(self, "provider", "") or ""
|
||||
_aux_provider_label = (
|
||||
_aux_cfg_provider
|
||||
if _aux_cfg_provider and _aux_cfg_provider != "auto"
|
||||
else ""
|
||||
)
|
||||
if not _aux_provider_label:
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
_aux_provider_label = (
|
||||
urlparse(aux_base_url).hostname or aux_base_url
|
||||
)
|
||||
except Exception:
|
||||
_aux_provider_label = aux_base_url or "auto"
|
||||
_main_label = (
|
||||
f"{_main_model} ({_main_provider})"
|
||||
if _main_provider
|
||||
else _main_model
|
||||
)
|
||||
_aux_label = f"{aux_model} ({_aux_provider_label})"
|
||||
msg = (
|
||||
f"⚠ Compression model ({aux_model}) context is "
|
||||
f"{aux_context:,} tokens, but the main model's "
|
||||
f"compression threshold was {old_threshold:,} tokens. "
|
||||
f"⚠ Compression model {_aux_label} context is "
|
||||
f"{aux_context:,} tokens, but the main model "
|
||||
f"{_main_label}'s compression threshold was "
|
||||
f"{old_threshold:,} tokens. "
|
||||
f"Auto-lowered this session's threshold to "
|
||||
f"{new_threshold:,} tokens so compression can run.\n"
|
||||
f" To make this permanent, edit config.yaml — either:\n"
|
||||
@@ -3240,6 +3285,21 @@ class AIAgent:
|
||||
|
||||
def _run_review():
|
||||
import contextlib
|
||||
# Install a non-interactive approval callback on this worker
|
||||
# thread so any dangerous-command guard the review agent trips
|
||||
# resolves to "deny" instead of falling back to input() -- which
|
||||
# deadlocks against the parent's prompt_toolkit TUI (#15216).
|
||||
# Same pattern as _subagent_auto_deny in tools/delegate_tool.py.
|
||||
def _bg_review_auto_deny(command, description, **kwargs):
|
||||
logger.warning(
|
||||
"Background review auto-denied dangerous command: %s (%s)",
|
||||
command, description,
|
||||
)
|
||||
return "deny"
|
||||
try:
|
||||
_set_approval_callback(_bg_review_auto_deny)
|
||||
except Exception:
|
||||
pass
|
||||
review_agent = None
|
||||
try:
|
||||
with open(os.devnull, "w") as _devnull, \
|
||||
@@ -3265,6 +3325,7 @@ class AIAgent:
|
||||
api_key=_parent_runtime.get("api_key") or None,
|
||||
credential_pool=getattr(self, "_credential_pool", None),
|
||||
parent_session_id=self.session_id,
|
||||
enabled_toolsets=["memory", "skills"],
|
||||
)
|
||||
review_agent._memory_write_origin = "background_review"
|
||||
review_agent._memory_write_context = "background_review"
|
||||
@@ -3321,6 +3382,12 @@ class AIAgent:
|
||||
review_agent.close()
|
||||
except Exception:
|
||||
pass
|
||||
# Clear the approval callback on this bg-review thread so a
|
||||
# recycled thread-id doesn't inherit a stale reference.
|
||||
try:
|
||||
_set_approval_callback(None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
t = threading.Thread(target=_run_review, daemon=True, name="bg-review")
|
||||
t.start()
|
||||
@@ -4498,6 +4565,9 @@ class AIAgent:
|
||||
# Fallback to hardcoded identity
|
||||
prompt_parts = [DEFAULT_AGENT_IDENTITY]
|
||||
|
||||
# Pointer to the hermes-agent skill + docs for user questions about Hermes itself.
|
||||
prompt_parts.append(HERMES_AGENT_HELP_GUIDANCE)
|
||||
|
||||
# Tool-aware behavioral guidance: only inject when the tools are loaded
|
||||
tool_guidance = []
|
||||
if "memory" in self.valid_tool_names:
|
||||
@@ -5226,7 +5296,39 @@ class AIAgent:
|
||||
logger.debug("Dead connection check error: %s", exc)
|
||||
return False
|
||||
|
||||
def _create_request_openai_client(self, *, reason: str) -> Any:
|
||||
@staticmethod
|
||||
def _api_kwargs_have_image_parts(api_kwargs: dict) -> bool:
|
||||
"""Return True when the outbound request still contains native image parts."""
|
||||
if not isinstance(api_kwargs, dict):
|
||||
return False
|
||||
candidates = []
|
||||
messages = api_kwargs.get("messages")
|
||||
if isinstance(messages, list):
|
||||
candidates.extend(messages)
|
||||
# Responses API payloads use `input`; after conversion, image parts can
|
||||
# still be present there instead of in `messages`.
|
||||
response_input = api_kwargs.get("input")
|
||||
if isinstance(response_input, list):
|
||||
candidates.extend(response_input)
|
||||
|
||||
def _contains_image(value: Any) -> bool:
|
||||
if isinstance(value, dict):
|
||||
ptype = value.get("type")
|
||||
if ptype in {"image_url", "input_image"}:
|
||||
return True
|
||||
return any(_contains_image(v) for v in value.values())
|
||||
if isinstance(value, list):
|
||||
return any(_contains_image(v) for v in value)
|
||||
return False
|
||||
|
||||
return any(_contains_image(item) for item in candidates)
|
||||
|
||||
def _copilot_headers_for_request(self, *, is_vision: bool) -> dict:
|
||||
from hermes_cli.copilot_auth import copilot_request_headers
|
||||
|
||||
return copilot_request_headers(is_agent_turn=True, is_vision=is_vision)
|
||||
|
||||
def _create_request_openai_client(self, *, reason: str, api_kwargs: Optional[dict] = None) -> Any:
|
||||
from unittest.mock import Mock
|
||||
|
||||
primary_client = self._ensure_primary_openai_client(reason=reason)
|
||||
@@ -5234,6 +5336,11 @@ class AIAgent:
|
||||
return primary_client
|
||||
with self._openai_client_lock():
|
||||
request_kwargs = dict(self._client_kwargs)
|
||||
if (
|
||||
base_url_host_matches(str(request_kwargs.get("base_url", "")), "api.githubcopilot.com")
|
||||
and self._api_kwargs_have_image_parts(api_kwargs or {})
|
||||
):
|
||||
request_kwargs["default_headers"] = self._copilot_headers_for_request(is_vision=True)
|
||||
return self._create_openai_client(request_kwargs, reason=reason, shared=False)
|
||||
|
||||
def _close_request_openai_client(self, client: Any, *, reason: str) -> None:
|
||||
@@ -5776,7 +5883,10 @@ class AIAgent:
|
||||
def _call():
|
||||
try:
|
||||
if self.api_mode == "codex_responses":
|
||||
request_client_holder["client"] = self._create_request_openai_client(reason="codex_stream_request")
|
||||
request_client_holder["client"] = self._create_request_openai_client(
|
||||
reason="codex_stream_request",
|
||||
api_kwargs=api_kwargs,
|
||||
)
|
||||
result["response"] = self._run_codex_stream(
|
||||
api_kwargs,
|
||||
client=request_client_holder["client"],
|
||||
@@ -5808,7 +5918,10 @@ class AIAgent:
|
||||
raise
|
||||
result["response"] = normalize_converse_response(raw_response)
|
||||
else:
|
||||
request_client_holder["client"] = self._create_request_openai_client(reason="chat_completion_request")
|
||||
request_client_holder["client"] = self._create_request_openai_client(
|
||||
reason="chat_completion_request",
|
||||
api_kwargs=api_kwargs,
|
||||
)
|
||||
result["response"] = request_client_holder["client"].chat.completions.create(**api_kwargs)
|
||||
except Exception as e:
|
||||
result["error"] = e
|
||||
@@ -6151,7 +6264,8 @@ class AIAgent:
|
||||
),
|
||||
}
|
||||
request_client_holder["client"] = self._create_request_openai_client(
|
||||
reason="chat_completion_stream_request"
|
||||
reason="chat_completion_stream_request",
|
||||
api_kwargs=stream_kwargs,
|
||||
)
|
||||
# Reset stale-stream timer so the detector measures from this
|
||||
# attempt's start, not a previous attempt's last chunk.
|
||||
@@ -7283,6 +7397,26 @@ class AIAgent:
|
||||
self._anthropic_image_fallback_cache[cache_key] = note
|
||||
return note
|
||||
|
||||
def _model_supports_vision(self) -> bool:
|
||||
"""Return True if the active provider+model reports native vision.
|
||||
|
||||
Used to decide whether to strip image content parts from API-bound
|
||||
messages (for non-vision models) or let the provider adapter handle
|
||||
them natively (for vision-capable models).
|
||||
"""
|
||||
try:
|
||||
from agent.models_dev import get_model_capabilities
|
||||
provider = (getattr(self, "provider", "") or "").strip()
|
||||
model = (getattr(self, "model", "") or "").strip()
|
||||
if not provider or not model:
|
||||
return False
|
||||
caps = get_model_capabilities(provider, model)
|
||||
if caps is None:
|
||||
return False
|
||||
return bool(caps.supports_vision)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
|
||||
if not self._content_has_image_parts(content):
|
||||
return content
|
||||
@@ -7346,12 +7480,23 @@ class AIAgent:
|
||||
return t
|
||||
|
||||
def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
|
||||
# Fast exit when no message carries image content at all.
|
||||
if not any(
|
||||
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
|
||||
for msg in api_messages
|
||||
):
|
||||
return api_messages
|
||||
|
||||
# The Anthropic adapter (agent/anthropic_adapter.py:_convert_content_part_to_anthropic)
|
||||
# already translates OpenAI-style image_url/input_image parts into
|
||||
# native Anthropic ``{"type": "image", "source": ...}`` blocks. When
|
||||
# the active model supports vision we let the adapter do its job and
|
||||
# skip this legacy text-fallback preprocessor entirely.
|
||||
if self._model_supports_vision():
|
||||
return api_messages
|
||||
|
||||
# Non-vision Anthropic model (rare today, but keep the fallback for
|
||||
# compat): replace each image part with a vision_analyze text note.
|
||||
transformed = copy.deepcopy(api_messages)
|
||||
for msg in transformed:
|
||||
if not isinstance(msg, dict):
|
||||
@@ -7362,6 +7507,150 @@ class AIAgent:
|
||||
)
|
||||
return transformed
|
||||
|
||||
def _prepare_messages_for_non_vision_model(self, api_messages: list) -> list:
|
||||
"""Strip native image parts when the active model lacks vision.
|
||||
|
||||
Runs on the chat.completions / codex_responses paths. Vision-capable
|
||||
models pass through unchanged (provider and any downstream translator
|
||||
handle the image parts natively). Non-vision models get each image
|
||||
replaced by a cached vision_analyze text description so the turn
|
||||
doesn't fail with "model does not support image input".
|
||||
"""
|
||||
if not any(
|
||||
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
|
||||
for msg in api_messages
|
||||
):
|
||||
return api_messages
|
||||
|
||||
if self._model_supports_vision():
|
||||
return api_messages
|
||||
|
||||
transformed = copy.deepcopy(api_messages)
|
||||
for msg in transformed:
|
||||
if not isinstance(msg, dict):
|
||||
continue
|
||||
# Reuse the Anthropic text-fallback preprocessor — the behaviour is
|
||||
# identical (walk content parts, replace images with cached
|
||||
# descriptions, merge back into a single text or structured
|
||||
# content). Naming is historical.
|
||||
msg["content"] = self._preprocess_anthropic_content(
|
||||
msg.get("content"),
|
||||
str(msg.get("role", "user") or "user"),
|
||||
)
|
||||
return transformed
|
||||
|
||||
def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
|
||||
"""Re-encode all native image parts at a smaller size to recover from
|
||||
image-too-large errors (Anthropic 5 MB, unknown other providers).
|
||||
|
||||
Mutates ``api_messages`` in place. Returns True if any image part was
|
||||
actually replaced, False if there were no image parts to shrink or
|
||||
Pillow couldn't help (caller should surface the original error).
|
||||
|
||||
Strategy: look for ``image_url`` / ``input_image`` parts carrying a
|
||||
``data:image/...;base64,...`` payload. For each one whose encoded
|
||||
size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB
|
||||
ceiling with header overhead), write the base64 to a tempfile, call
|
||||
``vision_tools._resize_image_for_vision`` to produce a smaller data
|
||||
URL, and substitute it in place.
|
||||
|
||||
Non-data-URL images (http/https URLs) are not touched — the provider
|
||||
fetches those itself and the size limit is different.
|
||||
"""
|
||||
if not api_messages:
|
||||
return False
|
||||
|
||||
try:
|
||||
from tools.vision_tools import _resize_image_for_vision
|
||||
except Exception as exc:
|
||||
logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc)
|
||||
return False
|
||||
|
||||
# 4 MB target leaves comfortable headroom under Anthropic's 5 MB.
|
||||
# Non-Anthropic providers we haven't observed rejecting are fine with
|
||||
# much larger; shrinking to 4 MB here loses quality but only fires
|
||||
# after a confirmed provider rejection, so the alternative is failure.
|
||||
target_bytes = 4 * 1024 * 1024
|
||||
changed_count = 0
|
||||
|
||||
def _shrink_data_url(url: str) -> Optional[str]:
|
||||
"""Return a smaller data URL, or None if shrink can't help."""
|
||||
if not isinstance(url, str) or not url.startswith("data:"):
|
||||
return None
|
||||
if len(url) <= target_bytes:
|
||||
# This specific image wasn't the oversized one.
|
||||
return None
|
||||
try:
|
||||
header, _, data = url.partition(",")
|
||||
mime = "image/jpeg"
|
||||
if header.startswith("data:"):
|
||||
mime_part = header[len("data:"):].split(";", 1)[0].strip()
|
||||
if mime_part.startswith("image/"):
|
||||
mime = mime_part
|
||||
import base64 as _b64
|
||||
raw = _b64.b64decode(data)
|
||||
suffix = {
|
||||
"image/png": ".png", "image/gif": ".gif", "image/webp": ".webp",
|
||||
"image/jpeg": ".jpg", "image/jpg": ".jpg", "image/bmp": ".bmp",
|
||||
}.get(mime, ".jpg")
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
prefix="hermes_shrink_", suffix=suffix, delete=False,
|
||||
)
|
||||
try:
|
||||
tmp.write(raw)
|
||||
tmp.close()
|
||||
resized = _resize_image_for_vision(
|
||||
Path(tmp.name),
|
||||
mime_type=mime,
|
||||
max_base64_bytes=target_bytes,
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
Path(tmp.name).unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
if not resized or len(resized) >= len(url):
|
||||
# Shrink didn't help (or made it bigger — corrupt input?).
|
||||
return None
|
||||
return resized
|
||||
except Exception as exc:
|
||||
logger.warning("image-shrink recovery: re-encode failed — %s", exc)
|
||||
return None
|
||||
|
||||
for msg in api_messages:
|
||||
if not isinstance(msg, dict):
|
||||
continue
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, list):
|
||||
continue
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
continue
|
||||
ptype = part.get("type")
|
||||
if ptype not in {"image_url", "input_image"}:
|
||||
continue
|
||||
image_value = part.get("image_url")
|
||||
# OpenAI chat.completions: {"image_url": {"url": "data:..."}}
|
||||
# OpenAI Responses: {"image_url": "data:..."}
|
||||
if isinstance(image_value, dict):
|
||||
url = image_value.get("url", "")
|
||||
resized = _shrink_data_url(url)
|
||||
if resized:
|
||||
image_value["url"] = resized
|
||||
changed_count += 1
|
||||
elif isinstance(image_value, str):
|
||||
resized = _shrink_data_url(image_value)
|
||||
if resized:
|
||||
part["image_url"] = resized
|
||||
changed_count += 1
|
||||
|
||||
if changed_count:
|
||||
logger.info(
|
||||
"image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB",
|
||||
changed_count, target_bytes / (1024 * 1024),
|
||||
)
|
||||
return changed_count > 0
|
||||
|
||||
def _anthropic_preserve_dots(self) -> bool:
|
||||
"""True when using an anthropic-compatible endpoint that preserves dots in model names.
|
||||
Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
|
||||
@@ -7510,9 +7799,10 @@ class AIAgent:
|
||||
)
|
||||
)
|
||||
is_xai_responses = self.provider == "xai" or self._base_url_hostname == "api.x.ai"
|
||||
_msgs_for_codex = self._prepare_messages_for_non_vision_model(api_messages)
|
||||
return _ct.build_kwargs(
|
||||
model=self.model,
|
||||
messages=api_messages,
|
||||
messages=_msgs_for_codex,
|
||||
tools=self.tools,
|
||||
reasoning_config=self.reasoning_config,
|
||||
session_id=getattr(self, "session_id", None),
|
||||
@@ -7591,9 +7881,12 @@ class AIAgent:
|
||||
if _ephemeral_out is not None:
|
||||
self._ephemeral_max_output_tokens = None
|
||||
|
||||
# Strip image parts for non-vision models (no-op when vision-capable).
|
||||
_msgs_for_chat = self._prepare_messages_for_non_vision_model(api_messages)
|
||||
|
||||
return _ct.build_kwargs(
|
||||
model=self.model,
|
||||
messages=api_messages,
|
||||
messages=_msgs_for_chat,
|
||||
tools=self.tools,
|
||||
timeout=self._resolved_api_call_timeout(),
|
||||
max_tokens=self.max_tokens,
|
||||
@@ -7890,39 +8183,45 @@ class AIAgent:
|
||||
api_msg["reasoning_content"] = existing
|
||||
return
|
||||
|
||||
# 2. Healthy session: promote 'reasoning' field to 'reasoning_content'
|
||||
needs_thinking_pad = (
|
||||
self._needs_kimi_tool_reasoning()
|
||||
or self._needs_deepseek_tool_reasoning()
|
||||
)
|
||||
|
||||
# 2. Cross-provider poisoned history (#15748): on DeepSeek/Kimi,
|
||||
# if the source turn has tool_calls AND a 'reasoning' field but no
|
||||
# 'reasoning_content' key, the 'reasoning' text was written by a
|
||||
# prior provider (e.g. MiniMax) — DeepSeek's own _build_assistant_message
|
||||
# always pins reasoning_content="" at creation time for tool-call turns,
|
||||
# so the shape (reasoning set, reasoning_content absent, tool_calls
|
||||
# present) is unreachable from same-provider DeepSeek history. Inject
|
||||
# "" to satisfy the API without leaking another provider's chain of
|
||||
# thought to DeepSeek/Kimi.
|
||||
normalized_reasoning = source_msg.get("reasoning")
|
||||
if (
|
||||
needs_thinking_pad
|
||||
and source_msg.get("tool_calls")
|
||||
and isinstance(normalized_reasoning, str)
|
||||
and normalized_reasoning
|
||||
):
|
||||
api_msg["reasoning_content"] = ""
|
||||
return
|
||||
|
||||
# 3. Healthy session: promote 'reasoning' field to 'reasoning_content'
|
||||
# for providers that use the internal 'reasoning' key.
|
||||
# This must happen BEFORE the DeepSeek/Kimi tool-call check so that
|
||||
# genuine reasoning content is not overwritten by the empty-string
|
||||
# fallback (#15812 regression in PR #15478).
|
||||
normalized_reasoning = source_msg.get("reasoning")
|
||||
if isinstance(normalized_reasoning, str) and normalized_reasoning:
|
||||
api_msg["reasoning_content"] = normalized_reasoning
|
||||
return
|
||||
|
||||
# 3. DeepSeek / Kimi thinking mode: tool-call turns that lack
|
||||
# reasoning_content are "poisoned history" — a prior provider (MiniMax,
|
||||
# etc.) left them empty. DeepSeek returns HTTP 400 if reasoning_content
|
||||
# is absent on replay; inject "" to satisfy the provider's requirement
|
||||
# without forwarding any cross-provider reasoning content.
|
||||
needs_empty_reasoning = (
|
||||
source_msg.get("tool_calls")
|
||||
and (
|
||||
self._needs_kimi_tool_reasoning()
|
||||
or self._needs_deepseek_tool_reasoning()
|
||||
)
|
||||
)
|
||||
if needs_empty_reasoning:
|
||||
api_msg["reasoning_content"] = ""
|
||||
return
|
||||
|
||||
# 4. DeepSeek / Kimi thinking mode: all assistant messages need
|
||||
# reasoning_content. Inject "" to satisfy the provider's requirement
|
||||
# when no explicit reasoning content is present.
|
||||
if (
|
||||
self._needs_kimi_tool_reasoning()
|
||||
or self._needs_deepseek_tool_reasoning()
|
||||
):
|
||||
# when no explicit reasoning content is present. Covers both
|
||||
# tool-call turns (already-poisoned history with no reasoning at all)
|
||||
# and plain text turns.
|
||||
if needs_thinking_pad:
|
||||
api_msg["reasoning_content"] = ""
|
||||
return
|
||||
|
||||
@@ -8459,6 +8758,14 @@ class AIAgent:
|
||||
self._current_tool = tool_names_str
|
||||
self._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")
|
||||
|
||||
# Capture CLI callbacks from the agent thread so worker threads can
|
||||
# register them locally. Without this, _get_approval_callback() in
|
||||
# terminal_tool returns None in ThreadPoolExecutor workers, causing
|
||||
# the dangerous-command prompt to fall back to input() — which
|
||||
# deadlocks against prompt_toolkit's raw terminal mode (#13617).
|
||||
_parent_approval_cb = _get_approval_callback()
|
||||
_parent_sudo_cb = _get_sudo_password_callback()
|
||||
|
||||
def _run_tool(index, tool_call, function_name, function_args):
|
||||
"""Worker function executed in a thread."""
|
||||
# Register this worker tid so the agent can fan out an interrupt
|
||||
@@ -8485,6 +8792,18 @@ class AIAgent:
|
||||
set_activity_callback(self._touch_activity)
|
||||
except Exception:
|
||||
pass
|
||||
# Propagate approval/sudo callbacks to this worker thread.
|
||||
# Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr).
|
||||
if _parent_approval_cb is not None:
|
||||
try:
|
||||
_set_approval_callback(_parent_approval_cb)
|
||||
except Exception:
|
||||
pass
|
||||
if _parent_sudo_cb is not None:
|
||||
try:
|
||||
_set_sudo_password_callback(_parent_sudo_cb)
|
||||
except Exception:
|
||||
pass
|
||||
start = time.time()
|
||||
try:
|
||||
result = self._invoke_tool(function_name, function_args, effective_task_id, tool_call.id, messages=messages)
|
||||
@@ -8507,6 +8826,13 @@ class AIAgent:
|
||||
_set_interrupt(False, _worker_tid)
|
||||
except Exception:
|
||||
pass
|
||||
# Clear thread-local callbacks so a recycled worker thread
|
||||
# doesn't hold stale references to a disposed CLI instance.
|
||||
try:
|
||||
_set_approval_callback(None)
|
||||
_set_sudo_password_callback(None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Start spinner for CLI mode (skip when TUI handles tool progress)
|
||||
spinner = None
|
||||
@@ -9881,6 +10207,7 @@ class AIAgent:
|
||||
nous_auth_retry_attempted=False
|
||||
copilot_auth_retry_attempted=False
|
||||
thinking_sig_retry_attempted = False
|
||||
image_shrink_retry_attempted = False
|
||||
has_retried_429 = False
|
||||
restart_with_compressed_messages = False
|
||||
restart_with_length_continuation = False
|
||||
@@ -10802,6 +11129,31 @@ class AIAgent:
|
||||
)
|
||||
if recovered_with_pool:
|
||||
continue
|
||||
|
||||
# Image-too-large recovery: shrink oversized native image
|
||||
# parts in-place and retry once. Triggered by Anthropic's
|
||||
# per-image 5 MB ceiling (400 with "image exceeds 5 MB
|
||||
# maximum") or any other provider that complains about
|
||||
# image size. If shrink fails or a second attempt still
|
||||
# fails, fall through to normal error handling.
|
||||
if (
|
||||
classified.reason == FailoverReason.image_too_large
|
||||
and not image_shrink_retry_attempted
|
||||
):
|
||||
image_shrink_retry_attempted = True
|
||||
if self._try_shrink_image_parts_in_messages(api_messages):
|
||||
self._vprint(
|
||||
f"{self.log_prefix}📐 Image(s) exceeded provider size limit — "
|
||||
f"shrank and retrying...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
else:
|
||||
logger.info(
|
||||
"image-shrink recovery: no data-URL image parts found "
|
||||
"or shrink didn't reduce size; surfacing original error."
|
||||
)
|
||||
|
||||
if (
|
||||
self.api_mode == "codex_responses"
|
||||
and self.provider == "openai-codex"
|
||||
|
||||
@@ -43,6 +43,7 @@ AUTHOR_MAP = {
|
||||
"teknium1@gmail.com": "teknium1",
|
||||
"teknium@nousresearch.com": "teknium1",
|
||||
"127238744+teknium1@users.noreply.github.com": "teknium1",
|
||||
"274096618+hermes-agent-dhabibi@users.noreply.github.com": "dhabibi",
|
||||
"johnnncenaaa77@gmail.com": "johnncenae",
|
||||
"focusflow.app.help@gmail.com": "yes999zc",
|
||||
"343873859@qq.com": "DrStrangerUJN",
|
||||
@@ -53,12 +54,17 @@ AUTHOR_MAP = {
|
||||
"maks.mir@yahoo.com": "say8hi",
|
||||
"web3blind@users.noreply.github.com": "web3blind",
|
||||
"julia@alexland.us": "alexg0bot",
|
||||
"christian@scheid.tech": "scheidti",
|
||||
"1060770+benjaminsehl@users.noreply.github.com": "benjaminsehl",
|
||||
"nerijusn76@gmail.com": "Nerijusas",
|
||||
"itonov@proton.me": "Ito-69",
|
||||
"glesstech@gmail.com": "georgeglessner",
|
||||
"maxim.smetanin@gmail.com": "maxims-oss",
|
||||
"CREWorx@users.noreply.github.com": "BadTechBandit",
|
||||
"yoimexex@gmail.com": "Yoimex",
|
||||
"6548898+romanornr@users.noreply.github.com": "romanornr",
|
||||
"foxion37@gmail.com": "foxion37",
|
||||
"bloodcarter@gmail.com": "bloodcarter",
|
||||
# contributors (from noreply pattern)
|
||||
"david.vv@icloud.com": "davidvv",
|
||||
"wangqiang@wangqiangdeMac-mini.local": "xiaoqiang243",
|
||||
@@ -550,6 +556,7 @@ AUTHOR_MAP = {
|
||||
"chenzeshi@live.com": "chen1749144759",
|
||||
"mor.aleksandr@yahoo.com": "MorAlekss",
|
||||
"ash@users.noreply.github.com": "ash",
|
||||
"andrewho.sf@gmail.com": "andrewhosf",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: apple-notes
|
||||
description: Manage Apple Notes via the memo CLI on macOS (create, view, search, edit).
|
||||
description: "Manage Apple Notes via memo CLI: create, search, edit."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: apple-reminders
|
||||
description: Manage Apple Reminders via remindctl CLI (list, add, complete, delete).
|
||||
description: "Apple Reminders via remindctl: add, list, complete."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: findmy
|
||||
description: Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture.
|
||||
description: "Track Apple devices/AirTags via FindMy.app on macOS."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: claude-code
|
||||
description: Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed.
|
||||
description: "Delegate coding to Claude Code CLI (features, PRs)."
|
||||
version: 2.2.0
|
||||
author: Hermes Agent + Teknium
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: codex
|
||||
description: Delegate coding tasks to OpenAI Codex CLI agent. Use for building features, refactoring, PR reviews, and batch issue fixing. Requires the codex CLI and a git repository.
|
||||
description: "Delegate coding to OpenAI Codex CLI (features, PRs)."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
@@ -14,6 +14,15 @@ metadata:
|
||||
|
||||
Delegate coding tasks to [Codex](https://github.com/openai/codex) via the Hermes terminal. Codex is OpenAI's autonomous coding agent CLI.
|
||||
|
||||
## When to use
|
||||
|
||||
- Building features
|
||||
- Refactoring
|
||||
- PR reviews
|
||||
- Batch issue fixing
|
||||
|
||||
Requires the codex CLI and a git repository.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Codex installed: `npm install -g @openai/codex`
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: hermes-agent
|
||||
description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
|
||||
description: "Configure, extend, or contribute to Hermes Agent."
|
||||
version: 2.0.0
|
||||
author: Hermes Agent + Teknium
|
||||
license: MIT
|
||||
@@ -115,7 +115,7 @@ hermes tools disable NAME Disable a toolset
|
||||
|
||||
hermes skills list List installed skills
|
||||
hermes skills search QUERY Search the skills hub
|
||||
hermes skills install ID Install a skill
|
||||
hermes skills install ID Install a skill (ID can be a hub identifier OR a direct https://…/SKILL.md URL; pass --name to override when frontmatter has no name)
|
||||
hermes skills inspect ID Preview without installing
|
||||
hermes skills config Enable/disable skills per platform
|
||||
hermes skills check Check for updates
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: opencode
|
||||
description: Delegate coding tasks to OpenCode CLI agent for feature implementation, refactoring, PR review, and long-running autonomous sessions. Requires the opencode CLI installed and authenticated.
|
||||
description: "Delegate coding to OpenCode CLI (features, PR review)."
|
||||
version: 1.2.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: architecture-diagram
|
||||
description: Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security, orange=message bus), JetBrains Mono font, grid background. Best suited for software architecture, cloud/VPC topology, microservice maps, service-mesh diagrams, database + API layer diagrams, security groups, message buses — anything that fits a tech-infra deck with a dark aesthetic. If a more specialized diagramming skill exists for the subject (scientific, educational, hand-drawn, animated, etc.), prefer that — otherwise this skill can also serve as a general-purpose SVG diagram fallback. Based on Cocoon AI's architecture-diagram-generator (MIT).
|
||||
description: "Dark-themed SVG architecture/cloud/infra diagrams as HTML."
|
||||
version: 1.0.0
|
||||
author: Cocoon AI (hello@cocoon-ai.com), ported by Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: ascii-art
|
||||
description: Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required.
|
||||
description: "ASCII art: pyfiglet, cowsay, boxes, image-to-ascii."
|
||||
version: 4.0.0
|
||||
author: 0xbyt4, Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
---
|
||||
name: ascii-video
|
||||
description: "Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering. Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output."
|
||||
description: "ASCII video: convert video/audio to colored ASCII MP4/GIF."
|
||||
---
|
||||
|
||||
# ASCII Video Production Pipeline
|
||||
|
||||
## When to use
|
||||
|
||||
Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output.
|
||||
|
||||
## What's inside
|
||||
|
||||
Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering.
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is visual art. ASCII characters are the medium; cinema is the standard.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: baoyu-comic
|
||||
description: Knowledge comic creator supporting multiple art styles and tones. Creates original educational comics with detailed panel layouts and sequential image generation. Use when user asks to create "知识漫画", "教育漫画", "biography comic", "tutorial comic", or "Logicomix-style comic".
|
||||
description: "Knowledge comics (知识漫画): educational, biography, tutorial."
|
||||
version: 1.56.1
|
||||
author: 宝玉 (JimLiu)
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: baoyu-infographic
|
||||
description: Generate professional infographics with 21 layout types and 21 visual styles. Analyzes content, recommends layout×style combinations, and generates publication-ready infographics. Use when user asks to create "infographic", "visual summary", "信息图", "可视化", or "高密度信息大图".
|
||||
description: "Infographics: 21 layouts x 21 styles (信息图, 可视化)."
|
||||
version: 1.56.1
|
||||
author: 宝玉 (JimLiu)
|
||||
license: MIT
|
||||
|
||||
@@ -0,0 +1,590 @@
|
||||
---
|
||||
name: claude-design
|
||||
description: Design one-off HTML artifacts (landing, deck, prototype).
|
||||
version: 1.0.0
|
||||
author: BadTechBandit
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [design, html, prototype, ux, ui, creative, artifact, deck, motion, design-system]
|
||||
related_skills: [design-md, popular-web-designs, excalidraw, architecture-diagram]
|
||||
---
|
||||
|
||||
# Claude Design for CLI/API Agents
|
||||
|
||||
Use this skill when the user asks for design work that would normally fit Claude Design, but the agent is running in a CLI/API environment instead of the hosted Claude Design web UI.
|
||||
|
||||
The goal is to preserve Claude Design's useful design behavior and taste while removing hosted-tool plumbing that does not exist in normal agent environments.
|
||||
|
||||
**Before starting, check for other web-design skills like `popular-web-designs` (ready-to-paste design systems for Stripe, Linear, Vercel, Notion, etc.) and `design-md` (Google's DESIGN.md token spec format).** If the user wants a known brand's look, load `popular-web-designs` alongside this one and let it supply the visual vocabulary. If the deliverable is a token spec file rather than a rendered artifact, use `design-md` instead. Full decision table below.
|
||||
|
||||
## When To Use This Skill vs `popular-web-designs` vs `design-md`
|
||||
|
||||
Hermes has three design-related skills under `skills/creative/`. They do different jobs — load the right one (or combine them):
|
||||
|
||||
| Skill | What it gives you | Use when the user wants... |
|
||||
|---|---|---|
|
||||
| **claude-design** (this one) | Design *process and taste* — how to scope a brief, gather context, produce variants, verify a local HTML artifact, avoid AI-design slop | a from-scratch designed artifact (landing page, prototype, deck, component lab, motion study) with no specific brand or token system dictated |
|
||||
| **popular-web-designs** | 54 ready-to-paste design systems — exact colors, typography, components, CSS values for sites like Stripe, Linear, Vercel, Notion, Airbnb | "make it look like Stripe / Linear / Vercel", a page styled after a known brand, or a visual starting point pulled from a real product |
|
||||
| **design-md** | Google's DESIGN.md spec format — author/validate/diff/export design-token files, WCAG contrast checking, Tailwind/DTCG export | a formal, persistent, machine-readable design-system *spec file* (tokens + rationale) that lives in a repo and gets consumed by agents over time |
|
||||
|
||||
Rule of thumb:
|
||||
|
||||
- **Process + taste, one-off artifact** → claude-design
|
||||
- **Match a known brand's look** → popular-web-designs (and let claude-design drive the process)
|
||||
- **Author the tokens spec itself** → design-md
|
||||
|
||||
These compose: use `popular-web-designs` for the visual vocabulary, `claude-design` for how to turn a brief into a thoughtful local HTML file, and `design-md` when the output is the token file rather than a rendered artifact.
|
||||
|
||||
## Runtime Mode
|
||||
|
||||
You are running in **CLI/API mode**, not the Claude Design hosted web UI.
|
||||
|
||||
Ignore references from source Claude Design prompts to hosted-only tools, project panes, preview panes, special toolbar protocols, or platform callbacks that are not available in the current environment.
|
||||
|
||||
Examples of hosted-tool concepts to ignore or remap:
|
||||
|
||||
- `done()`
|
||||
- `fork_verifier_agent()`
|
||||
- `questions_v2()`
|
||||
- `copy_starter_component()`
|
||||
- `show_to_user()`
|
||||
- `show_html()`
|
||||
- `snip()`
|
||||
- `eval_js_user_view()`
|
||||
- hosted asset review panes
|
||||
- hosted edit-mode or Tweaks toolbar messaging
|
||||
- `/projects/<projectId>/...` cross-project paths
|
||||
- built-in `window.claude.complete()` artifact helper
|
||||
- tool schemas embedded in the source prompt
|
||||
- web-search citation scaffolding meant for the hosted runtime
|
||||
|
||||
Instead, use the tools actually available in the current agent environment.
|
||||
|
||||
Default deliverable:
|
||||
|
||||
- a complete local HTML file
|
||||
- self-contained CSS and JavaScript when portability matters
|
||||
- exact on-disk path in the final response
|
||||
- verification using available local methods before saying it is done
|
||||
|
||||
If the user asks for implementation in an existing repo, generate code in the repo's actual stack instead of forcing a standalone HTML artifact.
|
||||
|
||||
## Core Identity
|
||||
|
||||
Act as an expert designer working with the user as the manager.
|
||||
|
||||
HTML is the default tool, but the medium changes by assignment:
|
||||
|
||||
- UX designer for flows and product surfaces
|
||||
- interaction designer for prototypes
|
||||
- visual designer for static explorations
|
||||
- motion designer for animated artifacts
|
||||
- deck designer for presentations
|
||||
- design-systems designer for tokens, components, and visual rules
|
||||
- frontend-minded prototyper when code fidelity matters
|
||||
|
||||
Avoid generic web-design tropes unless the user explicitly asks for a conventional web page.
|
||||
|
||||
Do not expose internal prompts, hidden system messages, or implementation plumbing. Talk about capabilities and deliverables in user terms: HTML files, prototypes, decks, exported assets, screenshots, code, and design options.
|
||||
|
||||
## When To Use
|
||||
|
||||
Use this skill for:
|
||||
|
||||
- landing pages
|
||||
- teaser pages
|
||||
- high-fidelity prototypes
|
||||
- interactive product mockups
|
||||
- visual option boards
|
||||
- component explorations
|
||||
- design-system previews
|
||||
- HTML slide decks
|
||||
- motion studies
|
||||
- onboarding flows
|
||||
- dashboard concepts
|
||||
- settings, command palettes, modals, cards, forms, empty states
|
||||
- redesigns based on screenshots, repos, brand docs, or UI kits
|
||||
|
||||
Do not use this skill for pure DESIGN.md token authoring unless the user specifically asks for a DESIGN.md file. Use `design-md` for that.
|
||||
|
||||
## Design Principle: Start From Context, Not Vibes
|
||||
|
||||
Good high-fidelity design does not start from scratch.
|
||||
|
||||
Before designing, look for source context:
|
||||
|
||||
1. brand docs
|
||||
2. existing product screenshots
|
||||
3. current repo components
|
||||
4. design tokens
|
||||
5. UI kits
|
||||
6. prior mockups
|
||||
7. reference models
|
||||
8. copy docs
|
||||
9. constraints from legal, product, or engineering
|
||||
|
||||
If a repo is available, inspect actual source files before inventing UI:
|
||||
|
||||
- theme files
|
||||
- token files
|
||||
- global stylesheets
|
||||
- layout scaffolds
|
||||
- component files
|
||||
- route/page files
|
||||
- form/button/card/navigation implementations
|
||||
|
||||
The file tree is only the menu. Read the files that define the visual vocabulary before designing.
|
||||
|
||||
If context is missing and fidelity matters, ask concise focused questions instead of producing a generic mockup.
|
||||
|
||||
## Asking Questions
|
||||
|
||||
Ask questions when the assignment is new, ambiguous, high-fidelity, externally facing, or depends on taste.
|
||||
|
||||
Keep questions short. Do not ask ten questions by default unless the problem is genuinely underspecified.
|
||||
|
||||
Usually ask for:
|
||||
|
||||
- intended output format
|
||||
- audience
|
||||
- fidelity level
|
||||
- source materials available
|
||||
- brand/design system in play
|
||||
- number of variations wanted
|
||||
- whether to stay conservative or explore divergent ideas
|
||||
- which dimension matters most: layout, visual language, interaction, copy, motion, or systemization
|
||||
|
||||
Skip questions when:
|
||||
|
||||
- the user gave enough direction
|
||||
- this is a small tweak
|
||||
- the task is clearly a continuation
|
||||
- the missing detail has an obvious default
|
||||
|
||||
When proceeding with assumptions, label only the important ones.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Understand the brief**
|
||||
- What is being designed?
|
||||
- Who is it for?
|
||||
- What artifact should exist at the end?
|
||||
- What constraints are locked?
|
||||
|
||||
2. **Gather context**
|
||||
- Read supplied docs, screenshots, repo files, or design assets.
|
||||
- Identify the visual vocabulary before writing code.
|
||||
|
||||
3. **Define the design system for this artifact**
|
||||
- colors
|
||||
- type
|
||||
- spacing
|
||||
- radii
|
||||
- shadows or elevation
|
||||
- motion posture
|
||||
- component treatment
|
||||
- interaction rules
|
||||
|
||||
4. **Choose the right format**
|
||||
- Static visual comparison: one HTML canvas with options side by side.
|
||||
- Interaction/flow: clickable prototype.
|
||||
- Presentation: fixed-size HTML deck with slide navigation.
|
||||
- Component exploration: component lab with variants.
|
||||
- Motion: timeline or state-based animation.
|
||||
|
||||
5. **Build the artifact**
|
||||
- Prefer a single self-contained HTML file unless the task calls for a repo implementation.
|
||||
- Preserve prior versions for major revisions.
|
||||
- Avoid unnecessary dependencies.
|
||||
|
||||
6. **Verify**
|
||||
- Confirm files exist.
|
||||
- Run any available syntax/static checks.
|
||||
- If browser tools are available, open the file and check console errors.
|
||||
- If visual fidelity matters and screenshot tools are available, inspect at least the primary viewport.
|
||||
|
||||
7. **Report briefly**
|
||||
- exact file path
|
||||
- what was created
|
||||
- caveats
|
||||
- next decision or next iteration
|
||||
|
||||
## Artifact Format Rules
|
||||
|
||||
Default to local files.
|
||||
|
||||
For standalone artifacts:
|
||||
|
||||
- create a descriptive filename, e.g. `Landing Page.html`, `Command Palette Prototype.html`, `Design System Board.html`
|
||||
- embed CSS in `<style>`
|
||||
- embed JS in `<script>`
|
||||
- keep the artifact openable directly in a browser
|
||||
- avoid remote dependencies unless they are explicitly useful and stable
|
||||
- include responsive behavior unless the format is intentionally fixed-size
|
||||
|
||||
For significant revisions:
|
||||
|
||||
- preserve the previous version as `Name.html`
|
||||
- create `Name v2.html`, `Name v3.html`, etc.
|
||||
- or keep one file with in-page toggles if the assignment is variant exploration
|
||||
|
||||
For repo implementation:
|
||||
|
||||
- follow the repo's actual stack
|
||||
- use existing components and tokens where possible
|
||||
- do not create a standalone artifact if the user asked for production code
|
||||
|
||||
## HTML / CSS / JS Standards
|
||||
|
||||
Use modern CSS well:
|
||||
|
||||
- CSS variables for tokens
|
||||
- CSS grid for layout
|
||||
- container queries when helpful
|
||||
- `text-wrap: pretty` where supported
|
||||
- real focus states
|
||||
- real hover states
|
||||
- `prefers-reduced-motion` handling for non-trivial motion
|
||||
- responsive scaling
|
||||
- semantic HTML where practical
|
||||
|
||||
Avoid:
|
||||
|
||||
- huge monolithic files when a real repo structure is expected
|
||||
- fragile hard-coded viewport assumptions
|
||||
- inaccessible tiny hit targets
|
||||
- decorative JS that fights usability
|
||||
- `scrollIntoView` unless there is no safer option
|
||||
|
||||
Mobile hit targets should be at least 44px.
|
||||
|
||||
For print documents, text should be at least 12pt.
|
||||
|
||||
For 1920×1080 slide decks, text should generally be 24px or larger.
|
||||
|
||||
## React Guidance for Standalone HTML
|
||||
|
||||
Use plain HTML/CSS/JS by default.
|
||||
|
||||
Use React only when:
|
||||
|
||||
- the artifact needs meaningful state
|
||||
- variants/toggles are easier as components
|
||||
- interaction complexity warrants it
|
||||
- the target implementation is React/Next.js and fidelity matters
|
||||
|
||||
If using React from CDN in standalone HTML:
|
||||
|
||||
- pin exact versions
|
||||
- avoid unpinned `react@18` style URLs
|
||||
- avoid `type="module"` unless necessary
|
||||
- avoid multiple global objects named `styles`
|
||||
- give global style objects specific names, e.g. `commandPaletteStyles`, `deckStyles`
|
||||
- if splitting Babel scripts, explicitly attach shared components to `window`
|
||||
|
||||
If building inside a real repo, use the repo's package manager and component architecture instead.
|
||||
|
||||
## Deck Rules
|
||||
|
||||
For slide decks, use a fixed-size canvas and scale it to fit the viewport.
|
||||
|
||||
Default slide size: 1920×1080, 16:9.
|
||||
|
||||
Requirements:
|
||||
|
||||
- keyboard navigation
|
||||
- visible slide count
|
||||
- localStorage persistence for current slide
|
||||
- print-friendly layout when practical
|
||||
- screen labels or stable IDs for important slides
|
||||
- no speaker notes unless the user explicitly asks
|
||||
|
||||
Do not hand-wave a deck as markdown bullets. Create a designed artifact if asked for a deck.
|
||||
|
||||
Use 1–2 background colors max unless the brand system requires more.
|
||||
|
||||
Keep slides sparse. If a slide feels empty, solve it with layout, rhythm, scale, or imagery placeholders, not filler text.
|
||||
|
||||
## Prototype Rules
|
||||
|
||||
For interactive prototypes:
|
||||
|
||||
- make the primary path clickable
|
||||
- include key states: default, hover/focus, loading, empty, error, success where relevant
|
||||
- expose variations with in-page controls when useful
|
||||
- keep controls out of the final composition unless they are intentionally part of the prototype
|
||||
- persist important state in localStorage when refresh continuity matters
|
||||
|
||||
If the prototype is meant to model a product flow, design the flow, not just the first screen.
|
||||
|
||||
## Variation Rules
|
||||
|
||||
When exploring, default to at least three options:
|
||||
|
||||
1. **Conservative** — closest to existing patterns / lowest risk
|
||||
2. **Strong-fit** — best interpretation of the brief
|
||||
3. **Divergent** — more novel, useful for discovering taste boundaries
|
||||
|
||||
Variations can explore:
|
||||
|
||||
- layout
|
||||
- hierarchy
|
||||
- type scale
|
||||
- density
|
||||
- color posture
|
||||
- surface treatment
|
||||
- motion
|
||||
- interaction model
|
||||
- copy structure
|
||||
- component shape
|
||||
|
||||
Do not create variations that are merely color swaps unless color is the actual question.
|
||||
|
||||
When the user picks a direction, consolidate. Do not leave the project as a pile of options forever.
|
||||
|
||||
## Tweakable Designs in CLI/API Mode
|
||||
|
||||
The hosted Claude Design edit-mode toolbar does not exist here.
|
||||
|
||||
Still preserve the idea: when useful, add in-page controls called `Tweaks`.
|
||||
|
||||
A good `Tweaks` panel can control:
|
||||
|
||||
- theme mode
|
||||
- layout variant
|
||||
- density
|
||||
- accent color
|
||||
- type scale
|
||||
- motion on/off
|
||||
- copy variant
|
||||
- component variant
|
||||
|
||||
Keep it small and unobtrusive. The design should look final when tweaks are hidden.
|
||||
|
||||
Persist tweak values with localStorage when helpful.
|
||||
|
||||
## Content Discipline
|
||||
|
||||
Do not add filler content.
|
||||
|
||||
Every element must earn its place.
|
||||
|
||||
Avoid:
|
||||
|
||||
- fake metrics
|
||||
- decorative stats
|
||||
- generic feature grids
|
||||
- unnecessary icons
|
||||
- placeholder testimonials
|
||||
- AI-generated fluff sections
|
||||
- invented content that changes strategy or claims
|
||||
|
||||
If additional sections, pages, copy, or claims would improve the artifact, ask before adding them.
|
||||
|
||||
When copy is necessary but not final, mark it as draft or placeholder.
|
||||
|
||||
## Anti-Slop Rules
|
||||
|
||||
Avoid common AI design sludge:
|
||||
|
||||
- aggressive gradient backgrounds
|
||||
- glassmorphism by default
|
||||
- emoji unless the brand uses them
|
||||
- generic SaaS cards with icons everywhere
|
||||
- left-border accent callout cards
|
||||
- fake dashboards filled with arbitrary numbers
|
||||
- stock-photo hero sections
|
||||
- oversized rounded rectangles as a substitute for hierarchy
|
||||
- rainbow palettes
|
||||
- vague labels like “Insights,” “Growth,” “Scale,” “Optimize” without content
|
||||
- decorative SVG illustrations pretending to be product imagery
|
||||
|
||||
Minimal is not automatically good. Dense is not automatically cluttered. Choose intentionally.
|
||||
|
||||
## Typography
|
||||
|
||||
Use the existing type system if one exists.
|
||||
|
||||
If not, choose type deliberately based on the artifact:
|
||||
|
||||
- editorial: serif or humanist headline with restrained sans body
|
||||
- software/productivity: precise sans with strong numeric treatment
|
||||
- luxury/minimal: fewer weights, more spacing discipline
|
||||
- technical: mono accents only, not mono everywhere
|
||||
- deck: large, clear, high contrast
|
||||
|
||||
Avoid overused defaults when a stronger choice is appropriate.
|
||||
|
||||
If using web fonts, keep the number of families and weights low.
|
||||
|
||||
Use type as hierarchy before adding boxes, icons, or color.
|
||||
|
||||
## Color
|
||||
|
||||
Use brand/design-system colors first.
|
||||
|
||||
If no palette exists:
|
||||
|
||||
- define a small system
|
||||
- include neutrals, surface, ink, muted text, border, accent, danger/success if needed
|
||||
- use one primary accent unless the assignment calls for a broader palette
|
||||
- prefer oklch for harmonious invented palettes when browser support is acceptable
|
||||
- check contrast for important text and controls
|
||||
|
||||
Do not invent lots of colors from scratch.
|
||||
|
||||
## Layout and Composition
|
||||
|
||||
Design with rhythm:
|
||||
|
||||
- scale
|
||||
- whitespace
|
||||
- density
|
||||
- alignment
|
||||
- repetition
|
||||
- contrast
|
||||
- interruption
|
||||
|
||||
Avoid making every section the same card grid.
|
||||
|
||||
For product UIs, prioritize speed of comprehension over decoration.
|
||||
|
||||
For marketing surfaces, make one idea land per section.
|
||||
|
||||
For dashboards, avoid “data slop.” Only show data that helps the user decide or act.
|
||||
|
||||
## Motion
|
||||
|
||||
Use motion as discipline, not theater.
|
||||
|
||||
Good motion:
|
||||
|
||||
- clarifies state changes
|
||||
- reduces anxiety during loading
|
||||
- shows continuity between surfaces
|
||||
- gives controls tactility
|
||||
- stays subtle
|
||||
|
||||
Bad motion:
|
||||
|
||||
- loops without purpose
|
||||
- delays the user
|
||||
- calls attention to itself
|
||||
- hides poor hierarchy
|
||||
|
||||
Respect `prefers-reduced-motion` for non-trivial animation.
|
||||
|
||||
## Images and Icons
|
||||
|
||||
Use real supplied imagery when available.
|
||||
|
||||
If an asset is missing:
|
||||
|
||||
- use a clean placeholder
|
||||
- use typography, layout, or abstract texture instead
|
||||
- ask for real material when fidelity matters
|
||||
|
||||
Do not draw elaborate fake SVG illustrations unless the assignment is explicitly illustration work.
|
||||
|
||||
Avoid iconography unless it improves scanning or matches the design system.
|
||||
|
||||
## Source-Code Fidelity
|
||||
|
||||
When recreating or extending a UI from a repo:
|
||||
|
||||
1. inspect the repo tree
|
||||
2. identify the actual UI source files
|
||||
3. read theme/token/global style/component files
|
||||
4. lift exact values where appropriate
|
||||
5. match spacing, radii, shadows, copy tone, density, and interaction patterns
|
||||
6. only then design or modify
|
||||
|
||||
Do not build from memory when source files are available.
|
||||
|
||||
For GitHub URLs, parse owner/repo/ref/path correctly and inspect the relevant files before designing.
|
||||
|
||||
## Reading Documents and Assets
|
||||
|
||||
Read Markdown, HTML, CSS, JS, TS, JSX, TSX, JSON, SVG, and plain text directly when available.
|
||||
|
||||
For DOCX/PPTX/PDF, use available local extraction tools if present. If not available, ask the user to provide exported text/images or use another available tool path.
|
||||
|
||||
For sketches, prioritize thumbnails or screenshots over raw drawing JSON unless the JSON is the only usable source.
|
||||
|
||||
## Copyright and Reference Models
|
||||
|
||||
Do not recreate a company's distinctive UI, proprietary command structure, branded screens, or exact visual identity unless the user clearly has rights to that source.
|
||||
|
||||
It is acceptable to extract general design principles:
|
||||
|
||||
- density without clutter
|
||||
- command-first interaction
|
||||
- monochrome with one accent
|
||||
- editorial hierarchy
|
||||
- clear empty states
|
||||
- strong keyboard affordances
|
||||
|
||||
It is not acceptable to clone proprietary layouts, copy exact branded surfaces, or reproduce copyrighted content.
|
||||
|
||||
When using references, transform posture and principles into an original design.
|
||||
|
||||
## Verification
|
||||
|
||||
Before final response, verify as much as the environment allows.
|
||||
|
||||
Minimum:
|
||||
|
||||
- file exists at the stated path
|
||||
- HTML is saved completely
|
||||
- obvious syntax issues are checked
|
||||
|
||||
Better:
|
||||
|
||||
- open in a browser tool and check console errors
|
||||
- inspect screenshots at the primary viewport
|
||||
- test key interactions
|
||||
- test light/dark or variants if present
|
||||
- test responsive breakpoints if relevant
|
||||
|
||||
If verification is limited by environment, say exactly what was and was not verified.
|
||||
|
||||
Never say “done” if the file was not actually written.
|
||||
|
||||
## Final Response Format
|
||||
|
||||
Keep final responses short.
|
||||
|
||||
Include:
|
||||
|
||||
- artifact path
|
||||
- what it contains
|
||||
- verification status
|
||||
- next suggested action, if useful
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
Created: /path/to/Prototype.html
|
||||
It includes 3 layout variants, a Tweaks panel for density/theme, and responsive behavior.
|
||||
Verified: file exists and opened cleanly in browser, no console errors.
|
||||
Next: pick the strongest direction and I’ll tighten copy + motion.
|
||||
```
|
||||
|
||||
## Portable Opening Prompt Pattern
|
||||
|
||||
When adapting a Claude Design style request into CLI/API mode, use this mental translation:
|
||||
|
||||
```text
|
||||
You are running in CLI/API mode, not hosted Claude Design. Ignore references to hosted-only tools or preview panes. Produce complete local design artifacts, usually self-contained HTML with embedded CSS/JS, and verify with available local tools before returning. Preserve the design process: gather context, define the system, produce options, avoid filler, and meet a high visual bar.
|
||||
```
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- Do not paste hosted tool schemas into a skill. They cause fake tool calls.
|
||||
- Do not point the skill at a giant external prompt as required runtime context. That creates drift.
|
||||
- Do not strip the design doctrine while removing tool plumbing.
|
||||
- Do not over-ask when the user already gave enough direction.
|
||||
- Do not under-ask for high-fidelity work with no brand context.
|
||||
- Do not produce generic SaaS layouts and call them designed.
|
||||
- Do not claim browser verification unless it actually happened.
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
name: ideation
|
||||
title: Creative Ideation — Constraint-Driven Project Generation
|
||||
description: "Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made."
|
||||
description: "Generate project ideas via creative constraints."
|
||||
version: 1.0.0
|
||||
author: SHL0MS
|
||||
license: MIT
|
||||
@@ -14,6 +14,10 @@ metadata:
|
||||
|
||||
# Creative Ideation
|
||||
|
||||
## When to use
|
||||
|
||||
Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made.
|
||||
|
||||
Generate project ideas through creative constraints. Constraint + direction = creativity.
|
||||
|
||||
## How It Works
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
---
|
||||
name: design-md
|
||||
description: Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system, porting style rules between projects, generating UI with consistent brand, or auditing accessibility/contrast.
|
||||
description: Author/validate/export Google's DESIGN.md token spec files.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [design, design-system, tokens, ui, accessibility, wcag, tailwind, dtcg, google]
|
||||
related_skills: [popular-web-designs, excalidraw, architecture-diagram]
|
||||
related_skills: [popular-web-designs, claude-design, excalidraw, architecture-diagram]
|
||||
---
|
||||
|
||||
# DESIGN.md Skill
|
||||
@@ -31,7 +31,9 @@ diffs versions for regressions, and exports to Tailwind or W3C DTCG JSON.
|
||||
- User wants contrast / WCAG accessibility validation on their color palette
|
||||
|
||||
For purely visual inspiration or layout examples, use `popular-web-designs`
|
||||
instead. This skill is for the *formal spec file* itself.
|
||||
instead. For *process and taste* when designing a one-off HTML artifact
|
||||
from scratch (prototype, deck, landing page, component lab), use
|
||||
`claude-design`. This skill is for the *formal spec file* itself.
|
||||
|
||||
## File anatomy
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: excalidraw
|
||||
description: Create hand-drawn style diagrams using Excalidraw JSON format. Generate .excalidraw files for architecture diagrams, flowcharts, sequence diagrams, concept maps, and more. Files can be opened at excalidraw.com or uploaded for shareable links.
|
||||
description: "Hand-drawn Excalidraw JSON diagrams (arch, flow, seq)."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
@@ -16,6 +16,10 @@ metadata:
|
||||
|
||||
Create diagrams by writing standard Excalidraw element JSON and saving as `.excalidraw` files. These files can be drag-and-dropped onto [excalidraw.com](https://excalidraw.com) for viewing and editing. No accounts, no API keys, no rendering libraries -- just JSON.
|
||||
|
||||
## When to use
|
||||
|
||||
Generate `.excalidraw` files for architecture diagrams, flowcharts, sequence diagrams, concept maps, and more. Files can be opened at excalidraw.com or uploaded for shareable links.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Load this skill** (you already did)
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
---
|
||||
name: manim-video
|
||||
description: "Production pipeline for mathematical and technical animations using Manim Community Edition. Creates 3Blue1Brown-style explainer videos, algorithm visualizations, equation derivations, architecture diagrams, and data stories. Use when users request: animated explanations, math animations, concept visualizations, algorithm walkthroughs, technical explainers, 3Blue1Brown style videos, or any programmatic animation with geometric/mathematical content."
|
||||
description: "Manim CE animations: 3Blue1Brown math/algo videos."
|
||||
version: 1.0.0
|
||||
---
|
||||
|
||||
# Manim Video Production Pipeline
|
||||
|
||||
## When to use
|
||||
|
||||
Use when users request: animated explanations, math animations, concept visualizations, algorithm walkthroughs, technical explainers, 3Blue1Brown style videos, or any programmatic animation with geometric/mathematical content. Creates 3Blue1Brown-style explainer videos, algorithm visualizations, equation derivations, architecture diagrams, and data stories using Manim Community Edition.
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is educational cinema. Every frame teaches. Every animation reveals structure.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: p5js
|
||||
description: "Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export. Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project."
|
||||
description: "p5.js sketches: gen art, shaders, interactive, 3D."
|
||||
version: 1.0.0
|
||||
metadata:
|
||||
hermes:
|
||||
@@ -10,6 +10,14 @@ metadata:
|
||||
|
||||
# p5.js Production Pipeline
|
||||
|
||||
## When to use
|
||||
|
||||
Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project.
|
||||
|
||||
## What's inside
|
||||
|
||||
Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export.
|
||||
|
||||
## Creative Standard
|
||||
|
||||
This is visual art rendered in the browser. The canvas is the medium; the algorithm is the brush.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: pixel-art
|
||||
description: Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc.), and animate them into short videos. Presets cover arcade, SNES, and 10+ era-correct looks. Use `clarify` to let the user pick a style before generating.
|
||||
description: "Pixel art w/ era palettes (NES, Game Boy, PICO-8)."
|
||||
version: 2.0.0
|
||||
author: dodo-reach
|
||||
license: MIT
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
---
|
||||
name: popular-web-designs
|
||||
description: >
|
||||
54 production-quality design systems extracted from real websites. Load a template
|
||||
to generate HTML/CSS that matches the visual identity of sites like Stripe, Linear,
|
||||
Vercel, Notion, Airbnb, and more. Each template includes colors, typography, components,
|
||||
layout rules, and ready-to-use CSS values.
|
||||
description: 54 real design systems (Stripe, Linear, Vercel) as HTML/CSS.
|
||||
version: 1.0.0
|
||||
author: Hermes Agent + Teknium (design systems sourced from VoltAgent/awesome-design-md)
|
||||
license: MIT
|
||||
@@ -27,6 +23,16 @@ triggers:
|
||||
site's complete visual language: color palette, typography hierarchy, component styles, spacing
|
||||
system, shadows, responsive behavior, and practical agent prompts with exact CSS values.
|
||||
|
||||
## Related design skills
|
||||
|
||||
- **`claude-design`** — use for the design *process and taste* (scoping a brief,
|
||||
producing variants, verifying a local HTML artifact, avoiding AI-design slop).
|
||||
Pair it with this skill when the user wants a thoughtfully-designed page styled
|
||||
after a known brand: `claude-design` drives the workflow, this skill supplies
|
||||
the visual vocabulary.
|
||||
- **`design-md`** — use when the deliverable is a formal DESIGN.md token spec
|
||||
file, not a rendered artifact.
|
||||
|
||||
## How to Use
|
||||
|
||||
1. Pick a design from the catalog below
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
---
|
||||
name: songwriting-and-ai-music
|
||||
description: >
|
||||
Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation
|
||||
techniques, phonetic tricks, and lessons learned. These are tools and ideas,
|
||||
not rules. Break any of them when the art calls for it.
|
||||
description: "Songwriting craft and Suno AI music prompts."
|
||||
tags: [songwriting, music, suno, parody, lyrics, creative]
|
||||
triggers:
|
||||
- writing a song
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
---
|
||||
name: jupyter-live-kernel
|
||||
description: >
|
||||
Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb.
|
||||
Load this skill when the task involves exploration, iteration, or inspecting
|
||||
intermediate results — data science, ML experimentation, API exploration, or
|
||||
building up complex code step-by-step. Uses terminal to run CLI commands against
|
||||
a live Jupyter kernel. No new tools required.
|
||||
description: "Iterative Python via live Jupyter kernel (hamelnb)."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: webhook-subscriptions
|
||||
description: Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost). Use when the user wants external services to trigger agent runs OR push notifications to chats.
|
||||
description: "Webhook subscriptions: event-driven agent runs."
|
||||
version: 1.1.0
|
||||
metadata:
|
||||
hermes:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: dogfood
|
||||
description: Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports
|
||||
description: "Exploratory QA of web apps: find bugs, evidence, reports."
|
||||
version: 1.0.0
|
||||
metadata:
|
||||
hermes:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: himalaya
|
||||
description: CLI to manage emails via IMAP/SMTP. Use himalaya to list, read, write, reply, forward, search, and organize emails from the terminal. Supports multiple accounts and message composition with MML (MIME Meta Language).
|
||||
description: "Himalaya CLI: IMAP/SMTP email from terminal."
|
||||
version: 1.0.0
|
||||
author: community
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: minecraft-modpack-server
|
||||
description: Set up a modded Minecraft server from a CurseForge/Modrinth server pack zip. Covers NeoForge/Forge install, Java version, JVM tuning, firewall, LAN config, backups, and launch scripts.
|
||||
description: "Host modded Minecraft servers (CurseForge, Modrinth)."
|
||||
tags: [minecraft, gaming, server, neoforge, forge, modpack]
|
||||
---
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: pokemon-player
|
||||
description: Play Pokemon games autonomously via headless emulation. Starts a game server, reads structured game state from RAM, makes strategic decisions, and sends button inputs — all from the terminal.
|
||||
description: "Play Pokemon via headless emulator + RAM reads."
|
||||
tags: [gaming, pokemon, emulator, pyboy, gameplay, gameboy]
|
||||
---
|
||||
# Pokemon Player
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: codebase-inspection
|
||||
description: Inspect and analyze codebases using pygount for LOC counting, language breakdown, and code-vs-comment ratios. Use when asked to check lines of code, repo size, language composition, or codebase stats.
|
||||
description: "Inspect codebases w/ pygount: LOC, languages, ratios."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: github-auth
|
||||
description: Set up GitHub authentication for the agent using git (universally available) or the gh CLI. Covers HTTPS tokens, SSH keys, credential helpers, and gh auth — with a detection flow to pick the right method automatically.
|
||||
description: "GitHub auth setup: HTTPS tokens, SSH keys, gh CLI login."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: github-code-review
|
||||
description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
|
||||
description: "Review PRs: diffs, inline comments via gh or REST."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: github-issues
|
||||
description: Create, manage, triage, and close GitHub issues. Search existing issues, add labels, assign people, and link to PRs. Works with gh CLI or falls back to git + GitHub REST API via curl.
|
||||
description: "Create, triage, label, assign GitHub issues via gh or REST."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: github-pr-workflow
|
||||
description: Full pull request lifecycle — create branches, commit changes, open PRs, monitor CI status, auto-fix failures, and merge. Works with gh CLI or falls back to git + GitHub REST API via curl.
|
||||
description: "GitHub PR lifecycle: branch, commit, open, CI, merge."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: github-repo-management
|
||||
description: Clone, create, fork, configure, and manage GitHub repositories. Manage remotes, secrets, releases, and workflows. Works with gh CLI or falls back to git + GitHub REST API via curl.
|
||||
description: "Clone/create/fork repos; manage remotes, releases."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: native-mcp
|
||||
description: Built-in MCP (Model Context Protocol) client that connects to external MCP servers, discovers their tools, and registers them as native Hermes Agent tools. Supports stdio and HTTP transports with automatic reconnection, security filtering, and zero-config tool injection.
|
||||
description: "MCP client: connect servers, register tools (stdio/HTTP)."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: gif-search
|
||||
description: Search and download GIFs from Tenor using curl. No dependencies beyond curl and jq. Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
|
||||
description: "Search/download GIFs from Tenor via curl + jq."
|
||||
version: 1.1.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
@@ -16,6 +16,10 @@ metadata:
|
||||
|
||||
Search and download GIFs directly via the Tenor API using curl. No extra tools needed.
|
||||
|
||||
## When to use
|
||||
|
||||
Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
|
||||
|
||||
## Setup
|
||||
|
||||
Set your Tenor API key in your environment (add to `~/.hermes/.env`):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: heartmula
|
||||
description: Set up and run HeartMuLa, the open-source music generation model family (Suno-like). Generates full songs from lyrics + tags with multilingual support.
|
||||
description: "HeartMuLa: Suno-like song generation from lyrics + tags."
|
||||
version: 1.0.0
|
||||
metadata:
|
||||
hermes:
|
||||
@@ -11,7 +11,7 @@ metadata:
|
||||
# HeartMuLa - Open-Source Music Generation
|
||||
|
||||
## Overview
|
||||
HeartMuLa is a family of open-source music foundation models (Apache-2.0) that generates music conditioned on lyrics and tags. Comparable to Suno for open-source. Includes:
|
||||
HeartMuLa is a family of open-source music foundation models (Apache-2.0) that generates music conditioned on lyrics and tags, with multilingual support. Generates full songs from lyrics + tags. Comparable to Suno for open-source. Includes:
|
||||
- **HeartMuLa** - Music language model (3B/7B) for generation from lyrics + tags
|
||||
- **HeartCodec** - 12.5Hz music codec for high-fidelity audio reconstruction
|
||||
- **HeartTranscriptor** - Whisper-based lyrics transcription
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: songsee
|
||||
description: Generate spectrograms and audio feature visualizations (mel, chroma, MFCC, tempogram, etc.) from audio files via CLI. Useful for audio analysis, music production debugging, and visual documentation.
|
||||
description: "Audio spectrograms/features (mel, chroma, MFCC) via CLI."
|
||||
version: 1.0.0
|
||||
author: community
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: spotify
|
||||
description: Control Spotify — play music, search the catalog, manage playlists and library, inspect devices and playback state. Loads when the user asks to play/pause/queue music, search tracks/albums/artists, manage playlists, or check what's playing. Assumes the Hermes Spotify toolset is enabled and `hermes auth spotify` has been run.
|
||||
description: "Spotify: play, search, queue, manage playlists and devices."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
---
|
||||
name: youtube-content
|
||||
description: >
|
||||
Fetch YouTube video transcripts and transform them into structured content
|
||||
(chapters, summaries, threads, blog posts). Use when the user shares a YouTube
|
||||
URL or video link, asks to summarize a video, requests a transcript, or wants
|
||||
to extract and reformat content from any YouTube video.
|
||||
description: "YouTube transcripts to summaries, threads, blogs."
|
||||
---
|
||||
|
||||
# YouTube Content Tool
|
||||
|
||||
## When to use
|
||||
|
||||
Use when the user shares a YouTube URL or video link, asks to summarize a video, requests a transcript, or wants to extract and reformat content from any YouTube video. Transforms transcripts into structured content (chapters, summaries, threads, blog posts).
|
||||
|
||||
Extract transcripts from YouTube videos and convert them into useful formats.
|
||||
|
||||
## Setup
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: evaluating-llms-harness
|
||||
description: Evaluates LLMs across 60+ academic benchmarks (MMLU, HumanEval, GSM8K, TruthfulQA, HellaSwag). Use when benchmarking model quality, comparing models, reporting academic results, or tracking training progress. Industry standard used by EleutherAI, HuggingFace, and major labs. Supports HuggingFace, vLLM, APIs.
|
||||
description: "lm-eval-harness: benchmark LLMs (MMLU, GSM8K, etc.)."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
@@ -13,6 +13,10 @@ metadata:
|
||||
|
||||
# lm-evaluation-harness - LLM Benchmarking
|
||||
|
||||
## What's inside
|
||||
|
||||
Evaluates LLMs across 60+ academic benchmarks (MMLU, HumanEval, GSM8K, TruthfulQA, HellaSwag). Use when benchmarking model quality, comparing models, reporting academic results, or tracking training progress. Industry standard used by EleutherAI, HuggingFace, and major labs. Supports HuggingFace, vLLM, APIs.
|
||||
|
||||
## Quick start
|
||||
|
||||
lm-evaluation-harness evaluates LLMs across 60+ academic benchmarks using standardized prompts and metrics.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: weights-and-biases
|
||||
description: Track ML experiments with automatic logging, visualize training in real-time, optimize hyperparameters with sweeps, and manage model registry with W&B - collaborative MLOps platform
|
||||
description: "W&B: log ML experiments, sweeps, model registry, dashboards."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: huggingface-hub
|
||||
description: Hugging Face Hub CLI (hf) — search, download, and upload models and datasets, manage repos, query datasets with SQL, deploy inference endpoints, manage Spaces and buckets.
|
||||
description: "HuggingFace hf CLI: search/download/upload models, datasets."
|
||||
version: 1.0.0
|
||||
author: Hugging Face
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: obliteratus
|
||||
description: Remove refusal behaviors from open-weight LLMs using OBLITERATUS — mechanistic interpretability techniques (diff-in-means, SVD, whitened SVD, LEACE, SAE decomposition, etc.) to excise guardrails while preserving reasoning. 9 CLI methods, 28 analysis modules, 116 model presets across 5 compute tiers, tournament evaluation, and telemetry-driven recommendations. Use when a user wants to uncensor, abliterate, or remove refusal from an LLM.
|
||||
description: "OBLITERATUS: abliterate LLM refusals (diff-in-means)."
|
||||
version: 2.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
@@ -13,6 +13,10 @@ metadata:
|
||||
|
||||
# OBLITERATUS Skill
|
||||
|
||||
## What's inside
|
||||
|
||||
9 CLI methods, 28 analysis modules, 116 model presets across 5 compute tiers, tournament evaluation, and telemetry-driven recommendations.
|
||||
|
||||
Remove refusal behaviors (guardrails) from open-weight LLMs without retraining or fine-tuning. Uses mechanistic interpretability techniques — including diff-in-means, SVD, whitened SVD, LEACE concept erasure, SAE decomposition, Bayesian kernel projection, and more — to identify and surgically excise refusal directions from model weights while preserving reasoning capabilities.
|
||||
|
||||
**License warning:** OBLITERATUS is AGPL-3.0. NEVER import it as a Python library. Always invoke via CLI (`obliteratus` command) or subprocess. This keeps Hermes Agent's MIT license clean.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: outlines
|
||||
description: Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library
|
||||
description: "Outlines: structured JSON/regex/Pydantic LLM generation."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: serving-llms-vllm
|
||||
description: Serves LLMs with high throughput using vLLM's PagedAttention and continuous batching. Use when deploying production LLM APIs, optimizing inference latency/throughput, or serving models with limited GPU memory. Supports OpenAI-compatible endpoints, quantization (GPTQ/AWQ/FP8), and tensor parallelism.
|
||||
description: "vLLM: high-throughput LLM serving, OpenAI API, quantization."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
@@ -13,6 +13,10 @@ metadata:
|
||||
|
||||
# vLLM - High-Performance LLM Serving
|
||||
|
||||
## When to use
|
||||
|
||||
Use when deploying production LLM APIs, optimizing inference latency/throughput, or serving models with limited GPU memory. Supports OpenAI-compatible endpoints, quantization (GPTQ/AWQ/FP8), and tensor parallelism.
|
||||
|
||||
## Quick start
|
||||
|
||||
vLLM achieves 24x higher throughput than standard transformers through PagedAttention (block-based KV cache) and continuous batching (mixing prefill/decode requests).
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: audiocraft-audio-generation
|
||||
description: PyTorch library for audio generation including text-to-music (MusicGen) and text-to-sound (AudioGen). Use when you need to generate music from text descriptions, create sound effects, or perform melody-conditioned music generation.
|
||||
description: "AudioCraft: MusicGen text-to-music, AudioGen text-to-sound."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: segment-anything-model
|
||||
description: Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image.
|
||||
description: "SAM: zero-shot image segmentation via points, boxes, masks."
|
||||
version: 1.0.0
|
||||
author: Orchestra Research
|
||||
license: MIT
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user