feat: add optional smart model routing

Add a conservative cheap-vs-strong routing option that can send very short/simple turns to a cheaper model across providers while keeping the primary model for complex work. Wire it through CLI, gateway, and cron, and document the config.yaml workflow.
test: fake minisweagent for docker cwd mount regressions
2026-03-16 06:39:03 -07:00 · 2026-03-16 05:40:05 -07:00 · 2026-03-16 05:20:56 -07:00 · 2026-03-16 05:20:21 -07:00 · 2026-03-16 05:02:58 -07:00 · 2026-03-16 04:58:28 -07:00
158 changed files with 11907 additions and 1344 deletions
@@ -235,6 +235,7 @@ hermes_cli/skin_engine.py    # SkinConfig dataclass, built-in skins, YAML loader
 | Spinner verbs | `spinner.thinking_verbs` | `display.py` |
 | Spinner wings (optional) | `spinner.wings` | `display.py` |
 | Tool output prefix | `tool_prefix` | `display.py` |
+| Per-tool emojis | `tool_emojis` | `display.py` → `get_tool_emoji()` |
 | Agent name | `branding.agent_name` | `banner.py`, `cli.py` |
 | Welcome message | `branding.welcome` | `cli.py` |
 | Response box label | `branding.response_label` | `cli.py` |
@@ -62,6 +62,24 @@ hermes doctor       # Diagnose any issues

 📖 **[Full documentation →](https://hermes-agent.nousresearch.com/docs/)**

+## CLI vs Messaging Quick Reference
+
+Hermes has two entry points: start the terminal UI with `hermes`, or run the gateway and talk to it from Telegram, Discord, Slack, WhatsApp, Signal, or Email. Once you're in a conversation, many slash commands are shared across both interfaces.
+
+| Action | CLI | Messaging platforms |
+|---------|-----|---------------------|
+| Start chatting | `hermes` | Run `hermes gateway setup` + `hermes gateway start`, then send the bot a message |
+| Start fresh conversation | `/new` or `/reset` | `/new` or `/reset` |
+| Change model | `/model [provider:model]` | `/model [provider:model]` |
+| Set a personality | `/personality [name]` | `/personality [name]` |
+| Retry or undo the last turn | `/retry`, `/undo` | `/retry`, `/undo` |
+| Compress context / check usage | `/compress`, `/usage`, `/insights [--days N]` | `/compress`, `/usage`, `/insights [days]` |
+| Browse skills | `/skills` or `/<skill-name>` | `/skills` or `/<skill-name>` |
+| Interrupt current work | `Ctrl+C` or send a new message | `/stop` or send a new message |
+| Platform-specific status | `/platforms` | `/status`, `/sethome` |
+
+For the full command lists, see the [CLI guide](https://hermes-agent.nousresearch.com/docs/user-guide/cli) and the [Messaging Gateway guide](https://hermes-agent.nousresearch.com/docs/user-guide/messaging).
+
 ---

 ## Documentation
@@ -42,19 +42,16 @@ def _setup_logging() -> None:

 def _load_env() -> None:
    """Load .env from HERMES_HOME (default ``~/.hermes``)."""
-    from dotenv import load_dotenv
+    from hermes_cli.env_loader import load_hermes_dotenv

    hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
-    env_file = hermes_home / ".env"
-    if env_file.exists():
-        try:
-            load_dotenv(dotenv_path=env_file, encoding="utf-8")
-        except UnicodeDecodeError:
-            load_dotenv(dotenv_path=env_file, encoding="latin-1")
-        logging.getLogger(__name__).info("Loaded env from %s", env_file)
+    loaded = load_hermes_dotenv(hermes_home=hermes_home)
+    if loaded:
+        for env_file in loaded:
+            logging.getLogger(__name__).info("Loaded env from %s", env_file)
    else:
        logging.getLogger(__name__).info(
-            "No .env found at %s, using system env", env_file
+            "No .env found at %s, using system env", hermes_home / ".env"
        )


@@ -497,6 +497,66 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
    return result


+def _image_source_from_openai_url(url: str) -> Dict[str, str]:
+    """Translate an OpenAI-style image reference into an Anthropic ``source``.
+
+    ``data:`` URLs become a ``base64`` source whose media type is taken from
+    the URL header when it names an ``image/*`` type and is ``image/jpeg``
+    otherwise. Anything else, including an empty or missing value, becomes a
+    ``url`` source carrying the stripped string.
+    """
+    cleaned = str(url or "").strip()
+    if not cleaned.startswith("data:"):
+        # Plain URLs and the empty string share the same result shape.
+        return {"type": "url", "url": cleaned}
+
+    # Everything before the first comma is the header ("data:<mime>;base64").
+    header, _, payload = cleaned.partition(",")
+    declared = header[len("data:"):].split(";", 1)[0].strip()
+    media_type = declared if declared.startswith("image/") else "image/jpeg"
+    return {"type": "base64", "media_type": media_type, "data": payload}
+
+
+def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
+    """Map one OpenAI-style content part onto an Anthropic content block.
+
+    Returns ``None`` for a ``None`` part. Strings and other non-dict values
+    become text blocks. ``input_text`` parts become text blocks,
+    ``image_url``/``input_image`` parts become image blocks, and any other
+    dict is shallow-copied unchanged.
+    """
+    if part is None:
+        return None
+    if not isinstance(part, dict):
+        text = part if isinstance(part, str) else str(part)
+        return {"type": "text", "text": text}
+
+    kind = part.get("type")
+    converted: Dict[str, Any]
+    if kind == "input_text":
+        converted = {"type": "text", "text": part.get("text", "")}
+    elif kind in {"image_url", "input_image"}:
+        # ``image_url`` may be a ``{"url": ...}`` mapping or a bare value.
+        raw_image = part.get("image_url", {})
+        if isinstance(raw_image, dict):
+            url = raw_image.get("url", "")
+        else:
+            url = str(raw_image or "")
+        converted = {"type": "image", "source": _image_source_from_openai_url(url)}
+    else:
+        converted = dict(part)
+
+    # Carry cache_control onto rebuilt blocks; copied dicts already have it.
+    cache_control = part.get("cache_control")
+    if isinstance(cache_control, dict) and "cache_control" not in converted:
+        converted["cache_control"] = dict(cache_control)
+    return converted
+
+
+def _convert_content_to_anthropic(content: Any) -> Any:
+    """Convert a list of OpenAI-style content parts to Anthropic blocks.
+
+    Non-list content is returned untouched. For lists, each part is converted
+    individually and parts that convert to ``None`` are dropped.
+    """
+    if not isinstance(content, list):
+        return content
+
+    candidates = (_convert_content_part_to_anthropic(part) for part in content)
+    return [block for block in candidates if block is not None]
+
+
 def convert_messages_to_anthropic(
    messages: List[Dict],
 ) -> Tuple[Optional[Any], List[Dict]]:
@@ -533,11 +593,9 @@ def convert_messages_to_anthropic(
            blocks = []
            if content:
                if isinstance(content, list):
-                    for part in content:
-                        if isinstance(part, dict):
-                            blocks.append(dict(part))
-                        elif part is not None:
-                            blocks.append({"type": "text", "text": str(part)})
+                    converted_content = _convert_content_to_anthropic(content)
+                    if isinstance(converted_content, list):
+                        blocks.extend(converted_content)
                else:
                    blocks.append({"type": "text", "text": str(content)})
            for tc in m.get("tool_calls", []):
@@ -587,12 +645,11 @@ def convert_messages_to_anthropic(

        # Regular user message
        if isinstance(content, list):
-            converted_blocks = []
-            for part in content:
-                converted = _convert_user_content_part_to_anthropic(part)
-                if converted is not None:
-                    converted_blocks.append(converted)
-            result.append({"role": "user", "content": converted_blocks or [{"type": "text", "text": ""}]})
+            converted_blocks = _convert_content_to_anthropic(content)
+            result.append({
+                "role": "user",
+                "content": converted_blocks or [{"type": "text", "text": ""}],
+            })
        else:
            result.append({"role": "user", "content": content})

@@ -83,7 +83,10 @@ _AUTH_JSON_PATH = get_hermes_home() / "auth.json"

 # Codex fallback: uses the Responses API (the only endpoint the Codex
 # OAuth token can access) with a fast model for auxiliary tasks.
-_CODEX_AUX_MODEL = "gpt-5.3-codex"
+# ChatGPT-backed Codex accounts currently reject gpt-5.3-codex for these
+# auxiliary flows, while gpt-5.2-codex remains broadly available and supports
+# vision via Responses.
+_CODEX_AUX_MODEL = "gpt-5.2-codex"
 _CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex"


@@ -59,6 +59,32 @@ def get_skin_tool_prefix() -> str:
    return "┊"


+def get_tool_emoji(tool_name: str, default: str = "⚡") -> str:
+    """Return the emoji shown next to *tool_name*.
+
+    Lookup order:
+    1. ``tool_emojis`` on the active skin, when a skin is loaded
+    2. the tool registry's ``get_emoji`` value for the tool
+    3. *default*
+
+    Empty values at any step fall through to the next one.
+    """
+    # 1. Skin override
+    skin = _get_skin()
+    skin_emoji = skin.tool_emojis.get(tool_name) if skin and skin.tool_emojis else None
+    if skin_emoji:
+        return skin_emoji
+
+    # 2. Registry default (best effort: any failure means "no emoji")
+    try:
+        from tools.registry import registry
+        registry_emoji = registry.get_emoji(tool_name, default="")
+    except Exception:
+        registry_emoji = ""
+
+    # 3. Hardcoded fallback
+    return registry_emoji or default
+
+
 # =========================================================================
 # Tool preview (one-line summary of a tool call's primary argument)
 # =========================================================================
@@ -20,65 +20,16 @@ import json
 import time
 from collections import Counter, defaultdict
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List

-# =========================================================================
-# Model pricing (USD per million tokens) — approximate as of early 2026
-# =========================================================================
-MODEL_PRICING = {
-    # OpenAI
-    "gpt-4o": {"input": 2.50, "output": 10.00},
-    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
-    "gpt-4.1": {"input": 2.00, "output": 8.00},
-    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
-    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
-    "gpt-4.5-preview": {"input": 75.00, "output": 150.00},
-    "gpt-5": {"input": 10.00, "output": 30.00},
-    "gpt-5.4": {"input": 10.00, "output": 30.00},
-    "o3": {"input": 10.00, "output": 40.00},
-    "o3-mini": {"input": 1.10, "output": 4.40},
-    "o4-mini": {"input": 1.10, "output": 4.40},
-    # Anthropic
-    "claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
-    "claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
-    "claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
-    "claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
-    "claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
-    "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
-    # DeepSeek
-    "deepseek-chat": {"input": 0.14, "output": 0.28},
-    "deepseek-reasoner": {"input": 0.55, "output": 2.19},
-    # Google
-    "gemini-2.5-pro": {"input": 1.25, "output": 10.00},
-    "gemini-2.5-flash": {"input": 0.15, "output": 0.60},
-    "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
-    # Meta (via providers)
-    "llama-4-maverick": {"input": 0.50, "output": 0.70},
-    "llama-4-scout": {"input": 0.20, "output": 0.30},
-    # Z.AI / GLM (direct provider — pricing not published externally, treat as local)
-    "glm-5": {"input": 0.0, "output": 0.0},
-    "glm-4.7": {"input": 0.0, "output": 0.0},
-    "glm-4.5": {"input": 0.0, "output": 0.0},
-    "glm-4.5-flash": {"input": 0.0, "output": 0.0},
-    # Kimi / Moonshot (direct provider — pricing not published externally, treat as local)
-    "kimi-k2.5": {"input": 0.0, "output": 0.0},
-    "kimi-k2-thinking": {"input": 0.0, "output": 0.0},
-    "kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
-    "kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
-    # MiniMax (direct provider — pricing not published externally, treat as local)
-    "MiniMax-M2.5": {"input": 0.0, "output": 0.0},
-    "MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
-    "MiniMax-M2.1": {"input": 0.0, "output": 0.0},
-}
+from agent.usage_pricing import DEFAULT_PRICING, estimate_cost_usd, format_duration_compact, get_pricing, has_known_pricing

-# Fallback: unknown/custom models get zero cost (we can't assume pricing
-# for self-hosted models, custom OAI endpoints, local inference, etc.)
-_DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
+_DEFAULT_PRICING = DEFAULT_PRICING


 def _has_known_pricing(model_name: str) -> bool:
    """Check if a model has known pricing (vs unknown/custom endpoint)."""
-    return _get_pricing(model_name) is not _DEFAULT_PRICING
+    return has_known_pricing(model_name)


 def _get_pricing(model_name: str) -> Dict[str, float]:
@@ -87,67 +38,17 @@ def _get_pricing(model_name: str) -> Dict[str, float]:
    Returns _DEFAULT_PRICING (zero cost) for unknown/custom models —
    we can't assume costs for self-hosted endpoints, local inference, etc.
    """
-    if not model_name:
-        return _DEFAULT_PRICING
-
-    # Strip provider prefix (e.g., "anthropic/claude-..." -> "claude-...")
-    bare = model_name.split("/")[-1].lower()
-
-    # Exact match first
-    if bare in MODEL_PRICING:
-        return MODEL_PRICING[bare]
-
-    # Fuzzy prefix match — prefer the LONGEST matching key to avoid
-    # e.g. "gpt-4o" matching before "gpt-4o-mini" for "gpt-4o-mini-2024-07-18"
-    best_match = None
-    best_len = 0
-    for key, price in MODEL_PRICING.items():
-        if bare.startswith(key) and len(key) > best_len:
-            best_match = price
-            best_len = len(key)
-    if best_match:
-        return best_match
-
-    # Keyword heuristics (checked in most-specific-first order)
-    if "opus" in bare:
-        return {"input": 15.00, "output": 75.00}
-    if "sonnet" in bare:
-        return {"input": 3.00, "output": 15.00}
-    if "haiku" in bare:
-        return {"input": 0.80, "output": 4.00}
-    if "gpt-4o-mini" in bare:
-        return {"input": 0.15, "output": 0.60}
-    if "gpt-4o" in bare:
-        return {"input": 2.50, "output": 10.00}
-    if "gpt-5" in bare:
-        return {"input": 10.00, "output": 30.00}
-    if "deepseek" in bare:
-        return {"input": 0.14, "output": 0.28}
-    if "gemini" in bare:
-        return {"input": 0.15, "output": 0.60}
-
-    return _DEFAULT_PRICING
+    return get_pricing(model_name)


 def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost for a given model and token counts."""
-    pricing = _get_pricing(model)
-    return (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
+    return estimate_cost_usd(model, input_tokens, output_tokens)


 def _format_duration(seconds: float) -> str:
    """Format seconds into a human-readable duration string."""
-    if seconds < 60:
-        return f"{seconds:.0f}s"
-    minutes = seconds / 60
-    if minutes < 60:
-        return f"{minutes:.0f}m"
-    hours = minutes / 60
-    if hours < 24:
-        remaining_min = int(minutes % 60)
-        return f"{int(hours)}h {remaining_min}m" if remaining_min else f"{int(hours)}h"
-    days = hours / 24
-    return f"{days:.1f}d"
+    return format_duration_compact(seconds)


 def _bar_chart(values: List[int], max_width: int = 20) -> List[str]:
@@ -0,0 +1,184 @@
+"""Helpers for optional cheap-vs-strong model routing."""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import Any, Dict, Optional
+
+_COMPLEX_KEYWORDS = {
+    "debug",
+    "debugging",
+    "implement",
+    "implementation",
+    "refactor",
+    "patch",
+    "traceback",
+    "stacktrace",
+    "exception",
+    "error",
+    "analyze",
+    "analysis",
+    "investigate",
+    "architecture",
+    "design",
+    "compare",
+    "benchmark",
+    "optimize",
+    "optimise",
+    "review",
+    "terminal",
+    "shell",
+    "tool",
+    "tools",
+    "pytest",
+    "test",
+    "tests",
+    "plan",
+    "planning",
+    "delegate",
+    "subagent",
+    "cron",
+    "docker",
+    "kubernetes",
+}
+
+_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE)
+
+
+def _coerce_bool(value: Any, default: bool = False) -> bool:
+    """Interpret a loosely-typed config value as a boolean.
+
+    ``None`` yields *default*. Strings are true only when, after trimming and
+    lowercasing, they are one of ``1``, ``true``, ``yes`` or ``on``. Every
+    other value goes through ``bool()``.
+    """
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return default
+    if isinstance(value, str):
+        normalized = value.strip().lower()
+        return normalized in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
+def _coerce_int(value: Any, default: int) -> int:
+    """Convert a config value to ``int``, returning *default* if it cannot be.
+
+    Handles ``None``, non-numeric strings and NaN (``TypeError`` /
+    ``ValueError``) as well as infinite floats such as YAML ``.inf``
+    (``OverflowError``), so a malformed limit never raises out of routing.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Pick the cheap-model route for a turn, or ``None`` to keep the primary.
+
+    A route is returned only when routing is enabled, ``cheap_model`` names
+    both a provider and a model, and the message is non-empty and passes
+    every "simple" check: within the configured character and word limits,
+    at most one newline, no backticks, no URL, and no word from
+    ``_COMPLEX_KEYWORDS``. The returned dict is a copy of ``cheap_model``
+    with normalized ``provider``/``model`` and a ``routing_reason`` marker.
+    """
+    settings = routing_config or {}
+    if not _coerce_bool(settings.get("enabled"), False):
+        return None
+
+    cheap_model = settings.get("cheap_model") or {}
+    if not isinstance(cheap_model, dict):
+        return None
+    provider = str(cheap_model.get("provider") or "").strip().lower()
+    model = str(cheap_model.get("model") or "").strip()
+    if not (provider and model):
+        return None
+
+    text = (user_message or "").strip()
+    if not text:
+        return None
+
+    char_limit = _coerce_int(settings.get("max_simple_chars"), 160)
+    word_limit = _coerce_int(settings.get("max_simple_words"), 28)
+
+    # Structural signals of non-trivial work. A single backtick test also
+    # covers fenced code blocks, which necessarily contain backticks.
+    looks_complex = (
+        len(text) > char_limit
+        or len(text.split()) > word_limit
+        or text.count("\n") > 1
+        or "`" in text
+        or _URL_RE.search(text) is not None
+    )
+    if looks_complex:
+        return None
+
+    # Keyword check on whitespace-split tokens with edge punctuation removed.
+    tokens = {piece.strip(".,:;!?()[]{}\"'`") for piece in text.lower().split()}
+    if not tokens.isdisjoint(_COMPLEX_KEYWORDS):
+        return None
+
+    return {
+        **cheap_model,
+        "provider": provider,
+        "model": model,
+        "routing_reason": "simple_turn",
+    }
+
+
+def _primary_turn_route(primary: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the turn route that stays on the primary model (no label)."""
+    return {
+        "model": primary.get("model"),
+        "runtime": {
+            "api_key": primary.get("api_key"),
+            "base_url": primary.get("base_url"),
+            "provider": primary.get("provider"),
+            "api_mode": primary.get("api_mode"),
+        },
+        "label": None,
+        "signature": (
+            primary.get("model"),
+            primary.get("provider"),
+            primary.get("base_url"),
+            primary.get("api_mode"),
+        ),
+    }
+
+
+def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolve the effective model/runtime for one turn.
+
+    Args:
+        user_message: The user's text for this turn.
+        routing_config: The ``smart_model_routing`` settings, or ``None``.
+        primary: The primary model settings; read via ``model``, ``api_key``,
+            ``base_url``, ``provider`` and ``api_mode`` keys.
+
+    Returns:
+        A dict with ``model``, ``runtime`` (``api_key``/``base_url``/
+        ``provider``/``api_mode``), ``label`` and ``signature`` (a
+        ``(model, provider, base_url, api_mode)`` tuple). ``label`` is
+        ``None`` when the primary model is used. The primary route is
+        returned when no cheap route applies or when resolving the cheap
+        provider raises.
+    """
+    route = choose_cheap_model_route(user_message, routing_config)
+    if not route:
+        return _primary_turn_route(primary)
+
+    from hermes_cli.runtime_provider import resolve_runtime_provider
+
+    # An optional env var name in the route supplies an explicit API key;
+    # an unset or empty variable is treated as "no explicit key".
+    explicit_api_key = None
+    api_key_env = str(route.get("api_key_env") or "").strip()
+    if api_key_env:
+        explicit_api_key = os.getenv(api_key_env) or None
+
+    try:
+        runtime = resolve_runtime_provider(
+            requested=route.get("provider"),
+            explicit_api_key=explicit_api_key,
+            explicit_base_url=route.get("base_url"),
+        )
+    except Exception:
+        # Best effort: a broken cheap route must never block the turn, but
+        # leave a trace so the misconfiguration can be diagnosed.
+        import logging
+
+        logging.getLogger(__name__).debug(
+            "Smart model routing could not resolve provider %r; using primary model",
+            route.get("provider"),
+            exc_info=True,
+        )
+        return _primary_turn_route(primary)
+
+    return {
+        "model": route.get("model"),
+        "runtime": {
+            "api_key": runtime.get("api_key"),
+            "base_url": runtime.get("base_url"),
+            "provider": runtime.get("provider"),
+            "api_mode": runtime.get("api_mode"),
+        },
+        "label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
+        "signature": (
+            route.get("model"),
+            runtime.get("provider"),
+            runtime.get("base_url"),
+            runtime.get("api_mode"),
+        ),
+    }
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+from decimal import Decimal
+from typing import Dict
+
+
+# Approximate model pricing in USD per million tokens, keyed by bare model
+# name (no provider prefix). Lookups in get_pricing() are case-insensitive.
+MODEL_PRICING = {
+    # OpenAI
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+    "gpt-4.1": {"input": 2.00, "output": 8.00},
+    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
+    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
+    "gpt-4.5-preview": {"input": 75.00, "output": 150.00},
+    "gpt-5": {"input": 10.00, "output": 30.00},
+    "gpt-5.4": {"input": 10.00, "output": 30.00},
+    "o3": {"input": 10.00, "output": 40.00},
+    "o3-mini": {"input": 1.10, "output": 4.40},
+    "o4-mini": {"input": 1.10, "output": 4.40},
+    # Anthropic
+    "claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
+    "claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
+    "claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
+    "claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
+    "claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
+    "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
+    # DeepSeek
+    "deepseek-chat": {"input": 0.14, "output": 0.28},
+    "deepseek-reasoner": {"input": 0.55, "output": 2.19},
+    # Google
+    "gemini-2.5-pro": {"input": 1.25, "output": 10.00},
+    "gemini-2.5-flash": {"input": 0.15, "output": 0.60},
+    "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
+    # Meta (via providers)
+    "llama-4-maverick": {"input": 0.50, "output": 0.70},
+    "llama-4-scout": {"input": 0.20, "output": 0.30},
+    # Z.AI / GLM, Kimi / Moonshot, MiniMax: direct providers whose pricing is
+    # not published externally, so they are recorded as zero cost.
+    "glm-5": {"input": 0.0, "output": 0.0},
+    "glm-4.7": {"input": 0.0, "output": 0.0},
+    "glm-4.5": {"input": 0.0, "output": 0.0},
+    "glm-4.5-flash": {"input": 0.0, "output": 0.0},
+    "kimi-k2.5": {"input": 0.0, "output": 0.0},
+    "kimi-k2-thinking": {"input": 0.0, "output": 0.0},
+    "kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
+    "kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
+    "MiniMax-M2.5": {"input": 0.0, "output": 0.0},
+    "MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
+    "MiniMax-M2.1": {"input": 0.0, "output": 0.0},
+}
+
+# Sentinel for unknown/custom models: zero cost. Callers compare against this
+# object by identity, so get_pricing() must return it as-is.
+DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
+
+
+def get_pricing(model_name: str) -> Dict[str, float]:
+    """Return the ``{"input", "output"}`` rates for *model_name*.
+
+    Resolution order, applied to the lowercased name with any
+    ``provider/`` prefix removed:
+
+    1. exact key in ``MODEL_PRICING``
+    2. longest ``MODEL_PRICING`` key (compared case-insensitively) that the
+       name starts with, so dated variants such as
+       ``gpt-4o-mini-2024-07-18`` resolve to ``gpt-4o-mini``, not ``gpt-4o``
+    3. family keyword heuristics (``opus``, ``sonnet``, ...)
+    4. ``DEFAULT_PRICING`` (the same object, so identity checks work)
+    """
+    if not model_name:
+        return DEFAULT_PRICING
+
+    bare = model_name.split("/")[-1].lower()
+    if bare in MODEL_PRICING:
+        return MODEL_PRICING[bare]
+
+    # Keys are lowercased for comparison because ``bare`` is lowercase while
+    # some table keys (e.g. "MiniMax-M2.5") are not.
+    best_match = None
+    best_len = 0
+    for key, price in MODEL_PRICING.items():
+        lowered_key = key.lower()
+        if bare.startswith(lowered_key) and len(lowered_key) > best_len:
+            best_match = price
+            best_len = len(lowered_key)
+    if best_match is not None:
+        return best_match
+
+    # Keyword heuristics, most specific first.
+    if "opus" in bare:
+        return {"input": 15.00, "output": 75.00}
+    if "sonnet" in bare:
+        return {"input": 3.00, "output": 15.00}
+    if "haiku" in bare:
+        return {"input": 0.80, "output": 4.00}
+    if "gpt-4o-mini" in bare:
+        return {"input": 0.15, "output": 0.60}
+    if "gpt-4o" in bare:
+        return {"input": 2.50, "output": 10.00}
+    if "gpt-5" in bare:
+        return {"input": 10.00, "output": 30.00}
+    if "deepseek" in bare:
+        return {"input": 0.14, "output": 0.28}
+    if "gemini" in bare:
+        return {"input": 0.15, "output": 0.60}
+
+    return DEFAULT_PRICING
+
+
+def has_known_pricing(model_name: str) -> bool:
+    """Report whether *model_name* resolves to a non-zero price.
+
+    False when ``get_pricing`` falls back to ``DEFAULT_PRICING`` or when
+    every rate in the resolved entry is zero.
+    """
+    rates = get_pricing(model_name)
+    if rates is DEFAULT_PRICING:
+        return False
+    for rate in rates.values():
+        if float(rate) > 0:
+            return True
+    return False
+
+
+def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
+    """Estimate the USD cost of a call from its token counts.
+
+    Rates come from ``get_pricing`` and are divided by one million after
+    being multiplied by the token counts. The arithmetic is done in
+    ``Decimal`` and the result is converted back to ``float``.
+    """
+    rates = get_pricing(model)
+    # Build Decimals from the rates' text form, not their binary float value.
+    input_rate = Decimal(str(rates["input"]))
+    output_rate = Decimal(str(rates["output"]))
+    weighted = Decimal(input_tokens) * input_rate + Decimal(output_tokens) * output_rate
+    return float(weighted / Decimal("1000000"))
+
+
+def format_duration_compact(seconds: float) -> str:
+    """Render a duration in seconds as a short label.
+
+    Under a minute: ``"42s"``. Under an hour: ``"17m"``. Under a day:
+    ``"3h 5m"``, or ``"3h"`` when there are no leftover minutes. Otherwise
+    days with one decimal, e.g. ``"2.5d"``.
+    """
+    if seconds < 60:
+        return f"{seconds:.0f}s"
+
+    total_minutes = seconds / 60
+    if total_minutes < 60:
+        return f"{total_minutes:.0f}m"
+
+    total_hours = total_minutes / 60
+    if total_hours >= 24:
+        return f"{total_hours / 24:.1f}d"
+
+    whole_hours = int(total_hours)
+    leftover_minutes = int(total_minutes % 60)
+    if leftover_minutes:
+        return f"{whole_hours}h {leftover_minutes}m"
+    return f"{whole_hours}h"
+
+
+def format_token_count_compact(value: int) -> str:
+    """Abbreviate a token count with a K/M/B suffix.
+
+    Counts below 1,000 are returned as plain integers. Larger counts keep
+    about three significant digits (``1.25K``, ``12.3K``, ``250K``) with
+    trailing fractional zeros removed. A value that rounds up to 1000 of a
+    unit is promoted to the next unit (``999_999`` -> ``"1M"``). Negative
+    values keep their sign.
+    """
+    abs_value = abs(int(value))
+    if abs_value < 1_000:
+        return str(int(value))
+
+    sign = "-" if value < 0 else ""
+    units = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))
+    for index, (threshold, suffix) in enumerate(units):
+        if abs_value < threshold:
+            continue
+        scaled = abs_value / threshold
+        if scaled < 10:
+            text = f"{scaled:.2f}"
+        elif scaled < 100:
+            text = f"{scaled:.1f}"
+        else:
+            text = f"{scaled:.0f}"
+        # Only trim zeros after a decimal point; trimming an integer such as
+        # "100" would corrupt it into "1".
+        if "." in text:
+            text = text.rstrip("0").rstrip(".")
+        # Rounding can produce "1000" of a unit; show it as 1 of the next one.
+        if text == "1000" and index > 0:
+            text, suffix = "1", units[index - 1][1]
+        return f"{sign}{text}{suffix}"
+
+    # Unreachable: abs_value >= 1_000 always matches the "K" unit above.
+    return f"{value:,}"
@@ -51,6 +51,20 @@ model:
 #   # Data policy: "allow" (default) or "deny" to exclude providers that may store data
 #   # data_collection: "deny"

+# =============================================================================
+# Smart Model Routing (optional)
+# =============================================================================
+# Use a cheaper model for short/simple turns while keeping your main model for
+# more complex requests. Disabled by default.
+#
+# smart_model_routing:
+#   enabled: true
+#   max_simple_chars: 160
+#   max_simple_words: 28
+#   cheap_model:
+#     provider: openrouter
+#     model: google/gemini-2.5-flash
+
 # =============================================================================
 # Git Worktree Isolation
 # =============================================================================
@@ -76,8 +90,9 @@ model:
 #   - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
 terminal:
  backend: "local"
-  cwd: "."  # For local backend: "." = current directory. Ignored for remote backends.
+  cwd: "."  # For local backend: "." = current directory. Ignored for remote backends unless a backend documents otherwise.
  timeout: 180
+  docker_mount_cwd_to_workspace: false  # SECURITY: off by default. Opt in to mount the launch cwd into Docker /workspace.
  lifetime_seconds: 300
  # sudo_password: ""  # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!

@@ -107,6 +122,7 @@ terminal:
 #   timeout: 180
 #   lifetime_seconds: 300
 #   docker_image: "nikolaik/python-nodejs:python3.11-nodejs20"
+#   docker_mount_cwd_to_workspace: true   # Explicit opt-in: mount your launch cwd into /workspace

 # -----------------------------------------------------------------------------
 # OPTION 4: Singularity/Apptainer container
@@ -333,6 +349,12 @@ session_reset:
  idle_minutes: 1440   # Inactivity timeout in minutes (default: 1440 = 24 hours)
  at_hour: 4           # Daily reset hour, 0-23 local time (default: 4 AM)

+# When true, group/channel chats use one session per participant when the platform
+# provides a user ID. This is the secure default and prevents users in the same
+# room from sharing context, interrupts, and token costs. Set false only if you
+# explicitly want one shared "room brain" per group/channel.
+group_sessions_per_user: true
+
 # =============================================================================
 # Skills Configuration
 # =============================================================================
@@ -58,26 +58,20 @@ except (ImportError, AttributeError):
 import threading
 import queue

+from agent.usage_pricing import estimate_cost_usd, format_duration_compact, format_token_count_compact, has_known_pricing
+from hermes_cli.banner import _format_context_length
+
 _COMMAND_SPINNER_FRAMES = ("⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏")


-# Load .env from ~/.hermes/.env first, then project root as dev fallback
-from dotenv import load_dotenv
+# Load .env from ~/.hermes/.env first, then project root as dev fallback.
+# User-managed env files should override stale shell exports on restart.
 from hermes_constants import OPENROUTER_BASE_URL
+from hermes_cli.env_loader import load_hermes_dotenv

 _hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
-_user_env = _hermes_home / ".env"
 _project_env = Path(__file__).parent / '.env'
-if _user_env.exists():
-    try:
-        load_dotenv(dotenv_path=_user_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_user_env, encoding="latin-1")
-elif _project_env.exists():
-    try:
-        load_dotenv(dotenv_path=_project_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_project_env, encoding="latin-1")
+load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)

 # Point mini-swe-agent at ~/.hermes/ so it shares our config
 os.environ.setdefault("MSWEA_GLOBAL_CONFIG_DIR", str(_hermes_home))
@@ -171,6 +165,7 @@ def load_cli_config() -> Dict[str, Any]:
            "modal_image": "python:3.11",
            "daytona_image": "nikolaik/python-nodejs:python3.11-nodejs20",
            "docker_volumes": [],  # host:container volume mounts for Docker backend
+            "docker_mount_cwd_to_workspace": False,  # explicit opt-in only; default off for sandbox isolation
        },
        "browser": {
            "inactivity_timeout": 120,  # Auto-cleanup inactive browser sessions after 2 min
@@ -181,6 +176,12 @@ def load_cli_config() -> Dict[str, Any]:
            "threshold": 0.50,    # Compress at 50% of model's context limit
            "summary_model": "google/gemini-3-flash-preview",  # Fast/cheap model for summaries
        },
+        "smart_model_routing": {
+            "enabled": False,
+            "max_simple_chars": 160,
+            "max_simple_words": 28,
+            "cheap_model": {},
+        },
        "agent": {
            "max_turns": 90,  # Default max tool-calling iterations (shared with subagents)
            "verbose": False,
@@ -336,7 +337,10 @@ def load_cli_config() -> Dict[str, Any]:
        "container_disk": "TERMINAL_CONTAINER_DISK",
        "container_persistent": "TERMINAL_CONTAINER_PERSISTENT",
        "docker_volumes": "TERMINAL_DOCKER_VOLUMES",
+        "docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
        "sandbox_dir": "TERMINAL_SANDBOX_DIR",
+        # Persistent shell (non-local backends)
+        "persistent_shell": "TERMINAL_PERSISTENT_SHELL",
        # Sudo support (works with all backends)
        "sudo_password": "SUDO_PASSWORD",
    }
@@ -1119,6 +1123,10 @@ class HermesCLI:
        fb = CLI_CONFIG.get("fallback_model") or {}
        self._fallback_model = fb if fb.get("provider") and fb.get("model") else None

+        # Optional cheap-vs-strong routing for simple turns
+        self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
+        self._active_agent_route_signature = None
+
        # Agent will be initialized on first use
        self.agent: Optional[AIAgent] = None
        self._app = None  # prompt_toolkit Application (set in run())
@@ -1201,6 +1209,153 @@ class HermesCLI:
            self._last_invalidate = now
            self._app.invalidate()

+    def _status_bar_context_style(self, percent_used: Optional[int]) -> str:
+        """Map a context-usage percentage to a status-bar style class.
+
+        Args:
+            percent_used: Share of the context window in use, or ``None``
+                when it is not known.
+
+        Returns:
+            The dim style for ``None``; otherwise critical at 95 and above,
+            bad above 80, warn at 50 and above, and good below that.
+        """
+        if percent_used is None:
+            style = "class:status-bar-dim"
+        elif percent_used >= 95:
+            style = "class:status-bar-critical"
+        elif percent_used > 80:
+            style = "class:status-bar-bad"
+        elif percent_used >= 50:
+            style = "class:status-bar-warn"
+        else:
+            style = "class:status-bar-good"
+        return style
+
+    def _build_context_bar(self, percent_used: Optional[int], width: int = 10) -> str:
+        """Draw a bracketed text gauge of ``width`` cells for ``percent_used``.
+
+        ``None`` is drawn as 0%, and the percentage is clamped into 0..100
+        before it is scaled to the number of filled cells.
+        """
+        clamped = min(100, max(0, percent_used or 0))
+        filled_cells = round((clamped / 100) * width)
+        empty_cells = max(0, width - filled_cells)
+        return "[" + ("█" * filled_cells) + ("░" * empty_cells) + "]"
+
+    def _get_status_bar_snapshot(self) -> Dict[str, Any]:
+        """Gather the values the status bar displays into a plain dict.
+
+        The snapshot always carries the model labels, the elapsed session
+        time and zeroed usage counters.  When an agent exists, its session
+        token/API-call counters and an estimated cost are filled in, and
+        when that agent has a context compressor the context-window figures
+        are added as well.
+
+        Returns:
+            Dict with the keys ``model_name``, ``model_short``, ``duration``,
+            ``context_tokens``, ``context_length``, ``context_percent``,
+            ``session_prompt_tokens``, ``session_completion_tokens``,
+            ``session_total_tokens``, ``session_api_calls``,
+            ``session_cost``, ``pricing_known`` and ``compressions``.
+        """
+        model_name = self.model or "unknown"
+        # Show only the part after the last "/" and cap it at 26 characters.
+        model_short = model_name.split("/")[-1] if "/" in model_name else model_name
+        if len(model_short) > 26:
+            model_short = f"{model_short[:23]}..."
+
+        elapsed_seconds = max(0.0, (datetime.now() - self.session_start).total_seconds())
+        snapshot = {
+            "model_name": model_name,
+            "model_short": model_short,
+            "duration": format_duration_compact(elapsed_seconds),
+            "context_tokens": 0,
+            "context_length": None,
+            "context_percent": None,
+            "session_prompt_tokens": 0,
+            "session_completion_tokens": 0,
+            "session_total_tokens": 0,
+            "session_api_calls": 0,
+            "session_cost": 0.0,
+            "pricing_known": has_known_pricing(model_name),
+            "compressions": 0,
+        }
+
+        agent = getattr(self, "agent", None)
+        if not agent:
+            return snapshot
+
+        # Price the counters against the model the live agent is actually
+        # running: per-turn routing can build the agent with a model other
+        # than self.model, and the session usage printout already keys its
+        # cost figures on agent.model.
+        billing_model = getattr(agent, "model", None) or model_name
+        snapshot["pricing_known"] = has_known_pricing(billing_model)
+
+        snapshot["session_prompt_tokens"] = getattr(agent, "session_prompt_tokens", 0) or 0
+        snapshot["session_completion_tokens"] = getattr(agent, "session_completion_tokens", 0) or 0
+        snapshot["session_total_tokens"] = getattr(agent, "session_total_tokens", 0) or 0
+        snapshot["session_api_calls"] = getattr(agent, "session_api_calls", 0) or 0
+        snapshot["session_cost"] = estimate_cost_usd(
+            billing_model,
+            snapshot["session_prompt_tokens"],
+            snapshot["session_completion_tokens"],
+        )
+
+        compressor = getattr(agent, "context_compressor", None)
+        if compressor:
+            context_tokens = getattr(compressor, "last_prompt_tokens", 0) or 0
+            context_length = getattr(compressor, "context_length", 0) or 0
+            snapshot["context_tokens"] = context_tokens
+            snapshot["context_length"] = context_length or None
+            snapshot["compressions"] = getattr(compressor, "compression_count", 0) or 0
+            # Leave the percentage as None when the window size is unknown,
+            # which also avoids dividing by zero.
+            if context_length:
+                snapshot["context_percent"] = max(0, min(100, round((context_tokens / context_length) * 100)))
+
+        return snapshot
+
+    def _build_status_bar_text(self, width: Optional[int] = None) -> str:
+        """Return the status bar as a single plain string.
+
+        Args:
+            width: Column budget; a falsy value means "use the current
+                terminal width" (80 when it cannot be determined).
+
+        Returns:
+            Model and duration only below 52 columns; model, context
+            percentage, cost and duration below 76; otherwise the full
+            line including the used/total context label.  If anything
+            raises while building it, a minimal ``⚕ <model>`` string.
+        """
+        try:
+            snap = self._get_status_bar_snapshot()
+            columns = width or shutil.get_terminal_size((80, 24)).columns
+            pct = snap["context_percent"]
+            pct_label = "--" if pct is None else f"{pct}%"
+            price_label = f"${snap['session_cost']:.2f}" if snap["pricing_known"] else "cost n/a"
+            elapsed_label = snap["duration"]
+            model_label = snap["model_short"]
+
+            if columns < 52:
+                return f"⚕ {model_label} · {elapsed_label}"
+            if columns < 76:
+                return f"⚕ {model_label} · {pct_label} · {price_label} · {elapsed_label}"
+
+            window = snap["context_length"]
+            if not window:
+                context_label = "ctx --"
+            else:
+                window_label = _format_context_length(window)
+                used_label = format_token_count_compact(snap["context_tokens"])
+                context_label = f"{used_label}/{window_label}"
+
+            return f"⚕ {model_label} │ {context_label} │ {pct_label} │ {price_label} │ {elapsed_label}"
+        except Exception:
+            # Best-effort rendering: never let the status bar raise.
+            fallback_name = getattr(self, "model", None) or "Hermes"
+            return f"⚕ {fallback_name}"
+
+    def _get_status_bar_fragments(self):
+        """Return the status bar as a list of ``(style, text)`` fragments.
+
+        The layout follows the current terminal width (80 when it cannot
+        be determined): model and duration below 52 columns; model,
+        context percentage, cost and duration below 76; otherwise the full
+        line with the context label and a gauge.  The percentage (and the
+        gauge) use the style chosen by ``_status_bar_context_style``.  If
+        anything raises, a single fragment wrapping the plain-text status
+        bar is returned instead.
+        """
+        try:
+            snap = self._get_status_bar_snapshot()
+            columns = shutil.get_terminal_size((80, 24)).columns
+            price_label = f"${snap['session_cost']:.2f}" if snap["pricing_known"] else "cost n/a"
+            elapsed_label = snap["duration"]
+
+            base = "class:status-bar"
+            dim = "class:status-bar-dim"
+            # Every layout opens with the model name and closes with the
+            # duration followed by one padding cell.
+            head = [(base, " ⚕ "), ("class:status-bar-strong", snap["model_short"])]
+            tail = [(dim, elapsed_label), (base, " ")]
+
+            if columns < 52:
+                return head + [(dim, " · ")] + tail
+
+            pct = snap["context_percent"]
+            pct_label = "--" if pct is None else f"{pct}%"
+            if columns < 76:
+                middle = [
+                    (dim, " · "),
+                    (self._status_bar_context_style(pct), pct_label),
+                    (dim, " · "),
+                    (dim, price_label),
+                    (dim, " · "),
+                ]
+                return head + middle + tail
+
+            window = snap["context_length"]
+            if not window:
+                context_label = "ctx --"
+            else:
+                window_label = _format_context_length(window)
+                used_label = format_token_count_compact(snap["context_tokens"])
+                context_label = f"{used_label}/{window_label}"
+
+            gauge_style = self._status_bar_context_style(pct)
+            middle = [
+                (dim, " │ "),
+                (dim, context_label),
+                (dim, " │ "),
+                (gauge_style, self._build_context_bar(pct)),
+                (dim, " "),
+                (gauge_style, pct_label),
+                (dim, " │ "),
+                (dim, price_label),
+                (dim, " │ "),
+            ]
+            return head + middle + tail
+        except Exception:
+            # Best-effort rendering: fall back to the plain-text bar.
+            return [("class:status-bar", f" {self._build_status_bar_text()} ")]
+
    def _normalize_model_for_provider(self, resolved_provider: str) -> bool:
        """Strip provider prefixes and swap the default model for Codex.

@@ -1351,10 +1506,27 @@ class HermesCLI:
        # routing, or the effective model changed.
        if (credentials_changed or routing_changed or model_changed) and self.agent is not None:
            self.agent = None
+            self._active_agent_route_signature = None

        return True

-    def _init_agent(self) -> bool:
+    def _resolve_turn_agent_config(self, user_message: str) -> dict:
+        """Work out which model and runtime settings should serve one turn.
+
+        Hands the message, the ``smart_model_routing`` config section and
+        the CLI's current model/credential settings to
+        ``resolve_turn_route`` and returns its result unchanged.  Callers
+        in this file read the ``model``, ``runtime``, ``label`` and
+        ``signature`` keys of that result.
+        """
+        from agent.smart_model_routing import resolve_turn_route
+
+        # The CLI's current settings, used as the primary (non-routed) option.
+        primary_settings = {
+            "model": self.model,
+            "api_key": self.api_key,
+            "base_url": self.base_url,
+            "provider": self.provider,
+            "api_mode": self.api_mode,
+        }
+        return resolve_turn_route(user_message, self._smart_model_routing, primary_settings)
+
+    def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None) -> bool:
        """
        Initialize the agent on first use.
        When resuming a session, restores conversation history from SQLite.
@@ -1414,16 +1586,23 @@ class HermesCLI:
                pass
        
        try:
+            runtime = runtime_override or {
+                "api_key": self.api_key,
+                "base_url": self.base_url,
+                "provider": self.provider,
+                "api_mode": self.api_mode,
+            }
+            effective_model = model_override or self.model
            self.agent = AIAgent(
-                model=self.model,
-                api_key=self.api_key,
-                base_url=self.base_url,
-                provider=self.provider,
-                api_mode=self.api_mode,
+                model=effective_model,
+                api_key=runtime.get("api_key"),
+                base_url=runtime.get("base_url"),
+                provider=runtime.get("provider"),
+                api_mode=runtime.get("api_mode"),
                max_iterations=self.max_turns,
                enabled_toolsets=self.enabled_toolsets,
                verbose_logging=self.verbose,
-                quiet_mode=True,
+                quiet_mode=not self.verbose,
                ephemeral_system_prompt=self.system_prompt if self.system_prompt else None,
                prefill_messages=self.prefill_messages or None,
                reasoning_config=self.reasoning_config,
@@ -1437,7 +1616,7 @@ class HermesCLI:
                platform="cli",
                session_db=self._session_db,
                clarify_callback=self._clarify_callback,
-                reasoning_callback=self._on_reasoning if self.show_reasoning else None,
+                reasoning_callback=self._on_reasoning if (self.show_reasoning or self.verbose) else None,
                honcho_session_key=None,  # resolved by run_agent via config sessions map / title
                fallback_model=self._fallback_model,
                thinking_callback=self._on_thinking,
@@ -1446,7 +1625,13 @@ class HermesCLI:
                pass_session_id=self.pass_session_id,
                tool_progress_callback=self._on_tool_progress,
            )
-            # Apply any pending title now that the session exists in the DB
+            self._active_agent_route_signature = (
+                effective_model,
+                runtime.get("provider"),
+                runtime.get("base_url"),
+                runtime.get("api_mode"),
+            )
+
            if self._pending_title and self._session_db:
                try:
                    self._session_db.set_session_title(self.session_id, self._pending_title)
@@ -1736,7 +1921,14 @@ class HermesCLI:
        return False

    def _handle_rollback_command(self, command: str):
-        """Handle /rollback — list or restore filesystem checkpoints."""
+        """Handle /rollback — list, diff, or restore filesystem checkpoints.
+
+        Syntax:
+            /rollback                 — list checkpoints
+            /rollback <N>             — restore checkpoint N (also undoes last chat turn)
+            /rollback diff <N>        — preview changes since checkpoint N
+            /rollback <N> <file>      — restore a single file from checkpoint N
+        """
        from tools.checkpoint_manager import CheckpointManager, format_checkpoint_list

        if not hasattr(self, 'agent') or not self.agent:
@@ -1751,38 +1943,89 @@ class HermesCLI:
            return

        cwd = os.getenv("TERMINAL_CWD", os.getcwd())
-        parts = command.split(maxsplit=1)
-        arg = parts[1].strip() if len(parts) > 1 else ""
+        parts = command.split()
+        args = parts[1:] if len(parts) > 1 else []

-        if not arg:
+        if not args:
            # List checkpoints
            checkpoints = mgr.list_checkpoints(cwd)
            print(format_checkpoint_list(checkpoints, cwd))
-        else:
-            # Restore by number or hash
+            return
+
+        # Handle /rollback diff <N>
+        if args[0].lower() == "diff":
+            if len(args) < 2:
+                print("  Usage: /rollback diff <N>")
+                return
            checkpoints = mgr.list_checkpoints(cwd)
            if not checkpoints:
                print(f"  No checkpoints found for {cwd}")
                return
-
-            target_hash = None
-            try:
-                idx = int(arg) - 1  # 1-indexed for user
-                if 0 <= idx < len(checkpoints):
-                    target_hash = checkpoints[idx]["hash"]
-                else:
-                    print(f"  Invalid checkpoint number. Use 1-{len(checkpoints)}.")
-                    return
-            except ValueError:
-                # Try as a git hash
-                target_hash = arg
-
-            result = mgr.restore(cwd, target_hash)
+            target_hash = self._resolve_checkpoint_ref(args[1], checkpoints)
+            if not target_hash:
+                return
+            result = mgr.diff(cwd, target_hash)
            if result["success"]:
-                print(f"  ✅ Restored to checkpoint {result['restored_to']}: {result['reason']}")
-                print(f"  A pre-rollback snapshot was saved automatically.")
+                stat = result.get("stat", "")
+                diff = result.get("diff", "")
+                if not stat and not diff:
+                    print("  No changes since this checkpoint.")
+                else:
+                    if stat:
+                        print(f"\n{stat}")
+                    if diff:
+                        # Limit diff output to avoid terminal flood
+                        diff_lines = diff.splitlines()
+                        if len(diff_lines) > 80:
+                            print("\n".join(diff_lines[:80]))
+                            print(f"\n  ... ({len(diff_lines) - 80} more lines, showing first 80)")
+                        else:
+                            print(f"\n{diff}")
            else:
                print(f"  ❌ {result['error']}")
+            return
+
+        # Resolve checkpoint reference (number or hash)
+        checkpoints = mgr.list_checkpoints(cwd)
+        if not checkpoints:
+            print(f"  No checkpoints found for {cwd}")
+            return
+
+        target_hash = self._resolve_checkpoint_ref(args[0], checkpoints)
+        if not target_hash:
+            return
+
+        # Check for file-level restore: /rollback <N> <file>
+        file_path = args[1] if len(args) > 1 else None
+
+        result = mgr.restore(cwd, target_hash, file_path=file_path)
+        if result["success"]:
+            if file_path:
+                print(f"  ✅ Restored {file_path} from checkpoint {result['restored_to']}: {result['reason']}")
+            else:
+                print(f"  ✅ Restored to checkpoint {result['restored_to']}: {result['reason']}")
+            print(f"  A pre-rollback snapshot was saved automatically.")
+
+            # Also undo the last conversation turn so the agent's context
+            # matches the restored filesystem state
+            if self.conversation_history:
+                self.undo_last()
+                print(f"  Chat turn undone to match restored file state.")
+        else:
+            print(f"  ❌ {result['error']}")
+
+    def _resolve_checkpoint_ref(self, ref: str, checkpoints: list) -> Optional[str]:
+        """Resolve a user-supplied checkpoint reference to a commit hash.
+
+        Args:
+            ref: Either a 1-based position in ``checkpoints`` or any other
+                string, which is passed through as a git hash.
+            checkpoints: Checkpoint entries; each is indexed by ``"hash"``.
+
+        Returns:
+            The hash of the selected checkpoint, ``ref`` itself when it is
+            not an integer, or ``None`` (after printing a hint) when the
+            number is out of range.
+        """
+        # The annotation uses Optional[...] rather than ``str | None`` so the
+        # definition also evaluates on interpreters older than Python 3.10,
+        # matching the Optional[...] style used elsewhere in this class.
+        try:
+            idx = int(ref) - 1  # 1-indexed for user
+        except ValueError:
+            # Treat as a git hash
+            return ref
+
+        if 0 <= idx < len(checkpoints):
+            return checkpoints[idx]["hash"]
+
+        print(f"  Invalid checkpoint number. Use 1-{len(checkpoints)}.")
+        return None

    def _handle_paste_command(self):
        """Handle /paste — explicitly check clipboard for an image.
@@ -2920,6 +3163,12 @@ class HermesCLI:
                # Parse provider:model syntax (e.g. "openrouter:anthropic/claude-sonnet-4.5")
                current_provider = self.provider or self.requested_provider or "openrouter"
                target_provider, new_model = parse_model_input(raw_input, current_provider)
+                # Auto-detect provider when no explicit provider:model syntax was used
+                if target_provider == current_provider:
+                    from hermes_cli.models import detect_provider_for_model
+                    detected = detect_provider_for_model(new_model, current_provider)
+                    if detected:
+                        target_provider, new_model = detected
                provider_changed = target_provider != current_provider

                # If provider is changing, re-resolve credentials for the new provider
@@ -3160,14 +3409,16 @@ class HermesCLI:
        _cprint(f"  Task ID: {task_id}")
        _cprint(f"  You can continue chatting — results will appear when done.\n")

+        turn_route = self._resolve_turn_agent_config(prompt)
+
        def run_background():
            try:
                bg_agent = AIAgent(
-                    model=self.model,
-                    api_key=self.api_key,
-                    base_url=self.base_url,
-                    provider=self.provider,
-                    api_mode=self.api_mode,
+                    model=turn_route["model"],
+                    api_key=turn_route["runtime"].get("api_key"),
+                    base_url=turn_route["runtime"].get("base_url"),
+                    provider=turn_route["runtime"].get("provider"),
+                    api_mode=turn_route["runtime"].get("api_mode"),
                    max_iterations=self.max_turns,
                    enabled_toolsets=self.enabled_toolsets,
                    quiet_mode=True,
@@ -3294,12 +3545,17 @@ class HermesCLI:
        if self.agent:
            self.agent.verbose_logging = self.verbose
            self.agent.quiet_mode = not self.verbose
+            # Auto-enable reasoning display in verbose mode
+            if self.verbose:
+                self.agent.reasoning_callback = self._on_reasoning
+            elif not self.show_reasoning:
+                self.agent.reasoning_callback = None

        labels = {
            "off": "[dim]Tool progress: OFF[/] — silent mode, just the final response.",
            "new": "[yellow]Tool progress: NEW[/] — show each new tool (skip repeats).",
            "all": "[green]Tool progress: ALL[/] — show every tool call.",
-            "verbose": "[bold green]Tool progress: VERBOSE[/] — full args, results, and debug logs.",
+            "verbose": "[bold green]Tool progress: VERBOSE[/] — full args, results, think blocks, and debug logs.",
        }
        self.console.print(labels.get(self.tool_progress_mode, ""))

@@ -3366,13 +3622,17 @@ class HermesCLI:

    def _on_reasoning(self, reasoning_text: str):
        """Callback for intermediate reasoning display during tool-call loops."""
-        lines = reasoning_text.strip().splitlines()
-        if len(lines) > 5:
-            preview = "\n".join(lines[:5])
-            preview += f"\n  ... ({len(lines) - 5} more lines)"
+        if self.verbose:
+            # Verbose mode: show full reasoning text
+            _cprint(f"  {_DIM}[thinking] {reasoning_text.strip()}{_RST}")
        else:
-            preview = reasoning_text.strip()
-        _cprint(f"  {_DIM}[thinking] {preview}{_RST}")
+            lines = reasoning_text.strip().splitlines()
+            if len(lines) > 5:
+                preview = "\n".join(lines[:5])
+                preview += f"\n  ... ({len(lines) - 5} more lines)"
+            else:
+                preview = reasoning_text.strip()
+            _cprint(f"  {_DIM}[thinking] {preview}{_RST}")

    def _manual_compress(self):
        """Manually trigger context compression on the current conversation."""
@@ -3439,17 +3699,34 @@ class HermesCLI:
        compressions = compressor.compression_count

        msg_count = len(self.conversation_history)
+        cost = estimate_cost_usd(agent.model, prompt, completion)
+        prompt_cost = estimate_cost_usd(agent.model, prompt, 0)
+        completion_cost = estimate_cost_usd(agent.model, 0, completion)
+        pricing_known = has_known_pricing(agent.model)
+        elapsed = format_duration_compact((datetime.now() - self.session_start).total_seconds())

        print(f"  📊 Session Token Usage")
        print(f"  {'─' * 40}")
+        print(f"  Model:                     {agent.model}")
        print(f"  Prompt tokens (input):     {prompt:>10,}")
        print(f"  Completion tokens (output): {completion:>9,}")
        print(f"  Total tokens:              {total:>10,}")
        print(f"  API calls:                 {calls:>10,}")
+        print(f"  Session duration:          {elapsed:>10}")
+        if pricing_known:
+            print(f"  Input cost:              ${prompt_cost:>10.4f}")
+            print(f"  Output cost:             ${completion_cost:>10.4f}")
+            print(f"  Total cost:              ${cost:>10.4f}")
+        else:
+            print(f"  Input cost:              {'n/a':>10}")
+            print(f"  Output cost:             {'n/a':>10}")
+            print(f"  Total cost:              {'n/a':>10}")
        print(f"  {'─' * 40}")
        print(f"  Current context:  {last_prompt:,} / {ctx_len:,} ({pct:.0f}%)")
        print(f"  Messages:         {msg_count}")
        print(f"  Compressions:     {compressions}")
+        if not pricing_known:
+            print(f"  Note:             Pricing unknown for {agent.model}")

        if self.verbose:
            logging.getLogger().setLevel(logging.DEBUG)
@@ -3493,6 +3770,56 @@ class HermesCLI:
        except Exception as e:
            print(f"  Error generating insights: {e}")

+    def _check_config_mcp_changes(self) -> None:
+        """Detect mcp_servers changes in config.yaml and auto-reload MCP connections.
+
+        Meant to be polled while the CLI is idle; the method throttles
+        itself so config.yaml is stat()-ed at most once per
+        CONFIG_WATCH_INTERVAL seconds.  It compares the file's mtime and
+        then its ``mcp_servers`` section against the last known state.
+        When that section changed, it prints a notice and calls
+        ``_reload_mcp()`` so the user knows the tool list was refreshed.
+
+        A missing, unreadable, unparsable or non-mapping config file is
+        ignored: the watcher is best-effort and must not raise into the
+        input loop.
+        """
+        import time
+        import yaml as _yaml
+
+        CONFIG_WATCH_INTERVAL = 5.0  # seconds between config.yaml stat() calls
+
+        now = time.monotonic()
+        if now - self._last_config_check < CONFIG_WATCH_INTERVAL:
+            return
+        self._last_config_check = now
+
+        from hermes_cli.config import get_config_path as _get_config_path
+        cfg_path = _get_config_path()
+        if not cfg_path.exists():
+            return
+
+        try:
+            mtime = cfg_path.stat().st_mtime
+        except OSError:
+            return
+
+        if mtime == self._config_mtime:
+            return  # File unchanged — fast path
+
+        # File changed — check whether mcp_servers section changed.
+        # The mtime is recorded before parsing, so a file that fails to
+        # parse is not re-read until it is modified again.
+        self._config_mtime = mtime
+        try:
+            with open(cfg_path, encoding="utf-8") as f:
+                new_cfg = _yaml.safe_load(f) or {}
+        except Exception:
+            return
+
+        # safe_load returns whatever the document's top level is; a list or
+        # scalar has no .get(), so treat it like an unreadable file.
+        if not isinstance(new_cfg, dict):
+            return
+
+        new_mcp = new_cfg.get("mcp_servers") or {}
+        if new_mcp == self._config_mcp_servers:
+            return  # mcp_servers unchanged (some other section was edited)
+
+        self._config_mcp_servers = new_mcp
+        # Notify user and reload
+        print()
+        print("🔄 MCP server config changed — reloading connections...")
+        with self._busy_command(self._slow_command_status("/reload-mcp")):
+            self._reload_mcp()
+
    def _reload_mcp(self):
        """Reload MCP servers: disconnect all, re-read config.yaml, reconnect.

@@ -4311,8 +4638,16 @@ class HermesCLI:
        if not self._ensure_runtime_credentials():
            return None

+        turn_route = self._resolve_turn_agent_config(message)
+        if turn_route["signature"] != self._active_agent_route_signature:
+            self.agent = None
+
        # Initialize agent if needed
-        if not self._init_agent():
+        if not self._init_agent(
+            model_override=turn_route["model"],
+            runtime_override=turn_route["runtime"],
+            route_label=turn_route["label"],
+        ):
            return None
        
        # Pre-process images through the vision tool (Gemini Flash) so the
@@ -4758,6 +5093,12 @@ class HermesCLI:
        self._interrupt_queue = queue.Queue()   # For messages typed while agent is running
        self._should_exit = False
        self._last_ctrl_c_time = 0  # Track double Ctrl+C for force exit
+        # Config file watcher — detect mcp_servers changes and auto-reload
+        from hermes_cli.config import get_config_path as _get_config_path
+        _cfg_path = _get_config_path()
+        self._config_mtime: float = _cfg_path.stat().st_mtime if _cfg_path.exists() else 0.0
+        self._config_mcp_servers: dict = self.config.get("mcp_servers") or {}
+        self._last_config_check: float = 0.0  # monotonic time of last check

        # Clarify tool state: interactive question/answer with the user.
        # When the agent calls the clarify tool, _clarify_state is set and
@@ -4806,7 +5147,7 @@ class HermesCLI:
        # Ensure tirith security scanner is available (downloads if needed)
        try:
            from tools.tirith_security import ensure_installed
-            ensure_installed()
+            ensure_installed(log_failures=False)
        except Exception:
            pass  # Non-fatal — fail-open at scan time if unavailable
        
@@ -5593,6 +5934,11 @@ class HermesCLI:
            filter=Condition(lambda: cli_ref._voice_mode),
        )

+        status_bar = Window(
+            content=FormattedTextControl(lambda: cli_ref._get_status_bar_fragments()),
+            height=1,
+        )
+
        # Layout: interactive prompt widgets + ruled input at bottom.
        # The sudo, approval, and clarify widgets appear above the input when
        # the corresponding interactive prompt is active.
@@ -5605,6 +5951,7 @@ class HermesCLI:
                clarify_widget,
                spinner_widget,
                spacer,
+                status_bar,
                input_rule_top,
                image_bar,
                input_area,
@@ -5621,6 +5968,13 @@ class HermesCLI:
            'prompt': '#FFF8DC',
            'prompt-working': '#888888 italic',
            'hint': '#555555 italic',
+            'status-bar': 'bg:#1a1a2e #C0C0C0',
+            'status-bar-strong': 'bg:#1a1a2e #FFD700 bold',
+            'status-bar-dim': 'bg:#1a1a2e #8B8682',
+            'status-bar-good': 'bg:#1a1a2e #8FBC8F bold',
+            'status-bar-warn': 'bg:#1a1a2e #FFD700 bold',
+            'status-bar-bad': 'bg:#1a1a2e #FF8C00 bold',
+            'status-bar-critical': 'bg:#1a1a2e #FF6B6B bold',
            # Bronze horizontal rules around the input area
            'input-rule': '#CD7F32',
            # Clipboard image attachment badges
@@ -5673,12 +6027,20 @@ class HermesCLI:
        def spinner_loop():
            import time as _time

+            last_idle_refresh = 0.0
            while not self._should_exit:
-                if self._command_running and self._app:
+                if not self._app:
+                    _time.sleep(0.1)
+                    continue
+                if self._command_running:
                    self._invalidate(min_interval=0.1)
                    _time.sleep(0.1)
                else:
-                    _time.sleep(0.05)
+                    now = _time.monotonic()
+                    if now - last_idle_refresh >= 1.0:
+                        last_idle_refresh = now
+                        self._invalidate(min_interval=1.0)
+                    _time.sleep(0.2)

        spinner_thread = threading.Thread(target=spinner_loop, daemon=True)
        spinner_thread.start()
@@ -5691,6 +6053,9 @@ class HermesCLI:
                    try:
                        user_input = self._pending_input.get(timeout=0.1)
                    except queue.Empty:
+                        # Periodic config watcher — auto-reload MCP on mcp_servers change
+                        if not self._agent_running:
+                            self._check_config_mcp_changes()
                        continue
                    
                    if not user_input:
@@ -6011,13 +6376,21 @@ def main(
            # Quiet mode: suppress banner, spinner, tool previews.
            # Only print the final response and parseable session info.
            cli.tool_progress_mode = "off"
-            if cli._init_agent():
-                cli.agent.quiet_mode = True
-                result = cli.agent.run_conversation(query)
-                response = result.get("final_response", "") if isinstance(result, dict) else str(result)
-                if response:
-                    print(response)
-                print(f"\nsession_id: {cli.session_id}")
+            if cli._ensure_runtime_credentials():
+                turn_route = cli._resolve_turn_agent_config(query)
+                if turn_route["signature"] != cli._active_agent_route_signature:
+                    cli.agent = None
+                if cli._init_agent(
+                    model_override=turn_route["model"],
+                    runtime_override=turn_route["runtime"],
+                    route_label=turn_route["label"],
+                ):
+                    cli.agent.quiet_mode = True
+                    result = cli.agent.run_conversation(query)
+                    response = result.get("final_response", "") if isinstance(result, dict) else str(result)
+                    if response:
+                        print(response)
+                    print(f"\nsession_id: {cli.session_id}")
        else:
            cli.show_banner()
            cli.console.print(f"[bold blue]Query:[/] {query}")
@@ -315,6 +315,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:

        # Provider routing
        pr = _cfg.get("provider_routing", {})
+        smart_routing = _cfg.get("smart_model_routing", {}) or {}

        from hermes_cli.runtime_provider import (
            resolve_runtime_provider,
@@ -331,12 +332,25 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            message = format_runtime_provider_error(exc)
            raise RuntimeError(message) from exc

+        from agent.smart_model_routing import resolve_turn_route
+        turn_route = resolve_turn_route(
+            prompt,
+            smart_routing,
+            {
+                "model": model,
+                "api_key": runtime.get("api_key"),
+                "base_url": runtime.get("base_url"),
+                "provider": runtime.get("provider"),
+                "api_mode": runtime.get("api_mode"),
+            },
+        )
+
        agent = AIAgent(
-            model=model,
-            api_key=runtime.get("api_key"),
-            base_url=runtime.get("base_url"),
-            provider=runtime.get("provider"),
-            api_mode=runtime.get("api_mode"),
+            model=turn_route["model"],
+            api_key=turn_route["runtime"].get("api_key"),
+            base_url=turn_route["runtime"].get("base_url"),
+            provider=turn_route["runtime"].get("provider"),
+            api_mode=turn_route["runtime"].get("api_mode"),
            max_iterations=max_iterations,
            reasoning_config=reasoning_config,
            prefill_messages=prefill_messages,
@@ -97,10 +97,11 @@ class SessionResetPolicy:
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SessionResetPolicy":
        # Handle both missing keys and explicit null values (YAML null → None)
+        mode = data.get("mode")
        at_hour = data.get("at_hour")
        idle_minutes = data.get("idle_minutes")
        return cls(
-            mode=data.get("mode", "both"),
+            mode=mode if mode is not None else "both",
            at_hour=at_hour if at_hour is not None else 4,
            idle_minutes=idle_minutes if idle_minutes is not None else 1440,
        )
@@ -174,7 +175,10 @@ class GatewayConfig:

    # STT settings
    stt_enabled: bool = True  # Whether to auto-transcribe inbound voice messages
-    
+
+    # Session isolation in shared chats
+    group_sessions_per_user: bool = True  # Isolate group/channel sessions per participant when user IDs are available
+
    def get_connected_platforms(self) -> List[Platform]:
        """Return list of platforms that are enabled and configured."""
        connected = []
@@ -239,6 +243,7 @@ class GatewayConfig:
            "sessions_dir": str(self.sessions_dir),
            "always_log_local": self.always_log_local,
            "stt_enabled": self.stt_enabled,
+            "group_sessions_per_user": self.group_sessions_per_user,
        }
    
    @classmethod
@@ -279,6 +284,8 @@ class GatewayConfig:
        if stt_enabled is None:
            stt_enabled = data.get("stt", {}).get("enabled") if isinstance(data.get("stt"), dict) else None

+        group_sessions_per_user = data.get("group_sessions_per_user")
+
        return cls(
            platforms=platforms,
            default_reset_policy=default_policy,
@@ -289,6 +296,7 @@ class GatewayConfig:
            sessions_dir=sessions_dir,
            always_log_local=data.get("always_log_local", True),
            stt_enabled=_coerce_bool(stt_enabled, True),
+            group_sessions_per_user=_coerce_bool(group_sessions_per_user, True),
        )


@@ -344,6 +352,14 @@ def load_gateway_config() -> GatewayConfig:
            if isinstance(stt_cfg, dict) and "enabled" in stt_cfg:
                config.stt_enabled = _coerce_bool(stt_cfg.get("enabled"), True)

+            # Bridge group session isolation from config.yaml into gateway runtime.
+            # Secure default is per-user isolation in shared chats.
+            if "group_sessions_per_user" in yaml_cfg:
+                config.group_sessions_per_user = _coerce_bool(
+                    yaml_cfg.get("group_sessions_per_user"),
+                    True,
+                )
+
            # Bridge discord settings from config.yaml to env vars
            # (env vars take precedence — only set if not already defined)
            discord_cfg = yaml_cfg.get("discord", {})
@@ -288,6 +288,7 @@ class MessageEvent:
    message_id: Optional[str] = None
    
    # Media attachments
+    # media_urls: local file paths (for vision tool access)
    media_urls: List[str] = field(default_factory=list)
    media_types: List[str] = field(default_factory=list)
    
@@ -355,6 +356,10 @@ class BasePlatformAdapter(ABC):
        # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
        self._active_sessions: Dict[str, asyncio.Event] = {}
        self._pending_messages: Dict[str, MessageEvent] = {}
+        # Background message-processing tasks spawned by handle_message().
+        # Gateway shutdown cancels these so an old gateway instance doesn't keep
+        # working on a task after --replace or manual restarts.
+        self._background_tasks: set[asyncio.Task] = set()
        # Chats where auto-TTS on voice input is disabled (set by /voice off)
        self._auto_tts_disabled_chats: set = set()

@@ -747,11 +752,32 @@ class BasePlatformAdapter(ABC):
        if not self._message_handler:
            return
        
-        session_key = build_session_key(event.source)
+        session_key = build_session_key(
+            event.source,
+            group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True),
+        )
        
        # Check if there's already an active handler for this session
        if session_key in self._active_sessions:
-            # Store this as a pending message - it will interrupt the running agent
+            # Special case: photo bursts/albums frequently arrive as multiple near-
+            # simultaneous messages. Queue them without interrupting the active run,
+            # then process them immediately after the current task finishes.
+            if event.message_type == MessageType.PHOTO:
+                print(f"[{self.name}] 🖼️ Queuing photo follow-up for session {session_key} without interrupt")
+                existing = self._pending_messages.get(session_key)
+                if existing and existing.message_type == MessageType.PHOTO:
+                    existing.media_urls.extend(event.media_urls)
+                    existing.media_types.extend(event.media_types)
+                    if event.text:
+                        if not existing.text:
+                            existing.text = event.text
+                        elif event.text not in existing.text:
+                            existing.text = f"{existing.text}\n\n{event.text}".strip()
+                else:
+                    self._pending_messages[session_key] = event
+                return  # Don't interrupt now - will run after current task completes
+
+            # Default behavior for non-photo follow-ups: interrupt the running agent
            print(f"[{self.name}] ⚡ New message while session {session_key} is active - triggering interrupt")
            self._pending_messages[session_key] = event
            # Signal the interrupt (the processing task checks this)
@@ -759,7 +785,15 @@ class BasePlatformAdapter(ABC):
            return  # Don't process now - will be handled after current task finishes
        
        # Spawn background task to process this message
-        asyncio.create_task(self._process_message_background(event, session_key))
+        task = asyncio.create_task(self._process_message_background(event, session_key))
+        try:
+            self._background_tasks.add(task)
+        except TypeError:
+            # Some tests stub create_task() with lightweight sentinels that are not
+            # hashable and do not support lifecycle callbacks.
+            return
+        if hasattr(task, "add_done_callback"):
+            task.add_done_callback(self._background_tasks.discard)
    
    @staticmethod
    def _get_human_delay() -> float:
@@ -969,6 +1003,21 @@ class BasePlatformAdapter(ABC):
            if session_key in self._active_sessions:
                del self._active_sessions[session_key]
    
+    async def cancel_background_tasks(self) -> None:
+        """Cancel background message-processing tasks that are still in flight.
+
+        Runs on gateway shutdown/replacement; also drops pending/active state.
+        """
+        live = []
+        for job in list(self._background_tasks):
+            if not job.done():
+                job.cancel()
+                live.append(job)
+        if live:
+            await asyncio.gather(*live, return_exceptions=True)
+        for registry in (self._background_tasks, self._pending_messages, self._active_sessions):
+            registry.clear()
+
    def has_pending_interrupt(self, session_key: str) -> bool:
        """Check if there's a pending interrupt for a session."""
        return session_key in self._active_sessions and self._active_sessions[session_key].is_set()
@@ -87,8 +87,9 @@ class VoiceReceiver:
    SAMPLE_RATE = 48000        # Discord native rate
    CHANNELS = 2               # Discord sends stereo

-    def __init__(self, voice_client):
+    def __init__(self, voice_client, allowed_user_ids: set = None):
        self._vc = voice_client
+        self._allowed_user_ids = allowed_user_ids or set()
        self._running = False

        # Decryption
@@ -274,19 +275,21 @@ class VoiceReceiver:
        if self._dave_session:
            with self._lock:
                user_id = self._ssrc_to_user.get(ssrc, 0)
-            if user_id == 0:
-                if self._packet_debug_count <= 10:
-                    logger.warning("DAVE skip: unknown user for ssrc=%d", ssrc)
-                return  # unknown user, can't DAVE-decrypt
-            try:
-                import davey
-                decrypted = self._dave_session.decrypt(
-                    user_id, davey.MediaType.audio, decrypted
-                )
-            except Exception as e:
-                if self._packet_debug_count <= 10:
-                    logger.warning("DAVE decrypt failed for ssrc=%d: %s", ssrc, e)
-                return
+            if user_id:
+                try:
+                    import davey
+                    decrypted = self._dave_session.decrypt(
+                        user_id, davey.MediaType.audio, decrypted
+                    )
+                except Exception as e:
+                    # Unencrypted passthrough — use NaCl-decrypted data as-is
+                    if "Unencrypted" not in str(e):
+                        if self._packet_debug_count <= 10:
+                            logger.warning("DAVE decrypt failed for ssrc=%d: %s", ssrc, e)
+                        return
+            # If SSRC unknown (no SPEAKING event yet), skip DAVE and try
+            # Opus decode directly — audio may be in passthrough mode.
+            # Buffer will get a user_id when SPEAKING event arrives later.

        # --- Opus decode -> PCM ---
        try:
@@ -304,6 +307,32 @@ class VoiceReceiver:
    # Silence detection
    # ------------------------------------------------------------------

+    def _infer_user_for_ssrc(self, ssrc: int) -> int:
+        """Try to infer user_id for an unmapped SSRC; return 0 when unknown.
+
+        When the bot rejoins a voice channel, Discord may not resend
+        SPEAKING events for users already speaking.  If exactly one
+        allowed user is in the channel, map the SSRC to them.
+        """
+        try:
+            channel = self._vc.channel
+            if not channel:
+                return 0
+            bot_id = self._vc.user.id if self._vc.user else 0
+            allowed = self._allowed_user_ids  # empty set: every non-bot member qualifies
+            candidates = [
+                m.id for m in channel.members
+                if m.id != bot_id and (not allowed or str(m.id) in allowed)
+            ]
+            if len(candidates) == 1:
+                uid = candidates[0]
+                self._ssrc_to_user[ssrc] = uid
+                logger.info("Auto-mapped ssrc=%d -> user=%d (sole allowed member)", ssrc, uid)
+                return uid
+        except Exception as e:
+            logger.debug("Could not infer user for ssrc=%d: %s", ssrc, e)
+        return 0
+
    def check_silence(self) -> list:
        """Return list of (user_id, pcm_bytes) for completed utterances."""
        now = time.monotonic()
@@ -322,6 +351,10 @@ class VoiceReceiver:

                if silence_duration >= self.SILENCE_THRESHOLD and buf_duration >= self.MIN_SPEECH_DURATION:
                    user_id = ssrc_user_map.get(ssrc, 0)
+                    if not user_id:
+                        # SSRC not mapped (SPEAKING event missing after bot rejoin).
+                        # Infer from allowed users in the voice channel.
+                        user_id = self._infer_user_for_ssrc(ssrc)
                    if user_id:
                        completed.append((user_id, bytes(buf)))
                    self._buffers[ssrc] = bytearray()
@@ -400,6 +433,9 @@ class DiscordAdapter(BasePlatformAdapter):
        self._voice_listen_tasks: Dict[int, asyncio.Task] = {}  # guild_id -> listen loop
        self._voice_input_callback: Optional[Callable] = None  # set by run.py
        self._on_voice_disconnect: Optional[Callable] = None  # set by run.py
+        # Track threads where the bot has participated so follow-up messages
+        # in those threads don't require @mention.
+        self._bot_participated_threads: set = set()
    
    async def connect(self) -> bool:
        """Connect to Discord and start receiving events."""
@@ -580,7 +616,7 @@ class DiscordAdapter(BasePlatformAdapter):
        """Send a message to a Discord channel."""
        if not self._client:
            return SendResult(success=False, error="Not connected")
-        
+
        try:
            # Get the channel
            channel = self._client.get_channel(int(chat_id))
@@ -695,13 +731,14 @@ class DiscordAdapter(BasePlatformAdapter):
    ) -> SendResult:
        """Play auto-TTS audio.

-        When the bot is in a voice channel for this chat's guild, skip the
-        file attachment — the gateway runner plays audio in the VC instead.
+        When the bot is in a voice channel for this chat's guild, play
+        directly in the VC instead of sending as a file attachment.
        """
        for gid, text_ch_id in self._voice_text_channels.items():
            if str(text_ch_id) == str(chat_id) and self.is_in_voice_channel(gid):
-                logger.debug("[%s] Skipping play_tts for %s — VC playback handled by runner", self.name, chat_id)
-                return SendResult(success=True)
+                logger.info("[%s] Playing TTS in voice channel (guild=%d)", self.name, gid)
+                success = await self.play_in_voice_channel(gid, audio_path)
+                return SendResult(success=success)
        return await self.send_voice(chat_id=chat_id, audio_path=audio_path, **kwargs)

    async def send_voice(
@@ -805,7 +842,7 @@ class DiscordAdapter(BasePlatformAdapter):

        # Start voice receiver (Phase 2: listen to users)
        try:
-            receiver = VoiceReceiver(vc)
+            receiver = VoiceReceiver(vc, allowed_user_ids=self._allowed_user_ids)
            receiver.start()
            self._voice_receivers[guild_id] = receiver
            self._voice_listen_tasks[guild_id] = asyncio.ensure_future(
@@ -1001,14 +1038,32 @@ class DiscordAdapter(BasePlatformAdapter):
    # Voice listening (Phase 2)
    # ------------------------------------------------------------------

+    # UDP keepalive interval in seconds — prevents Discord from dropping
+    # the UDP route after ~60s of silence.
+    _KEEPALIVE_INTERVAL = 15
+
    async def _voice_listen_loop(self, guild_id: int):
        """Periodically check for completed utterances and process them."""
        receiver = self._voice_receivers.get(guild_id)
        if not receiver:
            return
+        last_keepalive = time.monotonic()
        try:
            while receiver._running:
                await asyncio.sleep(0.2)
+
+                # Send periodic UDP keepalive to prevent Discord from
+                # dropping the UDP session after ~60s of silence.
+                now = time.monotonic()
+                if now - last_keepalive >= self._KEEPALIVE_INTERVAL:
+                    last_keepalive = now
+                    try:
+                        vc = self._voice_clients.get(guild_id)
+                        if vc and vc.is_connected():
+                            vc._connection.send_packet(b'\xf8\xff\xfe')
+                    except Exception:
+                        pass
+
                completed = receiver.check_silence()
                for user_id, pcm_data in completed:
                    if not self._is_allowed_user(str(user_id)):
@@ -1746,14 +1801,13 @@ class DiscordAdapter(BasePlatformAdapter):
    async def _handle_message(self, message: DiscordMessage) -> None:
        """Handle incoming Discord messages."""
        # In server channels (not DMs), require the bot to be @mentioned
-        # UNLESS the channel is in the free-response list.
+        # UNLESS the channel is in the free-response list or the message is
+        # in a thread where the bot has already participated.
        #
-        # Config:
-        #   DISCORD_FREE_RESPONSE_CHANNELS: Comma-separated channel IDs where the
-        #       bot responds to every message without needing a mention.
-        #   DISCORD_REQUIRE_MENTION: Set to "false" to disable mention requirement
-        #       globally (all channels become free-response). Default: "true".
-        #       Can also be set via discord.require_mention in config.yaml.
+        # Config (all settable via discord.* in config.yaml):
+        #   discord.require_mention: Require @mention in server channels (default: true)
+        #   discord.free_response_channels: Channel IDs where bot responds without mention
+        #   discord.auto_thread: Auto-create thread on @mention in channels (default: true)

        thread_id = None
        parent_channel_id = None
@@ -1772,7 +1826,11 @@ class DiscordAdapter(BasePlatformAdapter):
            require_mention = os.getenv("DISCORD_REQUIRE_MENTION", "true").lower() not in ("false", "0", "no")
            is_free_channel = bool(channel_ids & free_channels)

-            if require_mention and not is_free_channel:
+            # Skip the mention check if the message is in a thread where
+            # the bot has previously participated (auto-created or replied in).
+            in_bot_thread = is_thread and thread_id in self._bot_participated_threads
+
+            if require_mention and not is_free_channel and not in_bot_thread:
                if self._client.user not in message.mentions:
                    return

@@ -1781,17 +1839,18 @@ class DiscordAdapter(BasePlatformAdapter):
                message.content = message.content.replace(f"<@!{self._client.user.id}>", "").strip()

        # Auto-thread: when enabled, automatically create a thread for every
-        # new message in a text channel so each conversation is isolated.
+        # @mention in a text channel so each conversation is isolated (like Slack).
        # Messages already inside threads or DMs are unaffected.
        auto_threaded_channel = None
        if not is_thread and not isinstance(message.channel, discord.DMChannel):
-            auto_thread = os.getenv("DISCORD_AUTO_THREAD", "").lower() in ("true", "1", "yes")
+            auto_thread = os.getenv("DISCORD_AUTO_THREAD", "true").lower() in ("true", "1", "yes")
            if auto_thread:
                thread = await self._auto_create_thread(message)
                if thread:
                    is_thread = True
                    thread_id = str(thread.id)
                    auto_threaded_channel = thread
+                    self._bot_participated_threads.add(thread_id)

        # Determine message type
        msg_type = MessageType.TEXT
@@ -1891,7 +1950,12 @@ class DiscordAdapter(BasePlatformAdapter):
            reply_to_message_id=str(message.reference.message_id) if message.reference else None,
            timestamp=message.created_at,
        )
-        
+
+        # Track thread participation so the bot won't require @mention for
+        # follow-up messages in threads it has already engaged in.
+        if thread_id:
+            self._bot_participated_threads.add(thread_id)
+
        await self.handle_message(event)


@@ -111,6 +111,11 @@ class TelegramAdapter(BasePlatformAdapter):
        super().__init__(config, Platform.TELEGRAM)
        self._app: Optional[Application] = None
        self._bot: Optional[Bot] = None
+        # Buffer rapid/album photo updates so Telegram image bursts are handled
+        # as a single MessageEvent instead of self-interrupting multiple turns.
+        self._media_batch_delay_seconds = float(os.getenv("HERMES_TELEGRAM_MEDIA_BATCH_DELAY_SECONDS", "0.8"))
+        self._pending_photo_batches: Dict[str, MessageEvent] = {}
+        self._pending_photo_batch_tasks: Dict[str, asyncio.Task] = {}
        self._media_group_events: Dict[str, MessageEvent] = {}
        self._media_group_tasks: Dict[str, asyncio.Task] = {}
        self._token_lock_identity: Optional[str] = None
@@ -289,13 +294,19 @@ class TelegramAdapter(BasePlatformAdapter):
                release_scoped_lock("telegram-bot-token", self._token_lock_identity)
            except Exception as e:
                logger.warning("[%s] Error releasing Telegram token lock: %s", self.name, e, exc_info=True)
-        
+
+        for task in self._pending_photo_batch_tasks.values():
+            if task and not task.done():
+                task.cancel()
+        self._pending_photo_batch_tasks.clear()
+        self._pending_photo_batches.clear()
+
        self._mark_disconnected()
        self._app = None
        self._bot = None
        self._token_lock_identity = None
        logger.info("[%s] Disconnected from Telegram", self.name)
-    
+
    async def send(
        self,
        chat_id: str,
@@ -311,6 +322,14 @@ class TelegramAdapter(BasePlatformAdapter):
            # Format and split message if needed
            formatted = self.format_message(content)
            chunks = self.truncate_message(formatted, self.MAX_MESSAGE_LENGTH)
+            if len(chunks) > 1:
+                # truncate_message appends a raw " (1/2)" suffix. Escape the
+                # MarkdownV2-special parentheses so Telegram doesn't reject the
+                # chunk and fall back to plain text.
+                chunks = [
+                    re.sub(r" \((\d+)/(\d+)\)$", r" \\(\1/\2\\)", chunk)
+                    for chunk in chunks
+                ]
            
            message_ids = []
            thread_id = metadata.get("thread_id") if metadata else None
@@ -807,6 +826,52 @@ class TelegramAdapter(BasePlatformAdapter):
        event.text = "\n".join(parts)
        await self.handle_message(event)

+    def _photo_batch_key(self, event: MessageEvent, msg: Message) -> str:
+        """Build the key under which a Telegram photo is batched.
+
+        Album photos are keyed by media_group_id; others share one burst key.
+        """
+        from gateway.session import build_session_key
+        per_user = self.config.extra.get("group_sessions_per_user", True)
+        base = build_session_key(event.source, group_sessions_per_user=per_user)
+        album_id = getattr(msg, "media_group_id", None)
+        suffix = f"album:{album_id}" if album_id else "photo-burst"
+        return f"{base}:{suffix}"
+
+    async def _flush_photo_batch(self, batch_key: str) -> None:
+        """Wait out the batch delay, then dispatch the merged photo event once."""
+        this_task = asyncio.current_task()
+        try:
+            await asyncio.sleep(self._media_batch_delay_seconds)
+            batched = self._pending_photo_batches.pop(batch_key, None)
+            if batched:
+                logger.info("[Telegram] Flushing photo batch %s with %d image(s)", batch_key, len(batched.media_urls))
+                await self.handle_message(batched)
+        finally:
+            # Only unregister if a newer flush task has not replaced this one.
+            if self._pending_photo_batch_tasks.get(batch_key) is this_task:
+                self._pending_photo_batch_tasks.pop(batch_key, None)
+
+    def _enqueue_photo_event(self, batch_key: str, event: MessageEvent) -> None:
+        """Fold a photo event into its pending batch and (re)schedule the flush."""
+        batch = self._pending_photo_batches.get(batch_key)
+        if batch is not None:
+            batch.media_urls.extend(event.media_urls)
+            batch.media_types.extend(event.media_types)
+            caption = event.text
+            if caption and not batch.text:
+                batch.text = caption
+            elif caption and caption not in batch.text:
+                batch.text = f"{batch.text}\n\n{caption}".strip()
+        else:
+            self._pending_photo_batches[batch_key] = event
+
+        # Restart the delay: cancel a still-pending flush before scheduling a new one.
+        stale = self._pending_photo_batch_tasks.get(batch_key)
+        if stale and not stale.done():
+            stale.cancel()
+        self._pending_photo_batch_tasks[batch_key] = asyncio.create_task(self._flush_photo_batch(batch_key))
+
    async def _handle_media_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """Handle incoming media messages, downloading images to local cache."""
        if not update.message:
@@ -858,14 +923,22 @@ class TelegramAdapter(BasePlatformAdapter):
                        if file_obj.file_path.lower().endswith(candidate):
                            ext = candidate
                            break
-                # Save to cache and populate media_urls with the local path
+                # Save to local cache (for vision tool access)
                cached_path = cache_image_from_bytes(bytes(image_bytes), ext=ext)
                event.media_urls = [cached_path]
-                event.media_types = [f"image/{ext.lstrip('.')}"]
+                event.media_types = [f"image/{ext.lstrip('.')}" ]
                logger.info("[Telegram] Cached user photo at %s", cached_path)
+                media_group_id = getattr(msg, "media_group_id", None)
+                if media_group_id:
+                    await self._queue_media_group_event(str(media_group_id), event)
+                else:
+                    batch_key = self._photo_batch_key(event, msg)
+                    self._enqueue_photo_event(batch_key, event)
+                return
+
            except Exception as e:
                logger.warning("[Telegram] Failed to cache photo: %s", e, exc_info=True)
-        
+
        # Download voice/audio messages to cache for STT transcription
        if msg.voice:
            try:
@@ -29,22 +29,61 @@ from pathlib import Path
 from datetime import datetime
 from typing import Dict, Optional, Any, List

+# ---------------------------------------------------------------------------
+# SSL certificate auto-detection for NixOS and other non-standard systems.
+# Must run BEFORE any HTTP library (discord, aiohttp, etc.) is imported.
+# ---------------------------------------------------------------------------
+def _ensure_ssl_certs() -> None:
+    """Export SSL_CERT_FILE when Python cannot find CA certificates itself.
+
+    Sources are tried in order: ssl's default verify paths, certifi's
+    bundle, then well-known system locations. Nothing is changed when
+    SSL_CERT_FILE is already set or no bundle can be found.
+    """
+    if "SSL_CERT_FILE" in os.environ:
+        return  # an explicit user setting always wins
+
+    import ssl
+
+    def _first_existing(candidates):
+        """Return the first candidate path present on disk, else None."""
+        return next((p for p in candidates if p and os.path.exists(p)), None)
+
+    defaults = ssl.get_default_verify_paths()
+    bundle = _first_existing((defaults.cafile, defaults.openssl_cafile))
+    if bundle is None:
+        try:
+            import certifi
+            bundle = certifi.where()
+        except ImportError:
+            bundle = _first_existing((
+                "/etc/ssl/certs/ca-certificates.crt",               # Debian/Ubuntu/Gentoo
+                "/etc/pki/tls/certs/ca-bundle.crt",                 # RHEL/CentOS 7
+                "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", # RHEL/CentOS 8+
+                "/etc/ssl/ca-bundle.pem",                            # SUSE/OpenSUSE
+                "/etc/ssl/cert.pem",                                 # Alpine / macOS
+                "/etc/pki/tls/cert.pem",                             # Fedora
+                "/usr/local/etc/openssl@1.1/cert.pem",               # macOS Homebrew Intel
+                "/opt/homebrew/etc/openssl@1.1/cert.pem",            # macOS Homebrew ARM
+            ))
+
+    if bundle is not None:
+        os.environ["SSL_CERT_FILE"] = bundle
+
+_ensure_ssl_certs()
+
 # Add parent directory to path
 sys.path.insert(0, str(Path(__file__).parent.parent))

 # Resolve Hermes home directory (respects HERMES_HOME override)
 _hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))

-# Load environment variables from ~/.hermes/.env first
-from dotenv import load_dotenv
+# Load environment variables from ~/.hermes/.env first.
+# User-managed env files should override stale shell exports on restart.
+from dotenv import load_dotenv  # backward-compat for tests that monkeypatch this symbol
+from hermes_cli.env_loader import load_hermes_dotenv
 _env_path = _hermes_home / '.env'
-if _env_path.exists():
-    try:
-        load_dotenv(_env_path, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(_env_path, encoding="latin-1")
-# Also try project .env as fallback
-load_dotenv()
+load_hermes_dotenv(hermes_home=_hermes_home, project_env=Path(__file__).resolve().parents[1] / '.env')

 # Bridge config.yaml values into the environment so os.getenv() picks them up.
 # config.yaml is authoritative for terminal settings — overrides .env.
@@ -81,6 +120,7 @@ if _config_path.exists():
                "container_persistent": "TERMINAL_CONTAINER_PERSISTENT",
                "docker_volumes": "TERMINAL_DOCKER_VOLUMES",
                "sandbox_dir": "TERMINAL_SANDBOX_DIR",
+                "persistent_shell": "TERMINAL_PERSISTENT_SHELL",
            }
            for _cfg_key, _env_var in _terminal_env_map.items():
                if _cfg_key in _terminal_cfg:
@@ -278,6 +318,7 @@ class GatewayRunner:
        self._show_reasoning = self._load_show_reasoning()
        self._provider_routing = self._load_provider_routing()
        self._fallback_model = self._load_fallback_model()
+        self._smart_model_routing = self._load_smart_model_routing()

        # Wire process registry into session store for reset protection
        from tools.process_registry import process_registry
@@ -309,7 +350,7 @@ class GatewayRunner:
        # Ensure tirith security scanner is available (downloads if needed)
        try:
            from tools.tirith_security import ensure_installed
-            ensure_installed()
+            ensure_installed(log_failures=False)
        except Exception:
            pass  # Non-fatal — fail-open at scan time if unavailable
        
@@ -438,7 +479,11 @@ class GatewayRunner:

    # -----------------------------------------------------------------

-    def _flush_memories_for_session(self, old_session_id: str):
+    def _flush_memories_for_session(
+        self,
+        old_session_id: str,
+        honcho_session_key: Optional[str] = None,
+    ):
        """Prompt the agent to save memories/skills before context is lost.

        Synchronous worker — meant to be called via run_in_executor from
@@ -466,6 +511,7 @@ class GatewayRunner:
                quiet_mode=True,
                enabled_toolsets=["memory", "skills"],
                session_id=old_session_id,
+                honcho_session_key=honcho_session_key,
            )

            # Build conversation history from transcript
@@ -493,6 +539,7 @@ class GatewayRunner:
            tmp_agent.run_conversation(
                user_message=flush_prompt,
                conversation_history=msgs,
+                sync_honcho=False,
            )
            logger.info("Pre-reset memory flush completed for session %s", old_session_id)
            # Flush any queued Honcho writes before the session is dropped
@@ -504,10 +551,19 @@ class GatewayRunner:
        except Exception as e:
            logger.debug("Pre-reset memory flush failed for session %s: %s", old_session_id, e)

-    async def _async_flush_memories(self, old_session_id: str):
+    async def _async_flush_memories(
+        self,
+        old_session_id: str,
+        honcho_session_key: Optional[str] = None,
+    ):
        """Run the sync memory flush in a thread pool so it won't block the event loop."""
        loop = asyncio.get_event_loop()
-        await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id)
+        await loop.run_in_executor(
+            None,
+            self._flush_memories_for_session,
+            old_session_id,
+            honcho_session_key,
+        )

    @property
    def should_exit_cleanly(self) -> bool:
@@ -517,6 +573,33 @@ class GatewayRunner:
    def exit_reason(self) -> Optional[str]:
        return self._exit_reason

+    def _session_key_for_source(self, source: SessionSource) -> str:
+        """Return the session key for ``source``; prefer the session store, else build from config."""
+        store = getattr(self, "session_store", None)
+        if store is not None:
+            try:
+                candidate = store._generate_session_key(source)
+            except Exception:
+                candidate = None  # best-effort: fall through to the config-based key
+            if isinstance(candidate, str) and candidate:
+                return candidate
+        # No usable store key: build one directly, defaulting to per-user group sessions.
+        gateway_cfg = getattr(self, "config", None)
+        per_user = getattr(gateway_cfg, "group_sessions_per_user", True)
+        return build_session_key(source, group_sessions_per_user=per_user)
+
+    def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
+        """Resolve the model/runtime route for one turn via optional smart model routing."""
+        from agent.smart_model_routing import resolve_turn_route
+
+        routing_cfg = getattr(self, "_smart_model_routing", {})
+        primary = {"model": model}
+        # Forward only these connection fields; absent ones are passed as None.
+        for field in ("api_key", "base_url", "provider", "api_mode"):
+            primary[field] = runtime_kwargs.get(field)
+
+        return resolve_turn_route(user_message, routing_cfg, primary)
+
    async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
        """React to a non-retryable adapter failure after startup."""
        logger.error(
@@ -719,6 +802,20 @@ class GatewayRunner:
            pass
        return None

+    @staticmethod
+    def _load_smart_model_routing() -> dict:
+        """Read the ``smart_model_routing`` section of config.yaml, or ``{}`` if unavailable."""
+        try:
+            import yaml as _y
+            config_file = _hermes_home / "config.yaml"
+            if not config_file.exists():
+                return {}
+            with open(config_file, encoding="utf-8") as handle:
+                loaded = _y.safe_load(handle) or {}
+            return loaded.get("smart_model_routing", {}) or {}
+        except Exception:
+            return {}  # best-effort: any import/read/parse failure yields an empty config
+
    async def start(self) -> bool:
        """
        Start the gateway and all configured platform adapters.
@@ -883,7 +980,7 @@ class GatewayRunner:
                        entry.session_id, key,
                    )
                    try:
-                        await self._async_flush_memories(entry.session_id)
+                        await self._async_flush_memories(entry.session_id, key)
                        self._shutdown_gateway_honcho(key)
                        self.session_store._pre_flushed_sessions.add(entry.session_id)
                    except Exception as e:
@@ -900,8 +997,19 @@ class GatewayRunner:
        """Stop the gateway and disconnect all adapters."""
        logger.info("Stopping gateway...")
        self._running = False
-        
+
+        for session_key, agent in list(self._running_agents.items()):
+            try:
+                agent.interrupt("Gateway shutting down")
+                logger.debug("Interrupted running agent for session %s during shutdown", session_key[:20])
+            except Exception as e:
+                logger.debug("Failed interrupting agent during shutdown: %s", e)
+
        for platform, adapter in list(self.adapters.items()):
+            try:
+                await adapter.cancel_background_tasks()
+            except Exception as e:
+                logger.debug("✗ %s background-task cancel error: %s", platform.value, e)
            try:
                await adapter.disconnect()
                logger.info("✓ %s disconnected", platform.value)
@@ -909,6 +1017,9 @@ class GatewayRunner:
                logger.error("✗ %s disconnect error: %s", platform.value, e)

        self.adapters.clear()
+        self._running_agents.clear()
+        self._pending_messages.clear()
+        self._pending_approvals.clear()
        self._shutdown_all_gateway_honcho()
        self._shutdown_event.set()
        
@@ -931,6 +1042,12 @@ class GatewayRunner:
        config: Any
    ) -> Optional[BasePlatformAdapter]:
        """Create the appropriate adapter for a platform."""
+        if hasattr(config, "extra") and isinstance(config.extra, dict):
+            config.extra.setdefault(
+                "group_sessions_per_user",
+                self.config.group_sessions_per_user,
+            )
+
        if platform == Platform.TELEGRAM:
            from gateway.platforms.telegram import TelegramAdapter, check_telegram_requirements
            if not check_telegram_requirements():
@@ -1095,11 +1212,39 @@ class GatewayRunner:
                        )
            return None
        
-        # PRIORITY: If an agent is already running for this session, interrupt it
-        # immediately. This is before command parsing to minimize latency -- the
-        # user's "stop" message reaches the agent as fast as possible.
-        _quick_key = build_session_key(source)
+        # PRIORITY handling when an agent is already running for this session.
+        # Default behavior is to interrupt immediately so user text/stop messages
+        # are handled with minimal latency.
+        #
+        # Special case: Telegram/photo bursts often arrive as multiple near-
+        # simultaneous updates. Do NOT interrupt for photo-only follow-ups here;
+        # let the adapter-level batching/queueing logic absorb them.
+        _quick_key = self._session_key_for_source(source)
        if _quick_key in self._running_agents:
+            if event.get_command() == "status":
+                return await self._handle_status_command(event)
+
+            if event.message_type == MessageType.PHOTO:
+                logger.debug("PRIORITY photo follow-up for session %s — queueing without interrupt", _quick_key[:20])
+                adapter = self.adapters.get(source.platform)
+                if adapter:
+                    # Reuse adapter queue semantics so photo bursts merge cleanly.
+                    if _quick_key in adapter._pending_messages:
+                        existing = adapter._pending_messages[_quick_key]
+                        if getattr(existing, "message_type", None) == MessageType.PHOTO:
+                            existing.media_urls.extend(event.media_urls)
+                            existing.media_types.extend(event.media_types)
+                            if event.text:
+                                if not existing.text:
+                                    existing.text = event.text
+                                elif event.text not in existing.text:
+                                    existing.text = f"{existing.text}\n\n{event.text}".strip()
+                        else:
+                            adapter._pending_messages[_quick_key] = event
+                    else:
+                        adapter._pending_messages[_quick_key] = event
+                return None
+
            running_agent = self._running_agents[_quick_key]
            logger.debug("PRIORITY interrupt for session %s", _quick_key[:20])
            running_agent.interrupt(event.text)
@@ -1263,7 +1408,7 @@ class GatewayRunner:
                logger.debug("Skill command check failed (non-fatal): %s", e)
        
        # Check for pending exec approval responses
-        session_key_preview = build_session_key(source)
+        session_key_preview = self._session_key_for_source(source)
        if session_key_preview in self._pending_approvals:
            user_text = event.text.strip().lower()
            if user_text in ("yes", "y", "approve", "ok", "go", "do it"):
@@ -1787,6 +1932,8 @@ class GatewayRunner:
            # Update session with actual prompt token count and model from the agent
            self.session_store.update_session(
                session_entry.session_key,
+                input_tokens=agent_result.get("input_tokens", 0),
+                output_tokens=agent_result.get("output_tokens", 0),
                last_prompt_tokens=agent_result.get("last_prompt_tokens", 0),
                model=agent_result.get("model"),
            )
@@ -1813,14 +1960,16 @@ class GatewayRunner:
        source = event.source
        
        # Get existing session key
-        session_key = self.session_store._generate_session_key(source)
+        session_key = self._session_key_for_source(source)
        
        # Flush memories in the background (fire-and-forget) so the user
        # gets the "Session reset!" response immediately.
        try:
            old_entry = self.session_store._entries.get(session_key)
            if old_entry:
-                asyncio.create_task(self._async_flush_memories(old_entry.session_id))
+                asyncio.create_task(
+                    self._async_flush_memories(old_entry.session_id, session_key)
+                )
        except Exception as e:
            logger.debug("Gateway memory flush on reset failed: %s", e)

@@ -1984,6 +2133,12 @@ class GatewayRunner:

        # Parse provider:model syntax
        target_provider, new_model = parse_model_input(args, current_provider)
+        # Auto-detect provider when no explicit provider:model syntax was used
+        if target_provider == current_provider:
+            from hermes_cli.models import detect_provider_for_model
+            detected = detect_provider_for_model(new_model, current_provider)
+            if detected:
+                target_provider, new_model = detected
        provider_changed = target_provider != current_provider

        # Resolve credentials for the target provider (for API probe)
@@ -2396,6 +2551,13 @@ class GatewayRunner:
        except Exception as e:
            logger.warning("Failed to join voice channel: %s", e)
            adapter._voice_input_callback = None
+            err_lower = str(e).lower()
+            if "pynacl" in err_lower or "nacl" in err_lower or "davey" in err_lower:
+                return (
+                    "Voice dependencies are missing (PyNaCl / davey). "
+                    "Install or reinstall Hermes with the messaging extra, e.g. "
+                    "`pip install hermes-agent[messaging]`."
+                )
            return f"Failed to join voice channel: {e}"

        if success:
@@ -2536,18 +2698,9 @@ class GatewayRunner:
        if has_agent_tts:
            return False

-        # Dedup: base adapter auto-TTS already handles voice input.
-        # Exception: Discord voice channel — play_tts override is a no-op,
-        # so the runner must handle VC playback.
-        skip_double = is_voice_input
-        if skip_double:
-            adapter = self.adapters.get(event.source.platform)
-            guild_id = self._get_guild_id(event)
-            if (guild_id and adapter
-                    and hasattr(adapter, "is_in_voice_channel")
-                    and adapter.is_in_voice_channel(guild_id)):
-                skip_double = False
-        if skip_double:
+        # Dedup: base adapter auto-TTS already handles voice input
+        # (play_tts plays in VC when connected, so runner can skip).
+        if is_voice_input:
            return False

        return True
@@ -2768,11 +2921,12 @@ class GatewayRunner:
            max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
            reasoning_config = self._load_reasoning_config()
            self._reasoning_config = reasoning_config
+            turn_route = self._resolve_turn_agent_config(prompt, model, runtime_kwargs)

            def run_sync():
                agent = AIAgent(
-                    model=model,
-                    **runtime_kwargs,
+                    model=turn_route["model"],
+                    **turn_route["runtime"],
                    max_iterations=max_iterations,
                    quiet_mode=True,
                    verbose_logging=False,
@@ -3045,7 +3199,7 @@ class GatewayRunner:
            return "Session database not available."

        source = event.source
-        session_key = build_session_key(source)
+        session_key = self._session_key_for_source(source)
        name = event.get_command_args().strip()

        if not name:
@@ -3089,7 +3243,9 @@ class GatewayRunner:

        # Flush memories for current session before switching
        try:
-            asyncio.create_task(self._async_flush_memories(current_entry.session_id))
+            asyncio.create_task(
+                self._async_flush_memories(current_entry.session_id, session_key)
+            )
        except Exception as e:
            logger.debug("Memory flush on resume failed: %s", e)

@@ -3117,7 +3273,7 @@ class GatewayRunner:
    async def _handle_usage_command(self, event: MessageEvent) -> str:
        """Handle /usage command -- show token usage for the session's last agent run."""
        source = event.source
-        session_key = build_session_key(source)
+        session_key = self._session_key_for_source(source)

        agent = self._running_agents.get(session_key)
        if agent and hasattr(agent, "session_total_tokens") and agent.session_api_calls > 0:
@@ -3469,10 +3625,12 @@ class GatewayRunner:
        os.environ["HERMES_SESSION_CHAT_ID"] = context.source.chat_id
        if context.source.chat_name:
            os.environ["HERMES_SESSION_CHAT_NAME"] = context.source.chat_name
+        if context.source.thread_id:
+            os.environ["HERMES_SESSION_THREAD_ID"] = str(context.source.thread_id)
    
    def _clear_session_env(self) -> None:
        """Clear session environment variables."""
-        for var in ["HERMES_SESSION_PLATFORM", "HERMES_SESSION_CHAT_ID", "HERMES_SESSION_CHAT_NAME"]:
+        for var in ["HERMES_SESSION_PLATFORM", "HERMES_SESSION_CHAT_ID", "HERMES_SESSION_CHAT_NAME", "HERMES_SESSION_THREAD_ID"]:
            if var in os.environ:
                del os.environ[var]
    
@@ -3584,7 +3742,10 @@ class GatewayRunner:
                    )
                else:
                    error = result.get("error", "unknown error")
-                    if "No STT provider" in error or "not set" in error:
+                    if (
+                        "No STT provider" in error
+                        or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
+                    ):
                        enriched_parts.append(
                            "[The user sent a voice message but I can't listen "
                            "to it right now~ No STT provider is configured "
@@ -3629,6 +3790,7 @@ class GatewayRunner:
        session_key = watcher.get("session_key", "")
        platform_name = watcher.get("platform", "")
        chat_id = watcher.get("chat_id", "")
+        thread_id = watcher.get("thread_id", "")
        notify_mode = self._load_background_notifications_mode()

        logger.debug("Process watcher started: %s (every %ss, notify=%s)",
@@ -3676,7 +3838,8 @@ class GatewayRunner:
                            break
                    if adapter and chat_id:
                        try:
-                            await adapter.send(chat_id, message_text)
+                            send_meta = {"thread_id": thread_id} if thread_id else None
+                            await adapter.send(chat_id, message_text, metadata=send_meta)
                        except Exception as e:
                            logger.error("Watcher delivery error: %s", e)
                break
@@ -3695,7 +3858,8 @@ class GatewayRunner:
                        break
                if adapter and chat_id:
                    try:
-                        await adapter.send(chat_id, message_text)
+                        send_meta = {"thread_id": thread_id} if thread_id else None
+                        await adapter.send(chat_id, message_text, metadata=send_meta)
                    except Exception as e:
                        logger.error("Watcher delivery error: %s", e)

@@ -3806,45 +3970,8 @@ class GatewayRunner:
            last_tool[0] = tool_name
            
            # Build progress message with primary argument preview
-            tool_emojis = {
-                "terminal": "💻",
-                "process": "⚙️",
-                "web_search": "🔍",
-                "web_extract": "📄",
-                "read_file": "📖",
-                "write_file": "✍️",
-                "patch": "🔧",
-                "search": "🔎",
-                "search_files": "🔎",
-                "list_directory": "📂",
-                "image_generate": "🎨",
-                "text_to_speech": "🔊",
-                "browser_navigate": "🌐",
-                "browser_click": "👆",
-                "browser_type": "⌨️",
-                "browser_snapshot": "📸",
-                "browser_scroll": "📜",
-                "browser_back": "◀️",
-                "browser_press": "⌨️",
-                "browser_close": "🚪",
-                "browser_get_images": "🖼️",
-                "browser_vision": "👁️",
-                "moa_query": "🧠",
-                "mixture_of_agents": "🧠",
-                "vision_analyze": "👁️",
-                "skill_view": "📚",
-                "skills_list": "📋",
-                "todo": "📋",
-                "memory": "🧠",
-                "session_search": "🔍",
-                "send_message": "📨",
-                "cronjob": "⏰",
-                "execute_code": "🐍",
-                "delegate_task": "🔀",
-                "clarify": "❓",
-                "skill_manage": "📝",
-            }
-            emoji = tool_emojis.get(tool_name, "⚙️")
+            from agent.display import get_tool_emoji
+            emoji = get_tool_emoji(tool_name, default="⚙️")
            
            # Verbose mode: show detailed arguments
            if progress_mode == "verbose" and args:
@@ -4033,9 +4160,10 @@ class GatewayRunner:
            honcho_manager, honcho_config = self._get_or_create_gateway_honcho(session_key)
            reasoning_config = self._load_reasoning_config()
            self._reasoning_config = reasoning_config
+            turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
            agent = AIAgent(
-                model=model,
-                **runtime_kwargs,
+                model=turn_route["model"],
+                **turn_route["runtime"],
                max_iterations=max_iterations,
                quiet_mode=True,
                verbose_logging=False,
@@ -4126,11 +4254,15 @@ class GatewayRunner:
            # Return final response, or a message if something went wrong
            final_response = result.get("final_response")

-            # Extract last actual prompt token count from the agent's compressor
+            # Extract actual token counts from the agent instance used for this run
            _last_prompt_toks = 0
+            _input_toks = 0
+            _output_toks = 0
            _agent = agent_holder[0]
            if _agent and hasattr(_agent, "context_compressor"):
                _last_prompt_toks = getattr(_agent.context_compressor, "last_prompt_tokens", 0)
+                _input_toks = getattr(_agent, "session_prompt_tokens", 0)
+                _output_toks = getattr(_agent, "session_completion_tokens", 0)
            _resolved_model = getattr(_agent, "model", None) if _agent else None

            if not final_response:
@@ -4142,6 +4274,8 @@ class GatewayRunner:
                    "tools": tools_holder[0] or [],
                    "history_offset": len(agent_history),
                    "last_prompt_tokens": _last_prompt_toks,
+                    "input_tokens": _input_toks,
+                    "output_tokens": _output_toks,
                    "model": _resolved_model,
                }
            
@@ -4205,6 +4339,8 @@ class GatewayRunner:
                "tools": tools_holder[0] or [],
                "history_offset": len(agent_history),
                "last_prompt_tokens": _last_prompt_toks,
+                "input_tokens": _input_toks,
+                "output_tokens": _output_toks,
                "model": _resolved_model,
                "session_id": effective_session_id,
            }
@@ -315,31 +315,47 @@ class SessionEntry:
        )


-def build_session_key(source: SessionSource) -> str:
+def build_session_key(source: SessionSource, group_sessions_per_user: bool = True) -> str:
    """Build a deterministic session key from a message source.

    This is the single source of truth for session key construction.

    DM rules:
-      - WhatsApp DMs include chat_id (multi-user support).
-      - Other DMs include thread_id when present (e.g. Slack threaded DMs),
-        so each DM thread gets its own session while top-level DMs share one.
-      - Without thread_id or chat_id, all DMs share a single session.
+      - DMs include chat_id when present, so each private conversation is isolated.
+      - thread_id further differentiates threaded DMs within the same DM chat.
+      - Without chat_id, thread_id is used as a best-effort fallback.
+      - Without thread_id or chat_id, DMs share a single session.

    Group/channel rules:
-      - thread_id differentiates threads within a channel.
-      - Without thread_id, all messages in a channel share one session.
+      - chat_id identifies the parent group/channel.
+      - user_id/user_id_alt isolates participants within that parent chat when available and
+        ``group_sessions_per_user`` is enabled.
+      - thread_id differentiates threads within that parent chat.
+      - Without participant identifiers, or when isolation is disabled, messages fall back to one
+        shared session per chat.
+      - Without identifiers, messages fall back to one session per platform/chat_type.
    """
    platform = source.platform.value
    if source.chat_type == "dm":
+        if source.chat_id:
+            if source.thread_id:
+                return f"agent:main:{platform}:dm:{source.chat_id}:{source.thread_id}"
+            return f"agent:main:{platform}:dm:{source.chat_id}"
        if source.thread_id:
            return f"agent:main:{platform}:dm:{source.thread_id}"
-        if platform == "whatsapp" and source.chat_id:
-            return f"agent:main:{platform}:dm:{source.chat_id}"
        return f"agent:main:{platform}:dm"
+
+    participant_id = source.user_id_alt or source.user_id
+    key_parts = ["agent:main", platform, source.chat_type]
+
+    if source.chat_id:
+        key_parts.append(source.chat_id)
    if source.thread_id:
-        return f"agent:main:{platform}:{source.chat_type}:{source.chat_id}:{source.thread_id}"
-    return f"agent:main:{platform}:{source.chat_type}:{source.chat_id}"
+        key_parts.append(source.thread_id)
+    if group_sessions_per_user and participant_id:
+        key_parts.append(str(participant_id))
+
+    return ":".join(key_parts)


 class SessionStore:
@@ -418,7 +434,10 @@ class SessionStore:
    
    def _generate_session_key(self, source: SessionSource) -> str:
        """Generate a session key from a source."""
-        return build_session_key(source)
+        return build_session_key(
+            source,
+            group_sessions_per_user=getattr(self.config, "group_sessions_per_user", True),
+        )
    
    def _is_session_expired(self, entry: SessionEntry) -> bool:
        """Check if a session has expired based on its reset policy.
@@ -83,8 +83,7 @@ def _looks_like_gateway_process(pid: int) -> bool:
    """Return True when the live PID still looks like the Hermes gateway."""
    cmdline = _read_process_cmdline(pid)
    if not cmdline:
-        # If we cannot inspect the process, fall back to the liveness check.
-        return True
+        return False

    patterns = (
        "hermes_cli.main gateway",
@@ -94,6 +93,24 @@ def _looks_like_gateway_process(pid: int) -> bool:
    return any(pattern in cmdline for pattern in patterns)


+def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
+    """Return True when the PID-file record itself identifies a Hermes gateway.
+
+    Fallback identity check for when the live process cmdline is unavailable.
+    NOTE(review): get_running_pid consults this whenever the cmdline check fails,
+    including a readable-but-mismatched cmdline — confirm that is intended.
+    """
+    recorded_argv = record.get("argv")
+    if record.get("kind") != _GATEWAY_KIND:
+        return False
+    if not isinstance(recorded_argv, list) or not recorded_argv:
+        return False
+
+    joined = " ".join(map(str, recorded_argv))
+    markers = ("hermes_cli.main gateway", "hermes gateway", "gateway/run.py")
+    return any(marker in joined for marker in markers)
+
+
 def _build_pid_record() -> dict:
    return {
        "pid": os.getpid(),
@@ -325,8 +342,9 @@ def get_running_pid() -> Optional[int]:
        return None

    if not _looks_like_gateway_process(pid):
-        remove_pid_file()
-        return None
+        if not _record_looks_like_gateway(record):
+            remove_pid_file()
+            return None

    return pid

@@ -147,6 +147,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        api_key_env_vars=("MINIMAX_CN_API_KEY",),
        base_url_env_var="MINIMAX_CN_BASE_URL",
    ),
+    "deepseek": ProviderConfig(
+        id="deepseek",
+        name="DeepSeek",
+        auth_type="api_key",
+        inference_base_url="https://api.deepseek.com/v1",
+        api_key_env_vars=("DEEPSEEK_API_KEY",),
+        base_url_env_var="DEEPSEEK_BASE_URL",
+    ),
 }


@@ -118,6 +118,14 @@ DEFAULT_CONFIG = {
        # Each entry is "host_path:container_path" (standard Docker -v syntax).
        # Example: ["/home/user/projects:/workspace/projects", "/data:/data"]
        "docker_volumes": [],
+        # Explicit opt-in: mount the host cwd into /workspace for Docker sessions.
+        # Default off because passing host directories into a sandbox weakens isolation.
+        "docker_mount_cwd_to_workspace": False,
+        # Persistent shell — keep a long-lived bash shell across execute() calls
+        # so cwd/env vars/shell variables survive between commands.
+        # Enabled by default for non-local backends (SSH); local is always opt-in
+        # via TERMINAL_LOCAL_PERSISTENT env var.
+        "persistent_shell": True,
    },
    
    "browser": {
@@ -129,7 +137,7 @@ DEFAULT_CONFIG = {
    # When enabled, the agent takes a snapshot of the working directory once per
    # conversation turn (on first write_file/patch call).  Use /rollback to restore.
    "checkpoints": {
-        "enabled": False,
+        "enabled": True,
        "max_snapshots": 50,  # Max checkpoints to keep per directory
    },
    
@@ -139,6 +147,12 @@ DEFAULT_CONFIG = {
        "summary_model": "google/gemini-3-flash-preview",
        "summary_provider": "auto",
    },
+    "smart_model_routing": {
+        "enabled": False,
+        "max_simple_chars": 160,
+        "max_simple_words": 28,
+        "cheap_model": {},
+    },
    
    # Auxiliary model config — provider:model for each side task.
    # Format: provider is the provider name, model is the model slug.
@@ -280,6 +294,7 @@ DEFAULT_CONFIG = {
    "discord": {
        "require_mention": True,       # Require @mention to respond in server channels
        "free_response_channels": "",  # Comma-separated channel IDs where bot responds without mention
+        "auto_thread": True,           # Auto-create threads on @mention in channels (like Slack)
    },

    # Permanently allowed dangerous command patterns (added via "always" approval)
@@ -423,6 +438,20 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "DEEPSEEK_API_KEY": {
+        "description": "DeepSeek API key for direct DeepSeek access",
+        "prompt": "DeepSeek API Key",
+        "url": "https://platform.deepseek.com/api_keys",
+        "password": True,
+        "category": "provider",
+    },
+    "DEEPSEEK_BASE_URL": {
+        "description": "Custom DeepSeek API base URL (advanced)",
+        "prompt": "DeepSeek Base URL",
+        "url": "",
+        "password": False,
+        "category": "provider",
+    },

    # ── Tool API keys ──
    "FIRECRAWL_API_KEY": {
@@ -967,6 +996,19 @@ _FALLBACK_COMMENT = """
 # fallback_model:
 #   provider: openrouter
 #   model: anthropic/claude-sonnet-4
+#
+# ── Smart Model Routing ────────────────────────────────────────────────
+# Optional cheap-vs-strong routing for simple turns.
+# Keeps the primary model for complex work, but can route short/simple
+# messages to a cheaper model across providers.
+#
+# smart_model_routing:
+#   enabled: true
+#   max_simple_chars: 160
+#   max_simple_words: 28
+#   cheap_model:
+#     provider: openrouter
+#     model: google/gemini-2.5-flash
 """


@@ -997,6 +1039,19 @@ _COMMENTED_SECTIONS = """
 # fallback_model:
 #   provider: openrouter
 #   model: anthropic/claude-sonnet-4
+#
+# ── Smart Model Routing ────────────────────────────────────────────────
+# Optional cheap-vs-strong routing for simple turns.
+# Keeps the primary model for complex work, but can route short/simple
+# messages to a cheaper model across providers.
+#
+# smart_model_routing:
+#   enabled: true
+#   max_simple_chars: 160
+#   max_simple_words: 28
+#   cheap_model:
+#     provider: openrouter
+#     model: google/gemini-2.5-flash
 """


@@ -1387,9 +1442,11 @@ def set_config_value(key: str, value: str):
        "terminal.singularity_image": "TERMINAL_SINGULARITY_IMAGE",
        "terminal.modal_image": "TERMINAL_MODAL_IMAGE",
        "terminal.daytona_image": "TERMINAL_DAYTONA_IMAGE",
+        "terminal.docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
        "terminal.cwd": "TERMINAL_CWD",
        "terminal.timeout": "TERMINAL_TIMEOUT",
        "terminal.sandbox_dir": "TERMINAL_SANDBOX_DIR",
+        "terminal.persistent_shell": "TERMINAL_PERSISTENT_SHELL",
    }
    if key in _config_to_env_sync:
        save_env_value(_config_to_env_sync[key], str(value))
@@ -0,0 +1,46 @@
+"""Helpers for loading Hermes .env files consistently across entrypoints."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Iterable
+
+from dotenv import load_dotenv
+
+
+def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
+    try:  # first attempt: read the env file as UTF-8
+        load_dotenv(dotenv_path=path, override=override, encoding="utf-8")
+    except UnicodeDecodeError:  # not valid UTF-8: retry the same file as latin-1
+        load_dotenv(dotenv_path=path, override=override, encoding="latin-1")
+
+
+def load_hermes_dotenv(
+    *,
+    hermes_home: str | os.PathLike | None = None,
+    project_env: str | os.PathLike | None = None,
+) -> list[Path]:
+    """Load the user and project ``.env`` files; the user file wins.
+
+    The user file is ``<hermes_home>/.env``, where the home directory comes
+    from ``hermes_home``, then ``$HERMES_HOME``, then ``~/.hermes``. When it
+    exists it is loaded with ``override=True``. The project file, if given
+    and present, is loaded afterwards and only overrides existing variables
+    when no user file was loaded. Returns the paths loaded, in load order.
+    """
+    home_dir = Path(hermes_home or os.getenv("HERMES_HOME", Path.home() / ".hermes"))
+    candidates = [home_dir / ".env"]
+    if project_env:
+        candidates.append(Path(project_env))
+
+    loaded_paths: list[Path] = []
+    for index, env_file in enumerate(candidates):
+        if not env_file.exists():
+            continue
+        # The user file (index 0) always overrides; the project file only
+        # overrides when nothing was loaded before it.
+        force = index == 0 or not loaded_paths
+        _load_dotenv_with_fallback(env_file, override=force)
+        loaded_paths.append(env_file)
+    return loaded_paths
@@ -119,14 +119,35 @@ def is_windows() -> bool:
 # Service Configuration
 # =============================================================================

-SERVICE_NAME = "hermes-gateway"
+_SERVICE_BASE = "hermes-gateway"
 SERVICE_DESCRIPTION = "Hermes Agent Gateway - Messaging Platform Integration"


+def get_service_name() -> str:
+    """Return the systemd service name for the active HERMES_HOME.
+
+    When ``HERMES_HOME`` is unset, or resolves to ``~/.hermes``, the plain
+    base name is returned (backward compatible). For any other location the
+    first 8 hex characters of the SHA-256 of the resolved path are appended,
+    so separate installations get distinct service names.
+    """
+    import hashlib
+    from pathlib import Path as _Path  # local import to avoid monkeypatch interference
+
+    stock_home = (_Path.home() / ".hermes").resolve()
+    configured = os.getenv("HERMES_HOME")
+    active_home = stock_home if configured is None else _Path(configured).resolve()
+
+    if active_home != stock_home:
+        digest = hashlib.sha256(str(active_home).encode()).hexdigest()
+        return f"{_SERVICE_BASE}-{digest[:8]}"
+    return _SERVICE_BASE
+
+
+SERVICE_NAME = _SERVICE_BASE  # backward-compat for external importers; prefer get_service_name()
+
+
 def get_systemd_unit_path(system: bool = False) -> Path:
+    name = get_service_name()
    if system:
-        return Path("/etc/systemd/system") / f"{SERVICE_NAME}.service"
-    return Path.home() / ".config" / "systemd" / "user" / f"{SERVICE_NAME}.service"
+        return Path("/etc/systemd/system") / f"{name}.service"
+    return Path.home() / ".config" / "systemd" / "user" / f"{name}.service"


 def _systemctl_cmd(system: bool = False) -> list[str]:
@@ -362,6 +383,8 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
    sane_path = f"{venv_bin}:{node_bin}:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
    hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main"

+    hermes_home = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")).resolve())
+
    if system:
        username, group_name, home_dir = _system_service_identity(run_as_user)
        return f"""[Unit]
@@ -380,6 +403,7 @@ Environment="USER={username}"
 Environment="LOGNAME={username}"
 Environment="PATH={sane_path}"
 Environment="VIRTUAL_ENV={venv_dir}"
+Environment="HERMES_HOME={hermes_home}"
 Restart=on-failure
 RestartSec=10
 KillMode=mixed
@@ -403,6 +427,7 @@ ExecStop={hermes_cli} gateway stop
 WorkingDirectory={working_dir}
 Environment="PATH={sane_path}"
 Environment="VIRTUAL_ENV={venv_dir}"
+Environment="HERMES_HOME={hermes_home}"
 Restart=on-failure
 RestartSec=10
 KillMode=mixed
@@ -455,7 +480,7 @@ def _print_linger_enable_warning(username: str, detail: str | None = None) -> No
    print(f"    sudo loginctl enable-linger {username}")
    print()
    print("  Then restart the gateway:")
-    print(f"    systemctl --user restart {SERVICE_NAME}.service")
+    print(f"    systemctl --user restart {get_service_name()}.service")
    print()


@@ -526,7 +551,7 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
    unit_path.write_text(generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8")

    subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True)
-    subprocess.run(_systemctl_cmd(system) + ["enable", SERVICE_NAME], check=True)
+    subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True)

    print()
    print(f"✓ {_service_scope_label(system).capitalize()} service installed and enabled!")
@@ -534,7 +559,7 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
    print("Next steps:")
    print(f"  {'sudo ' if system else ''}hermes gateway start{scope_flag}              # Start the service")
    print(f"  {'sudo ' if system else ''}hermes gateway status{scope_flag}             # Check status")
-    print(f"  {'journalctl' if system else 'journalctl --user'} -u {SERVICE_NAME} -f  # View logs")
+    print(f"  {'journalctl' if system else 'journalctl --user'} -u {get_service_name()} -f  # View logs")
    print()

    if system:
@@ -552,8 +577,8 @@ def systemd_uninstall(system: bool = False):
    if system:
        _require_root_for_system_service("uninstall")

-    subprocess.run(_systemctl_cmd(system) + ["stop", SERVICE_NAME], check=False)
-    subprocess.run(_systemctl_cmd(system) + ["disable", SERVICE_NAME], check=False)
+    subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=False)
+    subprocess.run(_systemctl_cmd(system) + ["disable", get_service_name()], check=False)

    unit_path = get_systemd_unit_path(system=system)
    if unit_path.exists():
@@ -569,7 +594,7 @@ def systemd_start(system: bool = False):
    if system:
        _require_root_for_system_service("start")
    refresh_systemd_unit_if_needed(system=system)
-    subprocess.run(_systemctl_cmd(system) + ["start", SERVICE_NAME], check=True)
+    subprocess.run(_systemctl_cmd(system) + ["start", get_service_name()], check=True)
    print(f"✓ {_service_scope_label(system).capitalize()} service started")


@@ -578,7 +603,7 @@ def systemd_stop(system: bool = False):
    system = _select_systemd_scope(system)
    if system:
        _require_root_for_system_service("stop")
-    subprocess.run(_systemctl_cmd(system) + ["stop", SERVICE_NAME], check=True)
+    subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=True)
    print(f"✓ {_service_scope_label(system).capitalize()} service stopped")


@@ -588,7 +613,7 @@ def systemd_restart(system: bool = False):
    if system:
        _require_root_for_system_service("restart")
    refresh_systemd_unit_if_needed(system=system)
-    subprocess.run(_systemctl_cmd(system) + ["restart", SERVICE_NAME], check=True)
+    subprocess.run(_systemctl_cmd(system) + ["restart", get_service_name()], check=True)
    print(f"✓ {_service_scope_label(system).capitalize()} service restarted")


@@ -613,12 +638,12 @@ def systemd_status(deep: bool = False, system: bool = False):
        print()

    subprocess.run(
-        _systemctl_cmd(system) + ["status", SERVICE_NAME, "--no-pager"],
+        _systemctl_cmd(system) + ["status", get_service_name(), "--no-pager"],
        capture_output=False,
    )

    result = subprocess.run(
-        _systemctl_cmd(system) + ["is-active", SERVICE_NAME],
+        _systemctl_cmd(system) + ["is-active", get_service_name()],
        capture_output=True,
        text=True,
    )
@@ -657,7 +682,7 @@ def systemd_status(deep: bool = False, system: bool = False):
    if deep:
        print()
        print("Recent logs:")
-        subprocess.run(_journalctl_cmd(system) + ["-u", SERVICE_NAME, "-n", "20", "--no-pager"])
+        subprocess.run(_journalctl_cmd(system) + ["-u", get_service_name(), "-n", "20", "--no-pager"])


 # =============================================================================
@@ -1118,7 +1143,7 @@ def _is_service_running() -> bool:

        if user_unit_exists:
            result = subprocess.run(
-                _systemctl_cmd(False) + ["is-active", SERVICE_NAME],
+                _systemctl_cmd(False) + ["is-active", get_service_name()],
                capture_output=True, text=True
            )
            if result.stdout.strip() == "active":
@@ -1126,7 +1151,7 @@ def _is_service_running() -> bool:

        if system_unit_exists:
            result = subprocess.run(
-                _systemctl_cmd(True) + ["is-active", SERVICE_NAME],
+                _systemctl_cmd(True) + ["is-active", get_service_name()],
                capture_output=True, text=True
            )
            if result.stdout.strip() == "active":
@@ -54,16 +54,11 @@ from typing import Optional
 PROJECT_ROOT = Path(__file__).parent.parent.resolve()
 sys.path.insert(0, str(PROJECT_ROOT))

-# Load .env from ~/.hermes/.env first, then project root as dev fallback
-from dotenv import load_dotenv
-from hermes_cli.config import get_env_path, get_hermes_home
-_user_env = get_env_path()
-if _user_env.exists():
-    try:
-        load_dotenv(dotenv_path=_user_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_user_env, encoding="latin-1")
-load_dotenv(dotenv_path=PROJECT_ROOT / '.env', override=False)
+# Load .env from ~/.hermes/.env first, then project root as dev fallback.
+# User-managed env files should override stale shell exports on restart.
+from hermes_cli.config import get_hermes_home
+from hermes_cli.env_loader import load_hermes_dotenv
+load_hermes_dotenv(project_env=PROJECT_ROOT / '.env')

 # Point mini-swe-agent at ~/.hermes/ so it shares our config
 os.environ.setdefault("MSWEA_GLOBAL_CONFIG_DIR", str(get_hermes_home()))
@@ -1117,8 +1112,32 @@ def _model_flow_custom(config):

    effective_key = api_key or current_key

+    from hermes_cli.models import probe_api_models
+
+    probe = probe_api_models(effective_key, effective_url)
+    if probe.get("used_fallback") and probe.get("resolved_base_url"):
+        print(
+            f"Warning: endpoint verification worked at {probe['resolved_base_url']}/models, "
+            f"not the exact URL you entered. Saving the working base URL instead."
+        )
+        effective_url = probe["resolved_base_url"]
+        if base_url:
+            base_url = effective_url
+    elif probe.get("models") is not None:
+        print(
+            f"Verified endpoint via {probe.get('probed_url')} "
+            f"({len(probe.get('models') or [])} model(s) visible)"
+        )
+    else:
+        print(
+            f"Warning: could not verify this endpoint via {probe.get('probed_url')}. "
+            f"Hermes will still save it."
+        )
+        if probe.get("suggested_base_url"):
+            print(f"  If this server expects /v1, try base URL: {probe['suggested_base_url']}")
+
    if base_url:
-        save_env_value("OPENAI_BASE_URL", base_url)
+        save_env_value("OPENAI_BASE_URL", effective_url)
    if api_key:
        save_env_value("OPENAI_API_KEY", api_key)

@@ -2032,6 +2051,16 @@ def _resolve_stash_selector(git_cmd: list[str], cwd: Path, stash_ref: str) -> Op



+def _print_stash_cleanup_guidance(stash_ref: str, stash_selector: Optional[str] = None) -> None:
+    """Print manual cleanup steps for a stash entry that was left in place.
+
+    With ``stash_selector`` the exact ``git stash drop`` command is shown;
+    without it the user is told to look up the selector for ``stash_ref``.
+    """
+    if stash_selector:
+        drop_hint = f"  Remove it with: git stash drop {stash_selector}"
+    else:
+        drop_hint = f"  Look for commit {stash_ref}, then drop its selector with: git stash drop stash@{{N}}"
+
+    guidance = (
+        "  Check `git status` first so you don't accidentally reapply the same change twice.",
+        "  Find the saved entry with: git stash list --format='%gd %H %s'",
+        drop_hint,
+    )
+    for line in guidance:
+        print(line)
+
+
+
 def _restore_stashed_changes(
    git_cmd: list[str],
    cwd: Path,
@@ -2072,7 +2101,7 @@ def _restore_stashed_changes(
    if stash_selector is None:
        print("⚠ Local changes were restored, but Hermes couldn't find the stash entry to drop.")
        print("  The stash was left in place. You can remove it manually after checking the result.")
-        print(f"  Look for commit {stash_ref} in `git stash list --format='%gd %H'` and drop that selector.")
+        _print_stash_cleanup_guidance(stash_ref)
    else:
        drop = subprocess.run(
            git_cmd + ["stash", "drop", stash_selector],
@@ -2087,7 +2116,7 @@ def _restore_stashed_changes(
            if drop.stderr.strip():
                print(drop.stderr.strip())
            print("  The stash was left in place. You can remove it manually after checking the result.")
-            print(f"  If needed: git stash drop {stash_selector}")
+            _print_stash_cleanup_guidance(stash_ref, stash_selector)

    print("⚠ Local changes were restored on top of the updated codebase.")
    print("  Review `git diff` / `git status` if Hermes behaves unexpectedly.")
@@ -2272,26 +2301,60 @@ def cmd_update(args):
        print()
        print("✓ Update complete!")
        
-        # Auto-restart gateway if it's running as a systemd service
+        # Auto-restart gateway if it's running.
+        # Uses the PID file (scoped to HERMES_HOME) to find this
+        # installation's gateway — safe with multiple installations.
        try:
-            check = subprocess.run(
-                ["systemctl", "--user", "is-active", "hermes-gateway"],
-                capture_output=True, text=True, timeout=5,
-            )
-            if check.stdout.strip() == "active":
-                print()
-                print("→ Gateway service is running — restarting to pick up changes...")
-                restart = subprocess.run(
-                    ["systemctl", "--user", "restart", "hermes-gateway"],
-                    capture_output=True, text=True, timeout=15,
+            from gateway.status import get_running_pid, remove_pid_file
+            from hermes_cli.gateway import get_service_name
+            import signal as _signal
+
+            _gw_service_name = get_service_name()
+            existing_pid = get_running_pid()
+            has_systemd_service = False
+
+            try:
+                check = subprocess.run(
+                    ["systemctl", "--user", "is-active", _gw_service_name],
+                    capture_output=True, text=True, timeout=5,
                )
-                if restart.returncode == 0:
-                    print("✓ Gateway restarted.")
-                else:
-                    print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
-                    print("  Try manually: hermes gateway restart")
-        except (FileNotFoundError, subprocess.TimeoutExpired):
-            pass  # No systemd (macOS, WSL1, etc.) — skip silently
+                has_systemd_service = check.stdout.strip() == "active"
+            except (FileNotFoundError, subprocess.TimeoutExpired):
+                pass
+
+            if existing_pid or has_systemd_service:
+                print()
+
+                # Kill the PID-file-tracked process (may be manual or systemd)
+                if existing_pid:
+                    try:
+                        os.kill(existing_pid, _signal.SIGTERM)
+                        print(f"→ Stopped gateway process (PID {existing_pid})")
+                    except ProcessLookupError:
+                        pass  # Already gone
+                    except PermissionError:
+                        print(f"⚠ Permission denied killing gateway PID {existing_pid}")
+                    remove_pid_file()
+
+                # Restart the systemd service (starts a fresh process)
+                if has_systemd_service:
+                    import time as _time
+                    _time.sleep(1)  # Brief pause for port/socket release
+                    print("→ Restarting gateway service...")
+                    restart = subprocess.run(
+                        ["systemctl", "--user", "restart", _gw_service_name],
+                        capture_output=True, text=True, timeout=15,
+                    )
+                    if restart.returncode == 0:
+                        print("✓ Gateway restarted.")
+                    else:
+                        print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
+                        print("  Try manually: hermes gateway restart")
+                elif existing_pid:
+                    print("  ℹ️  Gateway was running manually (not as a service).")
+                    print("  Restart it with: hermes gateway run")
+        except Exception as e:
+            logger.debug("Gateway restart during update failed: %s", e)
        
        print()
        print("Tip: You can now select a provider and model:")
@@ -3093,7 +3156,11 @@ For more help on a command:

        elif action == "export":
            if args.session_id:
-                data = db.export_session(args.session_id)
+                resolved_session_id = db.resolve_session_id(args.session_id)
+                if not resolved_session_id:
+                    print(f"Session '{args.session_id}' not found.")
+                    return
+                data = db.export_session(resolved_session_id)
                if not data:
                    print(f"Session '{args.session_id}' not found.")
                    return
@@ -3108,13 +3175,17 @@ For more help on a command:
                print(f"Exported {len(sessions)} sessions to {args.output}")

        elif action == "delete":
+            resolved_session_id = db.resolve_session_id(args.session_id)
+            if not resolved_session_id:
+                print(f"Session '{args.session_id}' not found.")
+                return
            if not args.yes:
-                confirm = input(f"Delete session '{args.session_id}' and all its messages? [y/N] ")
+                confirm = input(f"Delete session '{resolved_session_id}' and all its messages? [y/N] ")
                if confirm.lower() not in ("y", "yes"):
                    print("Cancelled.")
                    return
-            if db.delete_session(args.session_id):
-                print(f"Deleted session '{args.session_id}'.")
+            if db.delete_session(resolved_session_id):
+                print(f"Deleted session '{resolved_session_id}'.")
            else:
                print(f"Session '{args.session_id}' not found.")

@@ -3130,10 +3201,14 @@ For more help on a command:
            print(f"Pruned {count} session(s).")

        elif action == "rename":
+            resolved_session_id = db.resolve_session_id(args.session_id)
+            if not resolved_session_id:
+                print(f"Session '{args.session_id}' not found.")
+                return
            title = " ".join(args.title)
            try:
-                if db.set_session_title(args.session_id, title):
-                    print(f"Session '{args.session_id}' renamed to: {title}")
+                if db.set_session_title(resolved_session_id, title):
+                    print(f"Session '{resolved_session_id}' renamed to: {title}")
                else:
                    print(f"Session '{args.session_id}' not found.")
            except ValueError as e:
@@ -78,6 +78,10 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
    ],
+    "deepseek": [
+        "deepseek-chat",
+        "deepseek-reasoner",
+    ],
 }

 _PROVIDER_LABELS = {
@@ -89,6 +93,7 @@ _PROVIDER_LABELS = {
    "minimax": "MiniMax",
    "minimax-cn": "MiniMax (China)",
    "anthropic": "Anthropic",
+    "deepseek": "DeepSeek",
    "custom": "Custom endpoint",
 }

@@ -103,6 +108,7 @@ _PROVIDER_ALIASES = {
    "minimax_cn": "minimax-cn",
    "claude": "anthropic",
    "claude-code": "anthropic",
+    "deep-seek": "deepseek",
 }


@@ -136,7 +142,7 @@ def list_available_providers() -> list[dict[str, str]]:
    # Canonical providers in display order
    _PROVIDER_ORDER = [
        "openrouter", "nous", "openai-codex",
-        "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic",
+        "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
    ]
    # Build reverse alias map
    aliases_for: dict[str, list[str]] = {}
@@ -212,6 +218,111 @@ def curated_models_for_provider(provider: Optional[str]) -> list[tuple[str, str]
    return [(m, "") for m in models]


+def detect_provider_for_model(
+    model_name: str,
+    current_provider: str,
+) -> Optional[tuple[str, str]]:
+    """Pick the provider that should serve ``model_name``, if a switch is warranted.
+
+    Returns ``(provider_id, model_name)``; the model name may come back
+    remapped (e.g. bare ``deepseek-chat`` → ``deepseek/deepseek-chat`` for
+    OpenRouter). Returns ``None`` for a blank name, for a model already in the
+    current provider's static catalog, or when nothing matches.
+
+    Resolution order:
+    1. A non-aggregator provider whose static catalog lists the model and for
+       which an API-key environment variable is set.
+    2. The same kind of match without credentials: the OpenRouter slug when
+       one exists, otherwise the direct provider anyway.
+    3. A match in the OpenRouter catalog.
+    """
+    requested = (model_name or "").strip()
+    if not requested:
+        return None
+    wanted = requested.lower()
+
+    def _catalog_has(models) -> bool:
+        return any(wanted == entry.lower() for entry in models)
+
+    # Already served by the current provider: nothing to switch.
+    if _catalog_has(_PROVIDER_MODELS.get(current_provider, [])):
+        return None
+
+    # Aggregators list other providers' models — never auto-switch TO them
+    aggregators = {"nous", "openrouter"}
+
+    # --- Step 1: first other provider whose static catalog lists the model ---
+    owner = next(
+        (
+            pid
+            for pid, models in _PROVIDER_MODELS.items()
+            if pid != current_provider and pid not in aggregators and _catalog_has(models)
+        ),
+        None,
+    )
+
+    if owner:
+        # Best-effort credential check; any failure counts as "no credentials".
+        try:
+            from hermes_cli.auth import PROVIDER_REGISTRY
+            import os
+
+            entry = PROVIDER_REGISTRY.get(owner)
+            has_key = bool(entry) and any(
+                os.getenv(var, "").strip() for var in entry.api_key_env_vars
+            )
+        except Exception:
+            has_key = False
+
+        if has_key:
+            return (owner, requested)
+
+        # No direct creds — prefer the same model on OpenRouter. Failing that,
+        # still return the direct provider so credential resolution reports a
+        # clear error instead of silently using the wrong provider.
+        slug = _find_openrouter_slug(requested)
+        return ("openrouter", slug) if slug else (owner, requested)
+
+    # --- Step 2: OpenRouter catalog ---
+    slug = _find_openrouter_slug(requested)
+    # On OpenRouter with the name already in slug form there is nothing to change.
+    if slug and (current_provider != "openrouter" or slug != requested):
+        return ("openrouter", slug)
+    return None
+
+
+def _find_openrouter_slug(model_name: str) -> Optional[str]:
+    """Resolve a full or bare model name to its OpenRouter slug.
+
+    Matching is case-insensitive and tried in this order:
+    - whole slug: ``anthropic/claude-opus-4.6`` → returned as listed
+    - part after the first ``/``: ``deepseek-chat`` → ``deepseek/deepseek-chat``
+
+    Returns ``None`` for a blank name or when no catalog entry matches.
+    """
+    wanted = model_name.strip().lower()
+    if not wanted:
+        return None
+
+    # Pass 1: the name is already a complete slug.
+    exact = next(
+        (slug for slug, _ in OPENROUTER_MODELS if slug.lower() == wanted),
+        None,
+    )
+    if exact is not None:
+        return exact
+
+    # Pass 2: compare against the model part that follows the provider prefix.
+    return next(
+        (
+            slug
+            for slug, _ in OPENROUTER_MODELS
+            if "/" in slug and slug.split("/", 1)[1].lower() == wanted
+        ),
+        None,
+    )
+
+
 def normalize_provider(provider: Optional[str]) -> str:
    """Normalize provider aliases to Hermes' canonical provider ids.

@@ -308,6 +419,62 @@ def _fetch_anthropic_models(timeout: float = 5.0) -> Optional[list[str]]:
        return None


+def probe_api_models(
+    api_key: Optional[str],
+    base_url: Optional[str],
+    timeout: float = 5.0,
+) -> dict[str, Any]:
+    """Probe an OpenAI-compatible ``/models`` endpoint with light URL heuristics.
+
+    The base URL is tried as entered (surrounding whitespace and trailing
+    slashes removed) and then once more with a trailing ``/v1`` added or
+    removed. The first candidate whose ``/models`` response parses as the
+    standard ``{"data": [{"id": ...}, ...]}`` payload wins.
+
+    Args:
+        api_key: Sent as a ``Bearer`` token when truthy.
+        base_url: Endpoint base URL; empty or ``None`` skips probing.
+        timeout: Per-request timeout in seconds.
+
+    Returns:
+        A dict with keys ``models`` (list of model ids, or ``None`` when no
+        candidate answered), ``probed_url`` (the ``/models`` URL that
+        answered, otherwise the last one tried), ``resolved_base_url``,
+        ``suggested_base_url`` and ``used_fallback`` (``True`` only when the
+        alternate candidate answered).
+
+    Probing is best-effort: any exception for a candidate, including a
+    malformed URL, moves on to the next candidate instead of propagating.
+    """
+    normalized = (base_url or "").strip().rstrip("/")
+    if not normalized:
+        return {
+            "models": None,
+            "probed_url": None,
+            "resolved_base_url": "",
+            "suggested_base_url": None,
+            "used_fallback": False,
+        }
+
+    # The alternate is the same URL with the "/v1" suffix toggled.
+    if normalized.endswith("/v1"):
+        alternate_base = normalized[:-3].rstrip("/")
+    else:
+        alternate_base = normalized + "/v1"
+
+    candidates: list[tuple[str, bool]] = [(normalized, False)]
+    if alternate_base and alternate_base != normalized:
+        candidates.append((alternate_base, True))
+
+    tried: list[str] = []
+    headers: dict[str, str] = {}
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    for candidate_base, is_fallback in candidates:
+        url = candidate_base.rstrip("/") + "/models"
+        tried.append(url)
+        try:
+            # Build the request inside the try: Request() itself raises
+            # ValueError for a URL without a scheme (e.g. "host/v1"), and that
+            # must count as a failed probe rather than crash the caller.
+            req = urllib.request.Request(url, headers=headers)
+            with urllib.request.urlopen(req, timeout=timeout) as resp:
+                data = json.loads(resp.read().decode())
+                return {
+                    "models": [m.get("id", "") for m in data.get("data", [])],
+                    "probed_url": url,
+                    "resolved_base_url": candidate_base.rstrip("/"),
+                    "suggested_base_url": alternate_base if alternate_base != candidate_base else normalized,
+                    "used_fallback": is_fallback,
+                }
+        except Exception:
+            continue
+
+    return {
+        "models": None,
+        "probed_url": tried[-1] if tried else normalized.rstrip("/") + "/models",
+        "resolved_base_url": normalized,
+        "suggested_base_url": alternate_base if alternate_base != normalized else None,
+        "used_fallback": False,
+    }
+
+
 def fetch_api_models(
    api_key: Optional[str],
    base_url: Optional[str],
@@ -318,22 +485,7 @@ def fetch_api_models(
    Returns a list of model ID strings, or ``None`` if the endpoint could not
    be reached (network error, timeout, auth failure, etc.).
    """
-    if not base_url:
-        return None
-
-    url = base_url.rstrip("/") + "/models"
-    headers: dict[str, str] = {}
-    if api_key:
-        headers["Authorization"] = f"Bearer {api_key}"
-
-    req = urllib.request.Request(url, headers=headers)
-    try:
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            data = json.loads(resp.read().decode())
-            # Standard OpenAI format: {"data": [{"id": "model-name", ...}, ...]}
-            return [m.get("id", "") for m in data.get("data", [])]
-    except Exception:
-        return None
+    return probe_api_models(api_key, base_url, timeout=timeout).get("models")


 def validate_requested_model(
@@ -376,13 +528,53 @@ def validate_requested_model(
            "message": "Model names cannot contain spaces.",
        }

-    # Custom endpoints can serve any model — skip validation
    if normalized == "custom":
+        probe = probe_api_models(api_key, base_url)
+        api_models = probe.get("models")
+        if api_models is not None:
+            if requested in set(api_models):
+                return {
+                    "accepted": True,
+                    "persist": True,
+                    "recognized": True,
+                    "message": None,
+                }
+
+            suggestions = get_close_matches(requested, api_models, n=3, cutoff=0.5)
+            suggestion_text = ""
+            if suggestions:
+                suggestion_text = "\n  Similar models: " + ", ".join(f"`{s}`" for s in suggestions)
+
+            message = (
+                f"Note: `{requested}` was not found in this custom endpoint's model listing "
+                f"({probe.get('probed_url')}). It may still work if the server supports hidden or aliased models."
+                f"{suggestion_text}"
+            )
+            if probe.get("used_fallback"):
+                message += (
+                    f"\n  Endpoint verification succeeded after trying `{probe.get('resolved_base_url')}`. "
+                    f"Consider saving that as your base URL."
+                )
+
+            return {
+                "accepted": True,
+                "persist": True,
+                "recognized": False,
+                "message": message,
+            }
+
+        message = (
+            f"Note: could not reach this custom endpoint's model listing at `{probe.get('probed_url')}`. "
+            f"Hermes will still save `{requested}`, but the endpoint should expose `/models` for verification."
+        )
+        if probe.get("suggested_base_url"):
+            message += f"\n  If this server expects `/v1`, try base URL: `{probe.get('suggested_base_url')}`"
+
        return {
            "accepted": True,
            "persist": True,
            "recognized": False,
-            "message": None,
+            "message": message,
        }

    # Probe the live API to check if the model actually exists
@@ -227,54 +227,86 @@ def prompt(question: str, default: str = None, password: bool = False) -> str:
        sys.exit(1)


+def _curses_prompt_choice(question: str, choices: list, default: int = 0) -> int:
+    """Single-select menu using curses to avoid simple_term_menu rendering bugs.
+
+    Keys: Up/``k`` and Down/``j`` move the highlight (wrapping around), Enter
+    confirms, Esc/``q`` leaves the menu and yields ``default``. When there are
+    more choices than terminal rows, the visible window scrolls so the
+    highlighted entry is always on screen.
+
+    Returns the selected index, or ``-1`` when the menu could not be shown
+    (any exception raised while running curses), so the caller can fall back
+    to a plain prompt.
+    """
+    try:
+        import curses
+        result_holder = [default]
+
+        def _curses_menu(stdscr):
+            try:
+                curses.curs_set(0)
+            except curses.error:
+                # Terminal cannot hide the cursor; the menu is still usable.
+                pass
+            if curses.has_colors():
+                curses.start_color()
+                curses.use_default_colors()
+                curses.init_pair(1, curses.COLOR_GREEN, -1)
+                curses.init_pair(2, curses.COLOR_YELLOW, -1)
+            cursor = default
+            top = 0  # index of the first choice currently drawn
+
+            while True:
+                stdscr.clear()
+                max_y, max_x = stdscr.getmaxyx()
+                try:
+                    stdscr.addnstr(
+                        0,
+                        0,
+                        question,
+                        max_x - 1,
+                        curses.A_BOLD | (curses.color_pair(2) if curses.has_colors() else 0),
+                    )
+                except curses.error:
+                    pass
+
+                # Choices occupy rows 2 .. max_y - 2. Slide the window so the
+                # highlighted entry never falls outside the drawn range.
+                visible = max(1, max_y - 3)
+                if cursor < top:
+                    top = cursor
+                elif cursor >= top + visible:
+                    top = cursor - visible + 1
+                top = max(top, 0)
+
+                for row, i in enumerate(range(top, min(len(choices), top + visible))):
+                    arrow = "→" if i == cursor else " "
+                    line = f" {arrow}  {choices[i]}"
+                    attr = curses.A_NORMAL
+                    if i == cursor:
+                        attr = curses.A_BOLD
+                        if curses.has_colors():
+                            attr |= curses.color_pair(1)
+                    try:
+                        stdscr.addnstr(row + 2, 0, line, max_x - 1, attr)
+                    except curses.error:
+                        pass
+
+                stdscr.refresh()
+                key = stdscr.getch()
+                if key in (curses.KEY_UP, ord("k")):
+                    cursor = (cursor - 1) % len(choices)
+                elif key in (curses.KEY_DOWN, ord("j")):
+                    cursor = (cursor + 1) % len(choices)
+                elif key in (curses.KEY_ENTER, 10, 13):
+                    result_holder[0] = cursor
+                    return
+                elif key in (27, ord("q")):
+                    # Leave without confirming; result_holder still holds default.
+                    return
+
+        curses.wrapper(_curses_menu)
+        return result_holder[0]
+    except Exception:
+        return -1
+
+
+
 def prompt_choice(question: str, choices: list, default: int = 0) -> int:
    """Prompt for a choice from a list with arrow key navigation.

    Escape keeps the current default (skips the question).
    Ctrl+C exits the wizard.
    """
-    print(color(question, Colors.YELLOW))
-
-    # Try to use interactive menu if available
-    try:
-        from simple_term_menu import TerminalMenu
-        import re
-
-        # Strip emoji characters — simple_term_menu miscalculates visual
-        # width of emojis, causing duplicated/garbled lines on redraw.
-        _emoji_re = re.compile(
-            "[\U0001f300-\U0001f9ff\U00002600-\U000027bf\U0000fe00-\U0000fe0f"
-            "\U0001fa00-\U0001fa6f\U0001fa70-\U0001faff\u200d]+",
-            flags=re.UNICODE,
-        )
-        menu_choices = [f"  {_emoji_re.sub('', choice).strip()}" for choice in choices]
-
-        print_info("  ↑/↓ Navigate  Enter Select  Esc Skip  Ctrl+C Exit")
-
-        terminal_menu = TerminalMenu(
-            menu_choices,
-            cursor_index=default,
-            menu_cursor="→ ",
-            menu_cursor_style=("fg_green", "bold"),
-            menu_highlight_style=("fg_green",),
-            cycle_cursor=True,
-            clear_screen=False,
-        )
-
-        idx = terminal_menu.show()
-        if idx is None:  # User pressed Escape — keep current value
-            print_info(f"  Skipped (keeping current)")
+    idx = _curses_prompt_choice(question, choices, default)
+    if idx >= 0:
+        if idx == default:
+            print_info("  Skipped (keeping current)")
            print()
            return default
-        print()  # Add newline after selection
+        print()
        return idx

-    except (ImportError, NotImplementedError):
-        pass
-    except Exception as e:
-        print(f"  (Interactive menu unavailable: {e})")
-
-    # Fallback to number-based selection (simple_term_menu doesn't support Windows)
+    print(color(question, Colors.YELLOW))
    for i, choice in enumerate(choices):
        marker = "●" if i == default else "○"
        if i == default:
@@ -344,84 +376,15 @@ def prompt_checklist(title: str, items: list, pre_selected: list = None) -> list
    if pre_selected is None:
        pre_selected = []

-    print(color(title, Colors.YELLOW))
-    print_info("  SPACE Toggle  ENTER Confirm  ESC Skip  Ctrl+C Exit")
-    print()
+    from hermes_cli.curses_ui import curses_checklist

-    try:
-        from simple_term_menu import TerminalMenu
-        import re
-
-        # Strip emoji characters from menu labels — simple_term_menu miscalculates
-        # visual width of emojis on macOS, causing duplicated/garbled lines.
-        _emoji_re = re.compile(
-            "[\U0001f300-\U0001f9ff\U00002600-\U000027bf\U0000fe00-\U0000fe0f"
-            "\U0001fa00-\U0001fa6f\U0001fa70-\U0001faff\u200d]+",
-            flags=re.UNICODE,
-        )
-        menu_items = [f"  {_emoji_re.sub('', item).strip()}" for item in items]
-
-        # Map pre-selected indices to the actual menu entry strings
-        preselected = [menu_items[i] for i in pre_selected if i < len(menu_items)]
-
-        terminal_menu = TerminalMenu(
-            menu_items,
-            multi_select=True,
-            show_multi_select_hint=False,
-            multi_select_cursor="[✓] ",
-            multi_select_select_on_accept=False,
-            multi_select_empty_ok=True,
-            preselected_entries=preselected if preselected else None,
-            menu_cursor="→ ",
-            menu_cursor_style=("fg_green", "bold"),
-            menu_highlight_style=("fg_green",),
-            cycle_cursor=True,
-            clear_screen=False,
-        )
-
-        terminal_menu.show()
-
-        if terminal_menu.chosen_menu_entries is None:
-            print_info("  Skipped (keeping current)")
-            return list(pre_selected)
-
-        selected = list(terminal_menu.chosen_menu_indices or [])
-        return selected
-
-    except (ImportError, NotImplementedError):
-        # Fallback: numbered toggle interface (simple_term_menu doesn't support Windows)
-        selected = set(pre_selected)
-
-        while True:
-            for i, item in enumerate(items):
-                marker = color("[✓]", Colors.GREEN) if i in selected else "[ ]"
-                print(f"  {marker} {i + 1}. {item}")
-            print()
-
-            try:
-                value = input(
-                    color("  Toggle # (or Enter to confirm): ", Colors.DIM)
-                ).strip()
-                if not value:
-                    break
-                idx = int(value) - 1
-                if 0 <= idx < len(items):
-                    if idx in selected:
-                        selected.discard(idx)
-                    else:
-                        selected.add(idx)
-                else:
-                    print_error(f"Enter a number between 1 and {len(items)}")
-            except ValueError:
-                print_error("Enter a number")
-            except (KeyboardInterrupt, EOFError):
-                print()
-                return []
-
-            # Clear and redraw (simple approach)
-            print()
-
-        return sorted(selected)
+    chosen = curses_checklist(
+        title,
+        items,
+        set(pre_selected),
+        cancel_returns=set(pre_selected),
+    )
+    return sorted(chosen)


 def _prompt_api_key(var: dict):
@@ -780,6 +743,7 @@ def setup_model_provider(config: dict):
    selected_provider = (
        None  # "nous", "openai-codex", "openrouter", "custom", or None (keep)
    )
+    selected_base_url = None  # deferred until after model selection
    nous_models = []  # populated if Nous login succeeds

    if provider_idx == 0:  # Nous Portal (OAuth)
@@ -933,11 +897,35 @@ def setup_model_provider(config: dict):

        base_url = prompt(
            "  API base URL (e.g., https://api.example.com/v1)", current_url
-        )
+        ).strip()
        api_key = prompt("  API key", password=True)
        model_name = prompt("  Model name (e.g., gpt-4, claude-3-opus)", current_model)

        if base_url:
+            from hermes_cli.models import probe_api_models
+
+            probe = probe_api_models(api_key, base_url)
+            if probe.get("used_fallback") and probe.get("resolved_base_url"):
+                print_warning(
+                    f"Endpoint verification worked at {probe['resolved_base_url']}/models, "
+                    f"not the exact URL you entered. Saving the working base URL instead."
+                )
+                base_url = probe["resolved_base_url"]
+            elif probe.get("models") is not None:
+                print_success(
+                    f"Verified endpoint via {probe.get('probed_url')} "
+                    f"({len(probe.get('models') or [])} model(s) visible)"
+                )
+            else:
+                print_warning(
+                    f"Could not verify this endpoint via {probe.get('probed_url')}. "
+                    f"Hermes will still save it."
+                )
+                if probe.get("suggested_base_url"):
+                    print_info(
+                        f"  If this server expects /v1, try base URL: {probe['suggested_base_url']}"
+                    )
+
            save_env_value("OPENAI_BASE_URL", base_url)
        if api_key:
            save_env_value("OPENAI_API_KEY", api_key)
@@ -1038,8 +1026,8 @@ def setup_model_provider(config: dict):
        if existing_custom:
            save_env_value("OPENAI_BASE_URL", "")
            save_env_value("OPENAI_API_KEY", "")
-        _update_config_for_provider("zai", zai_base_url, default_model="glm-5")
        _set_model_provider(config, "zai", zai_base_url)
+        selected_base_url = zai_base_url

    elif provider_idx == 5:  # Kimi / Moonshot
        selected_provider = "kimi-coding"
@@ -1071,8 +1059,8 @@ def setup_model_provider(config: dict):
        if existing_custom:
            save_env_value("OPENAI_BASE_URL", "")
            save_env_value("OPENAI_API_KEY", "")
-        _update_config_for_provider("kimi-coding", pconfig.inference_base_url, default_model="kimi-k2.5")
        _set_model_provider(config, "kimi-coding", pconfig.inference_base_url)
+        selected_base_url = pconfig.inference_base_url

    elif provider_idx == 6:  # MiniMax
        selected_provider = "minimax"
@@ -1104,8 +1092,8 @@ def setup_model_provider(config: dict):
        if existing_custom:
            save_env_value("OPENAI_BASE_URL", "")
            save_env_value("OPENAI_API_KEY", "")
-        _update_config_for_provider("minimax", pconfig.inference_base_url, default_model="MiniMax-M2.5")
        _set_model_provider(config, "minimax", pconfig.inference_base_url)
+        selected_base_url = pconfig.inference_base_url

    elif provider_idx == 7:  # MiniMax China
        selected_provider = "minimax-cn"
@@ -1137,8 +1125,8 @@ def setup_model_provider(config: dict):
        if existing_custom:
            save_env_value("OPENAI_BASE_URL", "")
            save_env_value("OPENAI_API_KEY", "")
-        _update_config_for_provider("minimax-cn", pconfig.inference_base_url, default_model="MiniMax-M2.5")
        _set_model_provider(config, "minimax-cn", pconfig.inference_base_url)
+        selected_base_url = pconfig.inference_base_url

    elif provider_idx == 8:  # Anthropic
        selected_provider = "anthropic"
@@ -1241,8 +1229,8 @@ def setup_model_provider(config: dict):
            save_env_value("OPENAI_API_KEY", "")
        # Don't save base_url for Anthropic — resolve_runtime_provider()
        # always hardcodes it. Stale base_urls contaminate other providers.
-        _update_config_for_provider("anthropic", "", default_model="claude-opus-4-6")
        _set_model_provider(config, "anthropic")
+        selected_base_url = ""

    # else: provider_idx == 9 (Keep current) — only shown when a provider already exists
    # Normalize "keep current" to an explicit provider so downstream logic
@@ -1472,6 +1460,12 @@ def setup_model_provider(config: dict):
            )
            print_success(f"Model set to: {_display}")

+    # Write provider+base_url to config.yaml only after model selection is complete.
+    # This prevents a race condition where the gateway picks up a new provider
+    # before the model name has been updated to match.
+    if selected_provider in ("zai", "kimi-coding", "minimax", "minimax-cn", "anthropic") and selected_base_url is not None:
+        _update_config_for_provider(selected_provider, selected_base_url)
+
    save_config(config)


@@ -60,6 +60,12 @@ All fields are optional. Missing values inherit from the ``default`` skin.
    # Tool prefix: character for tool output lines (default: ┊)
    tool_prefix: "┊"

+    # Tool emojis: override the default emoji for any tool (used in spinners & progress)
+    tool_emojis:
+      terminal: "⚔"           # Override terminal tool emoji
+      web_search: "🔮"        # Override web_search tool emoji
+      # Any tool not listed here uses its registry default
+
 USAGE
 =====

@@ -111,6 +117,7 @@ class SkinConfig:
    spinner: Dict[str, Any] = field(default_factory=dict)
    branding: Dict[str, str] = field(default_factory=dict)
    tool_prefix: str = "┊"
+    tool_emojis: Dict[str, str] = field(default_factory=dict)  # per-tool emoji overrides
    banner_logo: str = ""    # Rich-markup ASCII art logo (replaces HERMES_AGENT_LOGO)
    banner_hero: str = ""    # Rich-markup hero art (replaces HERMES_CADUCEUS)

@@ -541,6 +548,7 @@ def _build_skin_config(data: Dict[str, Any]) -> SkinConfig:
        spinner=spinner,
        branding=branding,
        tool_prefix=data.get("tool_prefix", default.get("tool_prefix", "┊")),
+        tool_emojis=data.get("tool_emojis", {}),
        banner_logo=data.get("banner_logo", ""),
        banner_hero=data.get("banner_hero", ""),
    )
@@ -275,8 +275,13 @@ def show_status(args):
    print(color("◆ Gateway Service", Colors.CYAN, Colors.BOLD))
    
    if sys.platform.startswith('linux'):
+        try:
+            from hermes_cli.gateway import get_service_name
+            _gw_svc = get_service_name()
+        except Exception:
+            _gw_svc = "hermes-gateway"
        result = subprocess.run(
-            ["systemctl", "--user", "is-active", "hermes-gateway"],
+            ["systemctl", "--user", "is-active", _gw_svc],
            capture_output=True,
            text=True
        )
@@ -354,9 +354,29 @@ def _get_platform_tools(config: dict, platform: str) -> Set[str]:


 def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[str]):
-    """Save the selected toolset keys for a platform to config."""
+    """Save the selected toolset keys for a platform to config.
+
+    Preserves any non-configurable toolset entries (like MCP server names)
+    that were already in the config for this platform.
+    """
    config.setdefault("platform_toolsets", {})
-    config["platform_toolsets"][platform] = sorted(enabled_toolset_keys)
+
+    # Get the set of all configurable toolset keys
+    configurable_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
+
+    # Get existing toolsets for this platform
+    existing_toolsets = config.get("platform_toolsets", {}).get(platform, [])
+    if not isinstance(existing_toolsets, list):
+        existing_toolsets = []
+
+    # Preserve any entries that are NOT configurable toolsets (i.e. MCP server names)
+    preserved_entries = {
+        entry for entry in existing_toolsets
+        if entry not in configurable_keys
+    }
+
+    # Merge preserved entries with new enabled toolsets
+    config["platform_toolsets"][platform] = sorted(enabled_toolset_keys | preserved_entries)
    save_config(config)


@@ -133,7 +133,13 @@ def uninstall_gateway_service():
    if platform.system() != "Linux":
        return False
    
-    service_file = Path.home() / ".config" / "systemd" / "user" / "hermes-gateway.service"
+    try:
+        from hermes_cli.gateway import get_service_name
+        svc_name = get_service_name()
+    except Exception:
+        svc_name = "hermes-gateway"
+
+    service_file = Path.home() / ".config" / "systemd" / "user" / f"{svc_name}.service"
    
    if not service_file.exists():
        return False
@@ -141,14 +147,14 @@ def uninstall_gateway_service():
    try:
        # Stop the service
        subprocess.run(
-            ["systemctl", "--user", "stop", "hermes-gateway"],
+            ["systemctl", "--user", "stop", svc_name],
            capture_output=True,
            check=False
        )
        
        # Disable the service
        subprocess.run(
-            ["systemctl", "--user", "disable", "hermes-gateway"],
+            ["systemctl", "--user", "disable", svc_name],
            capture_output=True,
            check=False
        )
@@ -249,6 +249,32 @@ class SessionDB:
        row = cursor.fetchone()
        return dict(row) if row else None

+    def resolve_session_id(self, session_id_or_prefix: str) -> Optional[str]:
+        """Resolve an exact or uniquely prefixed session ID to the full ID.
+
+        Returns the exact ID when it exists. Otherwise treats the input as a
+        prefix and returns the single matching session ID if the prefix is
+        unambiguous. Returns None for no matches or ambiguous prefixes.
+        """
+        exact = self.get_session(session_id_or_prefix)
+        if exact:
+            return exact["id"]
+
+        escaped = (
+            session_id_or_prefix
+            .replace("\\", "\\\\")
+            .replace("%", "\\%")
+            .replace("_", "\\_")
+        )
+        cursor = self._conn.execute(
+            "SELECT id FROM sessions WHERE id LIKE ? ESCAPE '\\' ORDER BY started_at DESC LIMIT 2",
+            (f"{escaped}%",),
+        )
+        matches = [row["id"] for row in cursor.fetchall()]
+        if len(matches) == 1:
+            return matches[0]
+        return None
+
    # Maximum length for session titles
    MAX_TITLE_LENGTH = 100

@@ -927,6 +927,11 @@ class HonchoSessionManager:
            return False

        assistant_peer = self._get_or_create_peer(session.assistant_peer_id)
+        honcho_session = self._sessions_cache.get(session.honcho_session_id)
+        if not honcho_session:
+            logger.warning("No Honcho session cached for '%s', skipping AI seed", session_key)
+            return False
+
        try:
            wrapped = (
                f"<ai_identity_seed>\n"
@@ -935,7 +940,7 @@ class HonchoSessionManager:
                f"{content.strip()}\n"
                f"</ai_identity_seed>"
            )
-            assistant_peer.add_message("assistant", wrapped)
+            honcho_session.add_messages([assistant_peer.message(wrapped)])
            logger.info("Seeded AI identity from '%s' into %s", source, session_key)
            return True
        except Exception as e:
@@ -267,6 +267,8 @@ def handle_function_call(
    task_id: Optional[str] = None,
    user_task: Optional[str] = None,
    enabled_tools: Optional[List[str]] = None,
+    honcho_manager: Optional[Any] = None,
+    honcho_session_key: Optional[str] = None,
 ) -> str:
    """
    Main function call dispatcher that routes calls to the tool registry.
@@ -306,12 +308,16 @@ def handle_function_call(
                function_name, function_args,
                task_id=task_id,
                enabled_tools=sandbox_enabled,
+                honcho_manager=honcho_manager,
+                honcho_session_key=honcho_session_key,
            )

        return registry.dispatch(
            function_name, function_args,
            task_id=task_id,
            user_task=user_task,
+            honcho_manager=honcho_manager,
+            honcho_session_key=honcho_session_key,
        )

    except Exception as e:
@@ -0,0 +1,422 @@
+---
+name: oss-forensics
+description: |
+  Supply chain investigation, evidence recovery, and forensic analysis for GitHub repositories.
+  Covers deleted commit recovery, force-push detection, IOC extraction, multi-source evidence
+  collection, hypothesis formation/validation, and structured forensic reporting.
+  Inspired by RAPTOR's 1800+ line OSS Forensics system.
+category: security
+triggers:
+  - "investigate this repository"
+  - "investigate [owner/repo]"
+  - "check for supply chain compromise"
+  - "recover deleted commits"
+  - "forensic analysis of [owner/repo]"
+  - "was this repo compromised"
+  - "supply chain attack"
+  - "suspicious commit"
+  - "force push detected"
+  - "IOC extraction"
+toolsets:
+  - terminal
+  - web
+  - file
+  - delegation
+---
+
+# OSS Security Forensics Skill
+
+A 7-phase multi-agent investigation framework for researching open-source supply chain attacks.
+Adapted from RAPTOR's forensics system. Covers GitHub Archive, Wayback Machine, GitHub API,
+local git analysis, IOC extraction, evidence-backed hypothesis formation and validation,
+and final forensic report generation.
+
+---
+
+## ⚠️ Anti-Hallucination Guardrails
+
+Read these before every investigation step. Violating them invalidates the report.
+
+1. **Evidence-First Rule**: Every claim in any report, hypothesis, or summary MUST cite at least one evidence ID (`EV-XXXX`). Assertions without citations are forbidden.
+2. **STAY IN YOUR LANE**: Each sub-agent (investigator) has a single data source. Do NOT mix sources. The GH Archive investigator does not query the GitHub API, and vice versa. Role boundaries are hard.
+3. **Fact vs. Hypothesis Separation**: Mark all unverified inferences with `[HYPOTHESIS]`. Only statements verified against original sources may be stated as facts.
+4. **No Evidence Fabrication**: The hypothesis validator MUST mechanically check that every cited evidence ID actually exists in the evidence store before accepting a hypothesis.
+5. **Proof-Required Disproval**: A hypothesis cannot be dismissed without a specific, evidence-backed counter-argument. "No evidence found" is not sufficient to disprove—it only makes a hypothesis inconclusive.
+6. **SHA/URL Double-Verification**: Any commit SHA, URL, or external identifier cited as evidence must be independently confirmed from at least two sources before being marked as verified.
+7. **Suspicious Code Rule**: Never run code found inside the investigated repository locally. Analyze statically only, or use `execute_code` in a sandboxed environment.
+8. **Secret Redaction**: Any API keys, tokens, or credentials discovered during investigation must be redacted in the final report. Log them internally only.
+
+---
+
+## Example Scenarios
+
+- **Scenario A: Dependency Confusion**: A malicious package `internal-lib-v2` is uploaded to NPM with a higher version than the internal one. The investigator must track when this package was first seen and if any PushEvents in the target repo updated `package.json` to this version.
+- **Scenario B: Maintainer Takeover**: A long-term contributor's account is used to push a backdoored `.github/workflows/build.yml`. The investigator looks for PushEvents from this user after a long period of inactivity or from a new IP/location (if detectable via BigQuery).
+- **Scenario C: Force-Push Hide**: A developer accidentally commits a production secret, then force-pushes to "fix" it. The investigator uses `git fsck` and GH Archive to recover the original commit SHA and verify what was leaked.
+
+---
+
+> **Path convention**: Throughout this skill, `SKILL_DIR` refers to the root of this skill's
+> installation directory (the folder containing this `SKILL.md`). When the skill is loaded,
+> resolve `SKILL_DIR` to the actual path — e.g. `~/.hermes/skills/security/oss-forensics/`
+> or the `optional-skills/` equivalent. All script and template references are relative to it.
+
+## Phase 0: Initialization
+
+1. Create investigation working directory:
+   ```bash
+   mkdir investigation_$(echo "REPO_NAME" | tr '/' '_')
+   cd investigation_$(echo "REPO_NAME" | tr '/' '_')
+   ```
+2. Initialize the evidence store:
+   ```bash
+   python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json list
+   ```
+3. Copy the forensic report template:
+   ```bash
+   cp SKILL_DIR/templates/forensic-report.md ./investigation-report.md
+   ```
+4. Create an `iocs.md` file to track Indicators of Compromise as they are discovered.
+5. Record the investigation start time, target repository, and stated investigation goal.
+
+---
+
+## Phase 1: Prompt Parsing and IOC Extraction
+
+**Goal**: Extract all structured investigative targets from the user's request.
+
+**Actions**:
+- Parse the user prompt and extract:
+  - Target repository (`owner/repo`)
+  - Target actors (GitHub handles, email addresses)
+  - Time window of interest (commit date ranges, PR timestamps)
+  - Provided Indicators of Compromise: commit SHAs, file paths, package names, IP addresses, domains, API keys/tokens, malicious URLs
+  - Any linked vendor security reports or blog posts
+
+**Tools**: Reasoning only, or `execute_code` for regex extraction from large text blocks.
+
+**Output**: Populate `iocs.md` with extracted IOCs. Each IOC must have:
+- Type (from: COMMIT_SHA, FILE_PATH, API_KEY, SECRET, IP_ADDRESS, DOMAIN, PACKAGE_NAME, ACTOR_USERNAME, MALICIOUS_URL, OTHER)
+- Value
+- Source (user-provided, inferred)
+
+**Reference**: See [evidence-types.md](./references/evidence-types.md) for IOC taxonomy.
+
+---
+
+## Phase 2: Parallel Evidence Collection
+
+Spawn up to 5 specialist investigator sub-agents using `delegate_task` (batch mode, max 3 concurrent). Each investigator has a **single data source** and must not mix sources.
+
+> **Orchestrator note**: Pass the IOC list from Phase 1 and the investigation time window in the `context` field of each delegated task.
+
+---
+
+### Investigator 1: Local Git Investigator
+
+**ROLE BOUNDARY**: You query the LOCAL GIT REPOSITORY ONLY. Do not call any external APIs.
+
+**Actions**:
+```bash
+# Clone repository
+git clone https://github.com/OWNER/REPO.git target_repo && cd target_repo
+
+# Full commit log with stats
+git log --all --full-history --stat --format="%H|%ae|%an|%ai|%s" > ../git_log.txt
+
+# Detect force-push evidence (orphaned/dangling commits)
+git fsck --lost-found --unreachable 2>&1 | grep commit > ../dangling_commits.txt
+
+# Check reflog for rewritten history
+git reflog --all > ../reflog.txt
+
+# List all local and remote-tracking branches (branches deleted upstream before the clone will not appear here)
+git branch -a -v > ../branches.txt
+
+# Find suspicious large binary additions
+git log --all --diff-filter=A --name-only --format="%H %ai" -- "*.so" "*.dll" "*.exe" "*.bin" > ../binary_additions.txt
+
+# Check for GPG signature anomalies
+git log --show-signature --format="%H %ai %aN" > ../signature_check.txt 2>&1
+```
+
+**Evidence to collect** (add via `python3 SKILL_DIR/scripts/evidence-store.py add`):
+- Each dangling commit SHA → type: `git`
+- Force-push evidence (reflog showing history rewrite) → type: `git`
+- Unsigned commits from verified contributors → type: `git`
+- Suspicious binary file additions → type: `git`
+
+**Reference**: See [recovery-techniques.md](./references/recovery-techniques.md) for accessing force-pushed commits.
+
+---
+
+### Investigator 2: GitHub API Investigator
+
+**ROLE BOUNDARY**: You query the GITHUB REST API ONLY. Do not run git commands locally.
+
+**Actions**:
+```bash
+# Commits (first page only; append &page=N to walk further pages)
+curl -s "https://api.github.com/repos/OWNER/REPO/commits?per_page=100" > api_commits.json
+
+# Pull Requests including closed/deleted
+curl -s "https://api.github.com/repos/OWNER/REPO/pulls?state=all&per_page=100" > api_prs.json
+
+# Issues
+curl -s "https://api.github.com/repos/OWNER/REPO/issues?state=all&per_page=100" > api_issues.json
+
+# Contributors and collaborator changes
+curl -s "https://api.github.com/repos/OWNER/REPO/contributors" > api_contributors.json
+
+# Repository events (first page, up to 100; append &page=N to walk the rest — the API exposes roughly the last 300)
+curl -s "https://api.github.com/repos/OWNER/REPO/events?per_page=100" > api_events.json
+
+# Check specific suspicious commit SHA details
+curl -s "https://api.github.com/repos/OWNER/REPO/git/commits/SHA" > commit_detail.json
+
+# Releases
+curl -s "https://api.github.com/repos/OWNER/REPO/releases?per_page=100" > api_releases.json
+
+# Check if a specific commit exists (force-pushed commits may 404 on commits/ but succeed on git/commits/)
+curl -s "https://api.github.com/repos/OWNER/REPO/commits/SHA" | jq .sha
+```
+
+**Cross-reference targets** (flag discrepancies as evidence):
+- PR exists in archive but missing from API → evidence of deletion
+- Contributor in archive events but not in contributors list → evidence of permission revocation
+- Commit in archive PushEvents but not in API commit list → evidence of force-push/deletion
+
+**Reference**: See [evidence-types.md](./references/evidence-types.md) for GH event types.
+
+---
+
+### Investigator 3: Wayback Machine Investigator
+
+**ROLE BOUNDARY**: You query the WAYBACK MACHINE CDX API ONLY. Do not use the GitHub API.
+
+**Goal**: Recover deleted GitHub pages (READMEs, issues, PRs, releases, wiki pages).
+
+**Actions**:
+```bash
+# Search for archived snapshots of the repo main page
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO&output=json&limit=100&from=YYYYMMDD&to=YYYYMMDD" > wayback_main.json
+
+# Search for a specific deleted issue
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO/issues/NUM&output=json&limit=50" > wayback_issue_NUM.json
+
+# Search for a specific deleted PR
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO/pull/NUM&output=json&limit=50" > wayback_pr_NUM.json
+
+# Fetch the best snapshot of a page
+# Use the Wayback Machine URL: https://web.archive.org/web/TIMESTAMP/ORIGINAL_URL
+# Example: https://web.archive.org/web/20240101000000/https://github.com/OWNER/REPO (redirects to the capture nearest that timestamp)
+
+# Advanced: Search for deleted releases/tags
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO/releases/tag/*&output=json" > wayback_tags.json
+
+# Advanced: Search for historical wiki changes
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO/wiki/*&output=json" > wayback_wiki.json
+```
+
+**Evidence to collect**:
+- Archived snapshots of deleted issues/PRs with their content
+- Historical README versions showing changes
+- Evidence of content present in archive but missing from current GitHub state
+
+**Reference**: See [github-archive-guide.md](./references/github-archive-guide.md) for CDX API parameters.
+
+---
+
+### Investigator 4: GH Archive / BigQuery Investigator
+
+**ROLE BOUNDARY**: You query GITHUB ARCHIVE via BIGQUERY ONLY. This is a tamper-proof record of all public GitHub events.
+
+> **Prerequisites**: Requires Google Cloud credentials with BigQuery access (`gcloud auth application-default login`). If unavailable, skip this investigator and note it in the report.
+
+**Cost Optimization Rules** (MANDATORY):
+1. ALWAYS run a `--dry_run` before every query to estimate cost.
+2. Use `_TABLE_SUFFIX` to filter by date range and minimize scanned data.
+3. Only SELECT the columns you need.
+4. Add a LIMIT unless aggregating.
+
+```bash
+# Template: safe BigQuery query for PushEvents to OWNER/REPO
+bq query --use_legacy_sql=false --dry_run "
+SELECT created_at, actor.login, payload.commits, payload.before, payload.head,
+       payload.size, payload.distinct_size
+FROM \`githubarchive.month.*\`
+WHERE _TABLE_SUFFIX BETWEEN 'YYYYMM' AND 'YYYYMM'
+  AND type = 'PushEvent'
+  AND repo.name = 'OWNER/REPO'
+LIMIT 1000
+"
+# If cost is acceptable, re-run without --dry_run
+
+# Detect force-pushes: zero-distinct_size PushEvents mean commits were force-erased
+# payload.distinct_size = 0 AND payload.size > 0 → force push indicator
+
+# Check for deleted branch events
+bq query --use_legacy_sql=false "
+SELECT created_at, actor.login, payload.ref, payload.ref_type
+FROM \`githubarchive.month.*\`
+WHERE _TABLE_SUFFIX BETWEEN 'YYYYMM' AND 'YYYYMM'
+  AND type = 'DeleteEvent'
+  AND repo.name = 'OWNER/REPO'
+LIMIT 200
+"
+```
+
+**Evidence to collect**:
+- Force-push events (payload.size > 0, payload.distinct_size = 0)
+- DeleteEvents for branches/tags
+- WorkflowRunEvents for suspicious CI/CD automation
+- PushEvents that precede a "gap" in the git log (evidence of rewrite)
+
+**Reference**: See [github-archive-guide.md](./references/github-archive-guide.md) for all 12 event types and query patterns.
+
+---
+
+### Investigator 5: IOC Enrichment Investigator
+
+**ROLE BOUNDARY**: You enrich EXISTING IOCs from Phase 1 using passive public sources ONLY. Do not execute any code from the target repository.
+
+**Actions**:
+- For each commit SHA: attempt recovery via direct GitHub URL (`github.com/OWNER/REPO/commit/SHA.patch`)
+- For each domain/IP: check passive DNS, WHOIS records (via `web_extract` on public WHOIS services)
+- For each package name: check npm/PyPI for matching malicious package reports
+- For each actor username: check GitHub profile, contribution history, account age
+- Recover force-pushed commits using 3 methods (see [recovery-techniques.md](./references/recovery-techniques.md))
+
+---
+
+## Phase 3: Evidence Consolidation
+
+After all investigators complete:
+
+1. Run `python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json list` to see all collected evidence.
+2. For each piece of evidence, verify the `content_sha256` hash matches the original source.
+3. Group evidence by:
+   - **Timeline**: Sort all timestamped evidence chronologically
+   - **Actor**: Group by GitHub handle or email
+   - **IOC**: Link evidence to the IOC it relates to
+4. Identify **discrepancies**: items present in one source but absent in another (key deletion indicators).
+5. Flag evidence as `[VERIFIED]` (confirmed from 2+ independent sources) or `[UNVERIFIED]` (single source only).
+
+---
+
+## Phase 4: Hypothesis Formation
+
+A hypothesis must:
+- State a specific claim (e.g., "Actor X force-pushed to BRANCH on DATE to erase commit SHA")
+- Cite at least 2 evidence IDs that support it (`EV-XXXX`, `EV-YYYY`)
+- Identify what evidence would disprove it
+- Be labeled `[HYPOTHESIS]` until validated
+
+**Common hypothesis templates** (see [investigation-templates.md](./references/investigation-templates.md)):
+- Maintainer Compromise: legitimate account used post-takeover to inject malicious code
+- Dependency Confusion: package name squatting to intercept installs
+- CI/CD Injection: malicious workflow changes to run code during builds
+- Typosquatting: near-identical package name targeting misspellers
+- Credential Leak: token/key accidentally committed then force-pushed to erase
+
+For each hypothesis, spawn a `delegate_task` sub-agent to attempt to find disconfirming evidence before confirming.
+
+---
+
+## Phase 5: Hypothesis Validation
+
+The validator sub-agent MUST mechanically check:
+
+1. For each hypothesis, extract all cited evidence IDs.
+2. Verify each ID exists in `evidence.json` (hard failure if any ID is missing → hypothesis rejected as potentially fabricated).
+3. Verify each `[VERIFIED]` piece of evidence was confirmed from 2+ sources.
+4. Check logical consistency: does the timeline depicted by the evidence support the hypothesis?
+5. Check for alternative explanations: could the same evidence pattern arise from a benign cause?
+
+**Output**:
+- `VALIDATED`: All evidence cited, verified, logically consistent, no plausible alternative explanation.
+- `INCONCLUSIVE`: Evidence supports hypothesis but alternative explanations exist or evidence is insufficient.
+- `REJECTED`: Missing evidence IDs, unverified evidence cited as fact, logical inconsistency detected.
+
+Rejected hypotheses feed back into Phase 4 for refinement (max 3 iterations).
+
+---
+
+## Phase 6: Final Report Generation
+
+Populate `investigation-report.md` using the template in [forensic-report.md](./templates/forensic-report.md).
+
+**Mandatory sections**:
+- Executive Summary: one-paragraph verdict (Compromised / Clean / Inconclusive) with confidence level
+- Timeline: chronological reconstruction of all significant events with evidence citations
+- Validated Hypotheses: each with status and supporting evidence IDs
+- Evidence Registry: table of all `EV-XXXX` entries with source, type, and verification status
+- IOC List: all extracted and enriched Indicators of Compromise
+- Chain of Custody: how evidence was collected, from what sources, at what timestamps
+- Recommendations: immediate mitigations if compromise detected; monitoring recommendations
+
+**Report rules**:
+- Every factual claim must have at least one `[EV-XXXX]` citation
+- Executive Summary must state confidence level (High / Medium / Low)
+- All secrets/credentials must be redacted to `[REDACTED]`
+
+---
+
+## Phase 7: Completion
+
+1. Run final evidence count: `python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json list`
+2. Archive the full investigation directory.
+3. If compromise is confirmed:
+   - List immediate mitigations (rotate credentials, pin dependency hashes, notify affected users)
+   - Identify affected versions/packages
+   - Note disclosure obligations (if a public package: coordinate with the package registry)
+4. Present the final `investigation-report.md` to the user.
+
+---
+
+## Ethical Use Guidelines
+
+This skill is designed for **defensive security investigation** — protecting open-source software from supply chain attacks. It must not be used for:
+
+- **Harassment or stalking** of contributors or maintainers
+- **Doxing** — correlating GitHub activity to real identities for malicious purposes
+- **Competitive intelligence** — investigating proprietary or internal repositories without authorization
+- **False accusations** — publishing investigation results without validated evidence (see anti-hallucination guardrails)
+
+Investigations should be conducted with the principle of **minimal intrusion**: collect only the evidence necessary to validate or refute the hypothesis. When publishing results, follow responsible disclosure practices and coordinate with affected maintainers before public disclosure.
+
+If the investigation reveals a genuine compromise, follow the coordinated vulnerability disclosure process:
+1. Notify the repository maintainers privately first
+2. Allow reasonable time for remediation (typically 90 days)
+3. Coordinate with package registries (npm, PyPI, etc.) if published packages are affected
+4. File a CVE if appropriate
+
+---
+
+## API Rate Limiting
+
+GitHub REST API enforces rate limits that will interrupt large investigations if not managed.
+
+- **Authenticated requests**: 5,000/hour (requires `GITHUB_TOKEN` env var or `gh` CLI auth)
+- **Unauthenticated requests**: 60/hour (unusable for investigations)
+
+**Best practices**:
+- Always authenticate: `export GITHUB_TOKEN=ghp_...` or use `gh` CLI (auto-authenticates)
+- Use conditional requests (`If-None-Match` / `If-Modified-Since` headers) to avoid consuming quota on unchanged data
+- For paginated endpoints, fetch all pages in sequence — don't parallelize against the same endpoint
+- Check the `X-RateLimit-Remaining` header; if it drops below 100, pause until the time given by `X-RateLimit-Reset` (UTC epoch seconds)
+- BigQuery has its own quotas and costs (free tier: 1 TiB of query processing per month) — always dry-run first
+- Wayback Machine CDX API: no formal rate limit, but be courteous (1-2 req/sec max)
+
+If rate-limited mid-investigation, record the partial results in the evidence store and note the limitation in the report.
+
+---
+
+## Reference Materials
+
+- [github-archive-guide.md](./references/github-archive-guide.md) — BigQuery queries, CDX API, 12 event types
+- [evidence-types.md](./references/evidence-types.md) — IOC taxonomy, evidence source types, observation types
+- [recovery-techniques.md](./references/recovery-techniques.md) — Recovering deleted commits, PRs, issues
+- [investigation-templates.md](./references/investigation-templates.md) — Pre-built hypothesis templates per attack type
+- [evidence-store.py](./scripts/evidence-store.py) — CLI tool for managing the evidence JSON store
+- [forensic-report.md](./templates/forensic-report.md) — Structured report template
@@ -0,0 +1,89 @@
+# Evidence Types Reference
+
+Taxonomy of all evidence types, IOC types, GitHub event types, and observation types
+used in OSS forensic investigations.
+
+---
+
+## Evidence Source Types
+
+| Type | Description | Example Sources |
+|------|-------------|-----------------|
+| `git` | Data from local git repository analysis | `git log`, `git fsck`, `git reflog`, `git blame` |
+| `gh_api` | Data from GitHub REST API responses | `/repos/.../commits`, `/repos/.../pulls`, `/repos/.../events` |
+| `gh_archive` | Data from GitHub Archive (BigQuery) | `githubarchive.month.*` BigQuery tables |
+| `web_archive` | Archived web pages from Wayback Machine | CDX API results, `web.archive.org/web/...` snapshots |
+| `ioc` | Indicator of Compromise from any source | Extracted from vendor reports, git history, network traces |
+| `analysis` | Derived insight from cross-source correlation | "SHA present in archive but absent from API" |
+| `vendor_report` | External security vendor or researcher report | CVE advisories, blog posts, NVD records |
+| `manual` | Manually recorded observation by investigator | Notes on behavioral patterns, timeline gaps |
+
+---
+
+## IOC Types
+
+| Type | Description | Example |
+|------|-------------|---------|
+| `COMMIT_SHA` | A git commit hash linked to malicious activity | `abc123def456...` |
+| `FILE_PATH` | A suspicious file inside the repository | `src/utils/crypto.js`, `dist/index.min.js` |
+| `API_KEY` | An API key accidentally committed | `AKIA...` (AWS), `ghp_...` (GitHub PAT) |
+| `SECRET` | A generic secret / credential | Database password, private key blob |
+| `IP_ADDRESS` | A C2 server or attacker IP | `192.0.2.1` |
+| `DOMAIN` | A malicious or suspicious domain | `evil-cdn.io`, typosquatted package registry domain |
+| `PACKAGE_NAME` | A malicious or squatted package name | `colo-rs` (typosquatting `colors`), `lodash-utils` |
+| `ACTOR_USERNAME` | A GitHub handle linked to the attack | `malicious-bot-account` |
+| `MALICIOUS_URL` | A URL to a malicious resource | `https://evil.example.com/payload.sh` |
+| `WORKFLOW_FILE` | A suspicious CI/CD workflow file | `.github/workflows/release.yml` |
+| `BRANCH_NAME` | A suspicious branch | `refs/heads/temp-fix-do-not-merge` |
+| `TAG_NAME` | A suspicious git tag | `v1.0.0-security-patch` |
+| `RELEASE_NAME` | A suspicious release | Release with no associated tag or changelog |
+| `OTHER` | Catch-all for unclassified IOCs | — |
+
+---
+
+## GitHub Archive Event Types (12 Types)
+
+| Event Type | Forensic Relevance |
+|------------|-------------------|
+| `PushEvent` | Core: `payload.distinct_size=0` with `payload.size>0` → force push. `payload.before`/`payload.head` shows rewritten history. |
+| `PullRequestEvent` | Detects deleted PRs, rapid open→close patterns, PRs from new accounts |
+| `IssuesEvent` | Detects deleted issues, coordinated labeling, rapid closure of vulnerability reports |
+| `IssueCommentEvent` | Deleted comments, rapid activity bursts |
+| `WatchEvent` | Star-farming campaigns (coordinated starring from new accounts) |
+| `ForkEvent` | Unusual fork patterns before malicious commit |
+| `CreateEvent` | Branch/tag creation: signals new release or code injection point |
+| `DeleteEvent` | Branch/tag deletion: critical — often used to hide traces |
+| `ReleaseEvent` | Unauthorized releases, release artifacts modified post-publish |
+| `MemberEvent` | Collaborator added/removed: maintainer compromise indicator |
+| `PublicEvent` | Repository made public (sometimes to drop malicious code briefly) |
+| `WorkflowRunEvent` | CI/CD pipeline executions: workflow injection, secret exfiltration |
+
+---
+
+## Evidence Verification States
+
+| State | Meaning |
+|-------|---------|
+| `unverified` | Collected from a single source, not cross-referenced |
+| `single_source` | The primary source has been confirmed directly (e.g., SHA resolves on GitHub), but no second source |
+| `multi_source_verified` | Confirmed from 2+ independent sources (e.g., GH Archive AND GitHub API both show the same event) |
+
+Only `multi_source_verified` evidence may be cited as fact in validated hypotheses.
+`unverified` and `single_source` evidence must be labeled `[UNVERIFIED]` and `[SINGLE-SOURCE]`, respectively.
+
+---
+
+## Observation Types (Patterned after RAPTOR)
+
+| Type | Description |
+|------|-------------|
+| `CommitObservation` | Specific commit SHA with metadata (author, date, files changed) |
+| `ForceWashObservation` | Evidence that commits were force-erased from a branch |
+| `DanglingCommitObservation` | SHA present in git object store but unreachable from any ref |
+| `IssueObservation` | A GitHub issue (current or archived) with title, body, timestamp |
+| `PRObservation` | A GitHub PR (current or archived) with diff summary, reviewers |
+| `IOC` | A single Indicator of Compromise with context |
+| `TimelineGap` | A period with unusual absence of expected activity |
+| `ActorAnomalyObservation` | Behavioral anomaly for a specific GitHub actor |
+| `WorkflowAnomalyObservation` | Suspicious CI/CD workflow change or unexpected run |
+| `CrossSourceDiscrepancy` | Item present in one source but absent in another (strong deletion indicator) |
@@ -0,0 +1,184 @@
+# GitHub Archive Query Guide (BigQuery)
+
+GitHub Archive records every public event on GitHub as immutable JSON records. This data is accessible via Google BigQuery and is the most reliable source for forensic investigation — events cannot be deleted or modified after recording.
+
+## Public Dataset
+
+- **Project**: `githubarchive`
+- **Tables**: `day.YYYYMMDD`, `month.YYYYMM`, `year.YYYY`
+- **Cost**: $6.25 per TiB scanned. Always run dry runs first.
+- **Access**: Requires a Google Cloud account with BigQuery enabled. Free tier includes 1 TiB/month of queries.
+
+---
+
+## The 12 GitHub Event Types
+
+| Event Type | What It Records | Forensic Value |
+|------------|-----------------|----------------|
+| `PushEvent` | Commits pushed to a branch | Force-push detection, commit timeline, author attribution |
+| `PullRequestEvent` | PR opened, closed, merged, reopened | Deleted PR recovery, review timeline |
+| `IssuesEvent` | Issue opened, closed, reopened, labeled | Deleted issue recovery, social engineering traces |
+| `IssueCommentEvent` | Comments on issues and PRs | Deleted comment recovery, communication patterns |
+| `CreateEvent` | Branch, tag, or repository creation | Suspicious branch creation, tag timing |
+| `DeleteEvent` | Branch or tag deletion | Evidence of cleanup after compromise |
+| `MemberEvent` | Collaborator added or removed | Permission changes, access escalation |
+| `PublicEvent` | Repository made public | Accidental exposure of private repos |
+| `WatchEvent` | User stars a repository | Actor reconnaissance patterns |
+| `ForkEvent` | Repository forked | Exfiltration of code before cleanup |
+| `ReleaseEvent` | Release published, edited, deleted | Malicious release injection, deleted release recovery |
+| `WorkflowRunEvent` | GitHub Actions workflow triggered | CI/CD abuse, unauthorized workflow runs |
+
+---
+
+## Query Templates
+
+### Basic: All Events for a Repository
+
+```sql
+SELECT
+  created_at,
+  type,
+  actor.login,
+  repo.name,
+  payload
+FROM
+  `githubarchive.day.20240101`  -- Adjust date
+WHERE
+  repo.name = 'owner/repo'
+  AND type IN ('PushEvent', 'DeleteEvent', 'MemberEvent')
+ORDER BY
+  created_at ASC
+```
+
+### Force-Push Detection
+
+Force-pushes produce PushEvents where commits are overwritten. Key indicators:
+- `payload.distinct_size = 0` with `payload.size > 0` → commits were erased
+- `payload.before` contains the SHA before the rewrite (recoverable)
+
+```sql
+SELECT
+  created_at,
+  actor.login,
+  JSON_EXTRACT_SCALAR(payload, '$.before') AS before_sha,
+  JSON_EXTRACT_SCALAR(payload, '$.head') AS after_sha,
+  JSON_EXTRACT_SCALAR(payload, '$.size') AS total_commits,
+  JSON_EXTRACT_SCALAR(payload, '$.distinct_size') AS distinct_commits,
+  JSON_EXTRACT_SCALAR(payload, '$.ref') AS branch_ref
+FROM
+  `githubarchive.month.*`
+WHERE
+  _TABLE_SUFFIX BETWEEN '202401' AND '202403'
+  AND type = 'PushEvent'
+  AND repo.name = 'owner/repo'
+  AND CAST(JSON_EXTRACT_SCALAR(payload, '$.distinct_size') AS INT64) = 0
+ORDER BY
+  created_at ASC
+```
+
+### Deleted Branch/Tag Detection
+
+```sql
+SELECT
+  created_at,
+  actor.login,
+  JSON_EXTRACT_SCALAR(payload, '$.ref') AS deleted_ref,
+  JSON_EXTRACT_SCALAR(payload, '$.ref_type') AS ref_type
+FROM
+  `githubarchive.month.*`
+WHERE
+  _TABLE_SUFFIX BETWEEN '202401' AND '202403'
+  AND type = 'DeleteEvent'
+  AND repo.name = 'owner/repo'
+ORDER BY
+  created_at ASC
+```
+
+### Collaborator Permission Changes
+
+```sql
+SELECT
+  created_at,
+  actor.login,
+  JSON_EXTRACT_SCALAR(payload, '$.action') AS action,
+  JSON_EXTRACT_SCALAR(payload, '$.member.login') AS member
+FROM
+  `githubarchive.month.*`
+WHERE
+  _TABLE_SUFFIX BETWEEN '202401' AND '202403'
+  AND type = 'MemberEvent'
+  AND repo.name = 'owner/repo'
+ORDER BY
+  created_at ASC
+```
+
+### CI/CD Workflow Activity
+
+```sql
+SELECT
+  created_at,
+  actor.login,
+  JSON_EXTRACT_SCALAR(payload, '$.action') AS action,
+  JSON_EXTRACT_SCALAR(payload, '$.workflow_run.name') AS workflow_name,
+  JSON_EXTRACT_SCALAR(payload, '$.workflow_run.conclusion') AS conclusion,
+  JSON_EXTRACT_SCALAR(payload, '$.workflow_run.head_sha') AS head_sha
+FROM
+  `githubarchive.month.*`
+WHERE
+  _TABLE_SUFFIX BETWEEN '202401' AND '202403'
+  AND type = 'WorkflowRunEvent'
+  AND repo.name = 'owner/repo'
+ORDER BY
+  created_at ASC
+```
+
+### Actor Activity Profiling
+
+```sql
+SELECT
+  type,
+  COUNT(*) AS event_count,
+  MIN(created_at) AS first_event,
+  MAX(created_at) AS last_event
+FROM
+  `githubarchive.month.*`
+WHERE
+  _TABLE_SUFFIX BETWEEN '202301' AND '202412'
+  AND actor.login = 'suspicious-username'
+GROUP BY type
+ORDER BY event_count DESC
+```
+
+---
+
+## Cost Optimization (MANDATORY)
+
+1. **Always dry run first**: Add `--dry_run` flag to `bq query` to see estimated bytes scanned before executing.
+2. **Use `_TABLE_SUFFIX`**: Narrow the date range as much as possible. `day.*` tables are cheapest for narrow windows; `month.*` for broader sweeps.
+3. **Select only needed columns**: Avoid `SELECT *`. The `payload` column is large — only select specific JSON paths.
+4. **Add LIMIT**: Use `LIMIT 1000` during exploration. Remove only for final exhaustive queries.
+5. **Column filtering in WHERE**: Filter on indexed columns (`type`, `repo.name`, `actor.login`) before payload extraction.
+
+**Cost estimation**: A single month of GH Archive data is ~1-2 TiB uncompressed. Querying a specific repo + event type with `_TABLE_SUFFIX` typically scans 1-10 GiB ($0.006-$0.06).
+
+---
+
+## Accessing via Hermes
+
+**Option A: BigQuery CLI** (if `gcloud` is installed)
+```bash
+bq query --use_legacy_sql=false --format=json "YOUR QUERY"
+```
+
+**Option B: Python** (via `execute_code`)
+```python
+from google.cloud import bigquery
+client = bigquery.Client()
+query = "YOUR QUERY"
+results = client.query(query).result()
+for row in results:
+    print(dict(row))
+```
+
+**Option C: No GCP credentials available**
+If BigQuery is unavailable, document this limitation in the report. Use the other 4 investigators (Git, GitHub API, Wayback Machine, IOC Enrichment) — they cover most investigation needs without BigQuery.
@@ -0,0 +1,131 @@
+# Investigation Templates
+
+Pre-built hypothesis and investigation templates for common supply chain attack scenarios.
+Each template includes: attack pattern, key evidence to collect, and hypothesis starters.
+
+---
+
+## Template 1: Maintainer Account Compromise
+
+**Pattern**: Attacker gains access to a legitimate maintainer account (phishing, credential stuffing)
+and uses it to push malicious code, create backdoored releases, or exfiltrate CI secrets.
+
+**Real-world examples**: XZ Utils (2024), Codecov (2021), event-stream (2018)
+
+**Key Evidence to Collect**:
+- [ ] Push events from maintainer account outside normal working hours/timezone
+- [ ] Commits adding new dependencies, obfuscated code, or modified build scripts
+- [ ] Release creation immediately after suspicious push (to maximize package distribution)
+- [ ] MemberEvent adding unknown collaborators (attacker adding backup access)
+- [ ] WorkflowRunEvent with unexpected secret access or exfiltration-like behavior
+- [ ] Account login location changes (check social media, conference talks for corroboration)
+
+**Hypothesis Starters**:
+```
+[HYPOTHESIS] Actor <HANDLE>'s account was compromised on or around <DATE>, 
+based on anomalous commit timing [EV-XXXX] and geographic access patterns [EV-YYYY].
+```
+```
+[HYPOTHESIS] Release <VERSION> was published by the compromised account to push 
+malicious code to downstream users, evidenced by the malicious commit [EV-XXXX] 
+being added <N> hours before the release [EV-YYYY].
+```
+
+---
+
+## Template 2: Malicious Dependency Injection
+
+**Pattern**: A trusted package is modified to include malicious code in a dependency,
+or a new malicious dependency is injected into an existing package.
+
+**Key Evidence to Collect**:
+- [ ] Diff of `package.json`/`requirements.txt`/`go.mod` before and after suspicious commit
+- [ ] The new dependency's publication timestamp vs. the injection commit timestamp
+- [ ] Whether the new dependency exists on npm/PyPI and who owns it
+- [ ] Any obfuscation patterns in the injected dependency code
+- [ ] Install-time scripts (`postinstall`, `setup.py`, etc.) that execute code on install
+
+**Hypothesis Starters**:
+```
+[HYPOTHESIS] Commit <SHA> [EV-XXXX] introduced dependency <PACKAGE@VERSION> 
+which appears to be a malicious package published by actor <HANDLE> [EV-YYYY], 
+designed to execute <BEHAVIOR> during installation.
+```
+
+---
+
+## Template 3: CI/CD Pipeline Injection
+
+**Pattern**: Attacker modifies GitHub Actions workflows to steal secrets, exfiltrate code,
+or inject malicious artifacts into the build output.
+
+**Key Evidence to Collect**:
+- [ ] Diff of all `.github/workflows/*.yml` files before/after suspicious period
+- [ ] WorkflowRunEvents triggered by the modified workflows
+- [ ] Any `curl`, `wget`, or network calls added to workflow steps
+- [ ] New or modified `env:` sections referencing `secrets.*`
+- [ ] Artifacts produced by modified workflow runs
+
+**Hypothesis Starters**:
+```
+[HYPOTHESIS] Workflow file <FILE> was modified in commit <SHA> [EV-XXXX] to 
+exfiltrate repository secrets via <METHOD>, as evidenced by the added network 
+call pattern [EV-YYYY].
+```
+
+---
+
+## Template 4: Typosquatting / Dependency Confusion
+
+**Pattern**: Attacker registers a package whose name resembles a popular package (typosquatting)
+or matches an internal package name (dependency confusion) to intercept mistyped or misresolved installs.
+
+**Key Evidence to Collect**:
+- [ ] Registration timestamp of the suspicious package on the registry
+- [ ] Package content: does it contain malicious code or is it a stub?
+- [ ] Download statistics for the suspicious package
+- [ ] Names of internal packages that could be targeted (if private repo scope)
+- [ ] Any references to the legitimate package in the malicious one's metadata
+
+**Hypothesis Starters**:
+```
+[HYPOTHESIS] Package <MALICIOUS_NAME> was registered on <DATE> [EV-XXXX] to 
+typosquat on <LEGITIMATE_NAME>, targeting users who misspell the package name. 
+The package contains <BEHAVIOR> [EV-YYYY].
+```
+
+---
+
+## Template 5: Force-Push History Rewrite (Evidence Erasure)
+
+**Pattern**: After a malicious commit is detected (or before wider notice), the attacker
+force-pushes to remove the malicious commit from branch history.
+
+**Detection is key** — this template focuses on proving the erasure happened.
+
+**Key Evidence to Collect**:
+- [ ] GH Archive PushEvent with `distinct_size=0` (force push indicator) [EV-XXXX]
+- [ ] The SHA of the commit BEFORE the force push (from GH Archive `payload.before`)
+- [ ] Recovery of the erased commit via direct URL or `git fetch origin SHA`
+- [ ] Wayback Machine snapshot of the commit page before erasure
+- [ ] Timeline gap in git log (N commits visible in archive but M < N in current repo)
+
+**Hypothesis Starters**:
+```
+[HYPOTHESIS] Actor <HANDLE> force-pushed branch <BRANCH> on <DATE> [EV-XXXX] 
+to erase commit <SHA> [EV-YYYY], which contained <MALICIOUS_CONTENT>. 
+The erased commit was recovered via <METHOD> [EV-ZZZZ].
+```
+
+---
+
+## Cross-Cutting Investigation Checklist
+
+Apply to every investigation regardless of template:
+
+- [ ] Check all contributors for newly created accounts (< 30 days old at time of malicious activity)
+- [ ] Check if any maintainer account changed email in the period (sign of account takeover)
+- [ ] Verify GPG signatures on suspicious commits match known maintainer keys
+- [ ] Check if the repository changed ownership or transferred orgs near the incident
+- [ ] Look for "cleanup" commits immediately after the malicious commit (cover-up pattern)
+- [ ] Check related packages/repos by the same author for similar patterns
@@ -0,0 +1,164 @@
+# Deleted Content Recovery Techniques
+
+## Key Insight: Force-Pushed Commits Remain on GitHub Until Garbage Collection
+
+Force-pushed commits are removed from the branch history but REMAIN on GitHub's servers until garbage collection runs (which can take weeks to months). This is the foundation of deleted commit recovery.
+
+---
+
+## Method 1: Direct GitHub URL (Fastest — No Auth Required)
+
+If you have a commit SHA, access it directly even if it was force-pushed off a branch:
+
+```bash
+# View commit metadata
+curl -s "https://github.com/OWNER/REPO/commit/SHA"
+
+# Download as patch (includes full diff)
+curl -s "https://github.com/OWNER/REPO/commit/SHA.patch" > recovered_commit.patch
+
+# Download as diff
+curl -s "https://github.com/OWNER/REPO/commit/SHA.diff" > recovered_commit.diff
+
+# Example (Istio credential leak - real incident):
+curl -s "https://github.com/istio/istio/commit/FORCE_PUSHED_SHA.patch"
+```
+
+- **When this works**: SHA is known (from GH Archive, Wayback Machine, or `git fsck`)
+- **When this fails**: GitHub has already garbage-collected the object (rare, typically 30–90 days post-force-push)
+
+---
+
+## Method 2: GitHub REST API
+
+```bash
+# Works for commits force-pushed off branches but still on server
+# Note: /commits/SHA may 404, but /git/commits/SHA often succeeds for orphaned commits
+curl -s "https://api.github.com/repos/OWNER/REPO/git/commits/SHA" | jq .
+
+# Get the tree (file listing) of a force-pushed commit
+curl -s "https://api.github.com/repos/OWNER/REPO/git/trees/SHA?recursive=1" | jq .
+
+# Get a specific file from a force-pushed commit
+curl -s "https://api.github.com/repos/OWNER/REPO/contents/PATH?ref=SHA" | jq -r .content | base64 -d
+```
+
+---
+
+## Method 3: Git Fetch by SHA (Local — Requires Clone)
+
+```bash
+# Fetch an orphaned commit directly by SHA into local repo
+cd target_repo
+git fetch origin SHA
+git log FETCH_HEAD -1   # view the commit
+git diff FETCH_HEAD~1 FETCH_HEAD  # view the diff
+
+# If the SHA was recently force-pushed it will still be fetchable
+# This stops working once GitHub GC runs
+```
+
+---
+
+## Method 4: Dangling Commits via git fsck
+
+```bash
+cd target_repo
+
+# Find all unreachable objects (includes force-pushed commits)
+git fsck --unreachable --no-reflogs 2>&1 | grep "unreachable commit" | awk '{print $3}' > dangling_shas.txt
+
+# For each dangling commit, get its metadata
+while read sha; do
+  echo "=== $sha ===" >> dangling_details.txt
+  git show --stat "$sha" >> dangling_details.txt 2>&1
+done < dangling_shas.txt
+
+# Note: dangling objects only exist in LOCAL clone — not the same as GitHub's copies
+# GitHub's copies are accessible via Methods 1-3 until GC runs
+```
+
+---
+
+## Recovering Deleted GitHub Issues and PRs
+
+### Via Wayback Machine CDX API
+
+```bash
+# Find all archived snapshots of a specific issue
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO/issues/NUMBER&output=json&limit=50&fl=timestamp,statuscode,original" | python3 -m json.tool
+
+# Fetch the best snapshot
+# Use the timestamp from the CDX result:
+# https://web.archive.org/web/TIMESTAMP/https://github.com/OWNER/REPO/issues/NUMBER
+curl -s "https://web.archive.org/web/TIMESTAMP/https://github.com/OWNER/REPO/issues/NUMBER" > issue_NUMBER_archived.html
+
+# Find all snapshots of the repo in a date range
+curl -s "https://web.archive.org/cdx/search/cdx?url=github.com/OWNER/REPO*&output=json&from=20240101&to=20240201&limit=200&fl=timestamp,urlkey,statuscode" | python3 -m json.tool
+```
+
+### Via GitHub API (Limited — Only Non-Deleted Content)
+
+```bash
+# Closed issues (not deleted) are retrievable
+curl -s "https://api.github.com/repos/OWNER/REPO/issues?state=closed&per_page=100" | jq '.[].number'
+
+# Note: DELETED issues/PRs do NOT appear in the API. Use Wayback Machine or GH Archive for those.
+```
+
+### Via GitHub Archive (For Event History — Not Content)
+
+```sql
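+-- Find all IssuesEvents for a repo in a date range (payload is a JSON string, so extract fields explicitly)
+SELECT created_at, actor.login, JSON_EXTRACT_SCALAR(payload, '$.action') AS action, JSON_EXTRACT_SCALAR(payload, '$.issue.number') AS issue_number, JSON_EXTRACT_SCALAR(payload, '$.issue.title') AS issue_title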
+FROM `githubarchive.day.*`
+WHERE _TABLE_SUFFIX BETWEEN '20240101' AND '20240201'
+  AND type = 'IssuesEvent'
+  AND repo.name = 'OWNER/REPO'
+ORDER BY created_at
+```
+
+---
+
+## Recovering Deleted Files from a Known Commit
+
+```bash
+# If you have the commit SHA (even force-pushed):
+git show SHA:path/to/file.py > recovered_file.py
+
+# Or via API (base64 encoded content):
+curl -s "https://api.github.com/repos/OWNER/REPO/contents/path/to/file.py?ref=SHA" | python3 -c "
+import sys, json, base64
+d = json.load(sys.stdin)
+print(base64.b64decode(d['content']).decode())
+"
+```
+
+---
+
+## Evidence Recording
+
+After recovering any deleted content, immediately record it:
+
+```bash
+python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json add \
+  --source "git fetch origin FORCE_PUSHED_SHA" \
+  --content "Recovered commit: FORCE_PUSHED_SHA | Author: attacker@example.com | Date: 2024-01-15 | Added file: malicious.sh" \
+  --type git \
+  --actor "attacker-handle" \
+  --url "https://github.com/OWNER/REPO/commit/FORCE_PUSHED_SHA.patch" \
+  --timestamp "2024-01-15T00:00:00Z" \
+  --verification single_source \
+  --notes "Commit force-pushed off main branch on 2024-01-16. Recovered via direct fetch."
+```
+
+---
+
+## Recovery Failure Modes
+
+| Failure | Cause | Workaround |
+|---------|-------|------------|
+| `git fetch origin SHA` returns "not our ref" | GitHub GC already ran | Try Method 1/2, search Wayback Machine |
+| `github.com/OWNER/REPO/commit/SHA` returns 404 | GC ran or SHA is wrong | Verify SHA via GH Archive; try partial SHA search |
+| Wayback Machine has no snapshots | Page was never crawled by IA | Check `commoncrawl.org`, check Google Cache |
+| BigQuery shows event but no content | GH Archive stores event metadata, not file contents | Recovery only reveals the event occurred, not the content |
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""
+OSS Forensics Evidence Store Manager
+Manages a JSON-based evidence store for forensic investigations.
+
+Commands:
+  add      - Add a piece of evidence
+  list     - List all evidence (optionally filter by type or actor)
+  verify   - Re-check SHA-256 hashes for integrity
+  query    - Search evidence by keyword
+  export   - Export evidence as a Markdown table
+  summary  - Print investigation statistics
+
+Usage example:
+  python3 evidence-store.py --store evidence.json add \
+    --source "git fsck output" --content "dangling commit abc123" \
+    --type git --actor "malicious-user" --url "https://github.com/owner/repo/commit/abc123"
+
+  python3 evidence-store.py --store evidence.json list --type git
+  python3 evidence-store.py --store evidence.json verify
+  python3 evidence-store.py --store evidence.json export > evidence-table.md
+"""
+
+import json
+import argparse
+import os
+import datetime
+import hashlib
+import sys
+
+EVIDENCE_TYPES = [
+    "git",           # Local git repository data (commits, reflog, fsck)
+    "gh_api",        # GitHub REST API responses
+    "gh_archive",    # GitHub Archive / BigQuery query results
+    "web_archive",   # Wayback Machine snapshots
+    "ioc",           # Indicator of Compromise (SHA, domain, IP, package name, etc.)
+    "analysis",      # Derived analysis / cross-source correlation result
+    "manual",        # Manually noted observation
+    "vendor_report", # External security vendor report excerpt
+]
+
+VERIFICATION_STATES = ["unverified", "single_source", "multi_source_verified"]
+
+IOC_TYPES = [
+    "COMMIT_SHA", "FILE_PATH", "API_KEY", "SECRET", "IP_ADDRESS",
+    "DOMAIN", "PACKAGE_NAME", "ACTOR_USERNAME", "MALICIOUS_URL",
+    "WORKFLOW_FILE", "BRANCH_NAME", "TAG_NAME", "RELEASE_NAME", "OTHER",
+]
+
+
+def _now_iso():
+    """Return the current UTC time as an ISO 8601 string ending in ``Z``.
+
+    ``isoformat()`` on a timezone-aware datetime already appends ``+00:00``;
+    adding ``Z`` on top of that produced an invalid double-designator value
+    such as ``2024-01-15T00:00:00+00:00Z``.  The timestamp is therefore
+    formatted explicitly, e.g. ``2024-01-15T00:00:00Z``.
+    """
+    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _sha256(content: str) -> str:
+    """Return the hex SHA-256 digest of *content* encoded as UTF-8."""
+    digest = hashlib.sha256()
+    digest.update(content.encode("utf-8"))
+    return digest.hexdigest()
+
+
+class EvidenceStore:
+    """JSON-file-backed registry of forensic evidence.
+
+    The document has three sections: ``metadata``, ``evidence`` (a list of
+    entries, each stored together with the SHA-256 of its content) and
+    ``chain_of_custody`` (one record per ``add``).
+    """
+
+    def __init__(self, filepath: str):
+        """Load the store at *filepath*, or start an empty one if it is absent.
+
+        Exits the process with status 1 when the file exists but cannot be
+        read, is not valid UTF-8 JSON, or does not hold a JSON object.
+        """
+        self.filepath = filepath
+        created = _now_iso()
+        self.data = {
+            "metadata": {
+                "version": "2.0",
+                "created_at": created,
+                "last_updated": created,
+                "investigation": "",
+                "target_repo": "",
+            },
+            "evidence": [],
+            "chain_of_custody": [],
+        }
+        if not os.path.exists(filepath):
+            return
+        try:
+            with open(filepath, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+        except (ValueError, OSError) as e:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            print(f"Error loading evidence store '{filepath}': {e}", file=sys.stderr)
+            print("Hint: The file might be corrupted. Check for manual edits or syntax errors.", file=sys.stderr)
+            sys.exit(1)
+        if not isinstance(loaded, dict):
+            print(f"Error loading evidence store '{filepath}': top-level JSON value is not an object", file=sys.stderr)
+            sys.exit(1)
+        # Hand-edited or partial stores may lack a section; fill it in so the
+        # other methods never fail with a KeyError.
+        for section, default in self.data.items():
+            loaded.setdefault(section, default)
+        self.data = loaded
+
+    def _save(self):
+        """Persist the store, replacing the previous file atomically.
+
+        The document is written to a sibling temporary file first so that a
+        failure while serializing cannot truncate the existing evidence file.
+        """
+        self.data["metadata"]["last_updated"] = _now_iso()
+        tmp_path = f"{self.filepath}.tmp"
+        try:
+            with open(tmp_path, "w", encoding="utf-8") as f:
+                json.dump(self.data, f, indent=2, ensure_ascii=False)
+            os.replace(tmp_path, self.filepath)
+        except Exception:
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+            raise
+
+    def _next_id(self) -> str:
+        """Return the next sequential evidence ID, e.g. ``EV-0001``."""
+        return f"EV-{len(self.data['evidence']) + 1:04d}"
+
+    @staticmethod
+    def _md_cell(value) -> str:
+        """Render *value* as text that cannot break a Markdown table row."""
+        text = "" if value is None else str(value)
+        return text.replace("|", "\\|").replace("\r", "").replace("\n", " ")
+
+    def add(
+        self,
+        source: str,
+        content: str,
+        evidence_type: str,
+        actor: str = None,
+        url: str = None,
+        timestamp: str = None,
+        ioc_type: str = None,
+        verification: str = "unverified",
+        notes: str = None,
+    ) -> str:
+        """Append an evidence entry plus its custody record, save, return the ID.
+
+        The entry stores the SHA-256 of *content* so later ``verify_integrity``
+        calls can detect modification.
+        """
+        evidence_id = self._next_id()
+        entry = {
+            "id": evidence_id,
+            "type": evidence_type,
+            "source": source,
+            "content": content,
+            "content_sha256": _sha256(content),
+            "actor": actor,
+            "url": url,
+            "event_timestamp": timestamp,
+            "collected_at": _now_iso(),
+            "ioc_type": ioc_type,
+            "verification": verification,
+            "notes": notes,
+        }
+        self.data["evidence"].append(entry)
+        self.data["chain_of_custody"].append({
+            "action": "add",
+            "evidence_id": evidence_id,
+            "timestamp": _now_iso(),
+            "source": source,
+        })
+        self._save()
+        return evidence_id
+
+    def list_evidence(self, filter_type: str = None, filter_actor: str = None):
+        """Return evidence entries, optionally restricted by exact type and/or actor."""
+        results = self.data["evidence"]
+        if filter_type:
+            results = [e for e in results if e.get("type") == filter_type]
+        if filter_actor:
+            results = [e for e in results if e.get("actor") == filter_actor]
+        return results
+
+    def verify_integrity(self):
+        """Re-compute SHA-256 for all entries and report mismatches.
+
+        Returns a list of dicts with ``id``, ``stored_sha256`` and
+        ``computed_sha256``; an empty list means every entry matched.  An
+        entry with no content is hashed as the empty string, so it is
+        reported as a mismatch rather than aborting the whole check.
+        """
+        issues = []
+        for entry in self.data["evidence"]:
+            expected = _sha256(entry.get("content") or "")
+            stored = entry.get("content_sha256", "")
+            if expected != stored:
+                issues.append({
+                    "id": entry.get("id", "?"),
+                    "stored_sha256": stored,
+                    "computed_sha256": expected,
+                })
+        return issues
+
+    def query(self, keyword: str):
+        """Search for keyword in content, source, actor, or url (case-insensitive)."""
+        keyword_lower = keyword.lower()
+        searched_fields = ("content", "source", "actor", "url")
+        return [
+            e for e in self.data["evidence"]
+            if any(keyword_lower in (e.get(field) or "").lower() for field in searched_fields)
+        ]
+
+    def export_markdown(self) -> str:
+        """Return the evidence registry and chain of custody as Markdown tables.
+
+        Cell values are escaped so that ``|`` or newlines inside a source or
+        actor string cannot split a table row.
+        """
+        cell = self._md_cell
+        lines = [
+            "# Evidence Registry",
+            "",
+            f"**Store**: `{self.filepath}`",
+            f"**Last Updated**: {self.data['metadata'].get('last_updated', 'N/A')}",
+            f"**Total Evidence Items**: {len(self.data['evidence'])}",
+            "",
+            "| ID | Type | Source | Actor | Verification | Event Timestamp | URL |",
+            "|----|------|--------|-------|--------------|-----------------|-----|",
+        ]
+        for e in self.data["evidence"]:
+            url = e.get("url") or ""
+            # A raw pipe in the link target would end the cell early.
+            safe_url = url.replace("|", "%7C")
+            url_display = f"[link]({safe_url})" if url else ""
+            lines.append(
+                f"| {cell(e.get('id'))} | {cell(e.get('type'))} | {cell(e.get('source'))} "
+                f"| {cell(e.get('actor'))} | {cell(e.get('verification'))} "
+                f"| {cell(e.get('event_timestamp'))} | {url_display} |"
+            )
+        lines.append("")
+        lines.append("## Chain of Custody")
+        lines.append("")
+        lines.append("| Evidence ID | Action | Timestamp | Source |")
+        lines.append("|-------------|--------|-----------|--------|")
+        for c in self.data["chain_of_custody"]:
+            lines.append(
+                f"| {cell(c.get('evidence_id'))} | {cell(c.get('action'))} "
+                f"| {cell(c.get('timestamp'))} | {cell(c.get('source'))} |"
+            )
+        return "\n".join(lines)
+
+    def summary(self) -> dict:
+        """Return counts by type and verification state plus the sorted unique actors."""
+        by_type = {}
+        by_verification = {}
+        actors = set()
+        for e in self.data["evidence"]:
+            t = e.get("type", "unknown")
+            by_type[t] = by_type.get(t, 0) + 1
+            v = e.get("verification", "unverified")
+            by_verification[v] = by_verification.get(v, 0) + 1
+            if e.get("actor"):
+                actors.add(e["actor"])
+        return {
+            "total": len(self.data["evidence"]),
+            "by_type": by_type,
+            "by_verification": by_verification,
+            "unique_actors": sorted(actors),
+        }
+
+
+def main():
+    """Parse the command line and run one evidence-store subcommand.
+
+    Prints help and exits with status 0 when no subcommand is given.  The
+    ``verify`` subcommand exits with status 1 when any stored hash does not
+    match the recomputed one.
+    """
+    parser = argparse.ArgumentParser(
+        description="OSS Forensics Evidence Store Manager v2.0",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--store", default="evidence.json", help="Path to evidence JSON file (default: evidence.json)")
+
+    subparsers = parser.add_subparsers(dest="command", metavar="COMMAND")
+
+    # --- add ---
+    add_p = subparsers.add_parser("add", help="Add a new evidence entry")
+    add_p.add_argument("--source", required=True, help="Where this evidence came from (e.g. 'git fsck', 'GH API /commits')")
+    add_p.add_argument("--content", required=True, help="The evidence content (commit SHA, API response excerpt, etc.)")
+    add_p.add_argument("--type", required=True, choices=EVIDENCE_TYPES, dest="evidence_type", help="Evidence type")
+    add_p.add_argument("--actor", help="GitHub handle or email of associated actor")
+    add_p.add_argument("--url", help="URL to original source")
+    add_p.add_argument("--timestamp", help="When the event occurred (ISO 8601)")
+    add_p.add_argument("--ioc-type", choices=IOC_TYPES, help="IOC subtype (for --type ioc)")
+    add_p.add_argument("--verification", choices=VERIFICATION_STATES, default="unverified")
+    add_p.add_argument("--notes", help="Additional investigator notes")
+    add_p.add_argument("--quiet", action="store_true", help="Suppress success message")
+
+    # --- list ---
+    list_p = subparsers.add_parser("list", help="List all evidence entries")
+    list_p.add_argument("--type", dest="filter_type", choices=EVIDENCE_TYPES, help="Filter by type")
+    list_p.add_argument("--actor", dest="filter_actor", help="Filter by actor")
+
+    # --- verify ---
+    subparsers.add_parser("verify", help="Verify SHA-256 integrity of all evidence content")
+
+    # --- query ---
+    query_p = subparsers.add_parser("query", help="Search evidence by keyword")
+    query_p.add_argument("keyword", help="Keyword to search for")
+
+    # --- export ---
+    subparsers.add_parser("export", help="Export evidence as a Markdown table (stdout)")
+
+    # --- summary ---
+    subparsers.add_parser("summary", help="Print investigation statistics")
+
+    args = parser.parse_args()
+
+    # No subcommand given: show usage instead of failing.
+    if not args.command:
+        parser.print_help()
+        sys.exit(0)
+
+    # Constructing the store loads the file at --store if it already exists.
+    store = EvidenceStore(args.store)
+
+    if args.command == "add":
+        eid = store.add(
+            source=args.source,
+            content=args.content,
+            evidence_type=args.evidence_type,
+            actor=args.actor,
+            url=args.url,
+            timestamp=args.timestamp,
+            ioc_type=args.ioc_type,
+            verification=args.verification,
+            notes=args.notes,
+        )
+        if not getattr(args, "quiet", False):
+            print(f"✓ Added evidence: {eid}")
+
+    elif args.command == "list":
+        items = store.list_evidence(
+            filter_type=getattr(args, "filter_type", None),
+            filter_actor=getattr(args, "filter_actor", None),
+        )
+        if not items:
+            print("No evidence found.")
+        for e in items:
+            # Actor and URL are optional, so they are only shown when set.
+            actor_str = f" | actor: {e['actor']}" if e.get("actor") else ""
+            url_str = f" | {e['url']}" if e.get("url") else ""
+            print(f"[{e['id']}] {e['type']:12s} | {e['verification']:20s} | {e['source']}{actor_str}{url_str}")
+
+    elif args.command == "verify":
+        issues = store.verify_integrity()
+        if not issues:
+            print(f"✓ All {len(store.data['evidence'])} evidence entries passed SHA-256 integrity check.")
+        else:
+            print(f"✗ {len(issues)} integrity issue(s) detected:")
+            for i in issues:
+                # Only the first 16 hex characters of each hash are printed.
+                print(f"  [{i['id']}] stored={i['stored_sha256'][:16]}... computed={i['computed_sha256'][:16]}...")
+            # Non-zero exit status signals the integrity failure to callers.
+            sys.exit(1)
+
+    elif args.command == "query":
+        results = store.query(args.keyword)
+        print(f"Found {len(results)} result(s) for '{args.keyword}':")
+        for e in results:
+            # Content is truncated to 80 characters for display.
+            print(f"  [{e['id']}] {e['type']} | {e['source']} | {e['content'][:80]}")
+
+    elif args.command == "export":
+        print(store.export_markdown())
+
+    elif args.command == "summary":
+        s = store.summary()
+        print(f"Total evidence items : {s['total']}")
+        print(f"By type              : {json.dumps(s['by_type'], indent=2)}")
+        print(f"By verification      : {json.dumps(s['by_verification'], indent=2)}")
+        print(f"Unique actors        : {s['unique_actors']}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,151 @@
+# Forensic Investigation Report
+
+> **Instructions**: Fill in all sections. Every factual claim must cite at least one `[EV-XXXX]` evidence ID.
+> Remove placeholder text and instruction notes before finalizing. Redact all secrets to `[REDACTED]`.
+
+---
+
+## Executive Summary
+
+**Target Repository**: `OWNER/REPO`
+**Investigation Period**: YYYY-MM-DD to YYYY-MM-DD
+**Verdict**: <!-- Compromised / Clean / Inconclusive -->
+**Confidence Level**: <!-- High / Medium / Low -->
+**Report Date**: YYYY-MM-DD
+**Investigator**: <!-- Agent session ID or analyst name -->
+
+<!-- One paragraph: what was investigated, what was found, what is recommended. -->
+
+---
+
+## Timeline of Events
+
+> All timestamps in UTC. Each event must cite at least one evidence ID.
+
+| Timestamp (UTC) | Event | Evidence IDs | Source |
+|-----------------|-------|--------------|--------|
+| YYYY-MM-DDTHH:MM:SSZ | _Describe event_ | [EV-XXXX] | git / gh_api / gh_archive / web_archive |
+| | | | |
+
+---
+
+## Validated Hypotheses
+
+### Hypothesis 1: <!-- Short title -->
+
+**Status**: <!-- VALIDATED / INCONCLUSIVE / REJECTED -->
+
+**Claim**: _Full statement of the hypothesis._
+
+**Supporting Evidence**:
+- [EV-XXXX]: _What this evidence shows_
+- [EV-YYYY]: _What this evidence shows_
+
+**Counter-Evidence Considered**: _What might disprove this, and why it was ruled out or not._
+
+**Confidence**: <!-- High / Medium / Low, and why -->
+
+---
+
+## Indicators of Compromise (IOC List)
+
+| Type | Value | Status | Evidence |
+|------|-------|--------|----------|
+| COMMIT_SHA | `abc123...` | Confirmed malicious | [EV-XXXX] |
+| ACTOR_USERNAME | `handle` | Suspected compromised | [EV-YYYY] |
+| FILE_PATH | `src/evil.js` | Confirmed malicious | [EV-ZZZZ] |
+| DOMAIN | `evil-cdn.io` | Confirmed C2 | [EV-WWWW] |
+
+---
+
+## Affected Versions
+
+| Version / Tag | Published | Contains Malicious Code | Evidence |
+|---------------|-----------|------------------------|----------|
+| `v1.2.3` | YYYY-MM-DD | Yes / No / Unknown | [EV-XXXX] |
+
+---
+
+## Evidence Registry
+
+> Generated by: `python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json export`
+
+<!-- Paste the Markdown table output from the evidence-store.py export command here -->
+
+| ID | Type | Source | Actor | Verification | Event Timestamp | URL |
+|----|------|--------|-------|--------------|-----------------|-----|
+| EV-0001 | | | | | | |
+
+---
+
+## Chain of Custody
+
+> Generated by: `python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json export`
+
+<!-- Paste the chain of custody section from the export output here -->
+
+| Evidence ID | Action | Timestamp | Source |
+|-------------|--------|-----------|--------|
+| EV-0001 | add | | |
+
+---
+
+## Technical Findings
+
+### Git History Analysis
+
+_Summarize findings from local git analysis: dangling commits, reflog anomalies, unsigned commits, binary additions, etc._
+
+### GitHub API Analysis
+
+_Summarize findings from GitHub REST API: deleted PRs/issues, contributor changes, release anomalies, etc._
+
+### GitHub Archive Analysis
+
+_Summarize findings from BigQuery: force-push events, delete events, workflow anomalies, member changes, etc._
+_Note: If BigQuery was unavailable, state this explicitly._
+
+### Wayback Machine Analysis
+
+_Summarize findings from archive.org: recovered deleted pages, historical content differences, etc._
+
+### IOC Enrichment
+
+_Summarize enrichment results: WHOIS data for domains, recovered commit content, actor account analysis, etc._
+
+---
+
+## Recommendations
+
+### Immediate Actions (If Compromise Confirmed)
+
+- [ ] Rotate all GitHub tokens, API keys, and credentials that may have been exposed
+- [ ] Pin dependency versions to hashes in all affected packages
+- [ ] Publish a security advisory / CVE if applicable
+- [ ] Notify downstream users/package registries (npm, PyPI, etc.)
+- [ ] Revoke access for the compromised account and re-secure with hardware 2FA
+- [ ] Audit all CI/CD workflow files for unauthorized modifications
+- [ ] Review all releases published during the compromise window
+
+### Monitoring Recommendations
+
+- [ ] Enable branch protection on `main`/`master` (require code review, disallow force-push)
+- [ ] Enable required commit signing (GPG/SSH)
+- [ ] Set up GitHub audit log streaming for future monitoring
+- [ ] Pin critical dependencies to known-good SHAs in lock files
+
+---
+
+## Limitations and Caveats
+
+- _List any data sources that were unavailable (e.g., no BigQuery access)_
+- _Note any evidence that is single-source only (not independently verified)_
+- _Note any hypotheses that could not be confirmed or denied_
+
+---
+
+## References
+
+- Evidence store: `evidence.json` (SHA-256 integrity: run `python3 SKILL_DIR/scripts/evidence-store.py --store evidence.json verify`)
+- Related issues: <!-- Link to GitHub issues, CVEs, security advisories -->
+- RAPTOR framework: https://github.com/gadievron/raptor
@@ -0,0 +1,43 @@
+# Malicious Package Investigation Report
+
+---
+
+## 📦 Package Metadata
+- **Package Name**: 
+- **Registry**: [NPM / PyPI / RubyGems / etc.]
+- **Affected Versions**: 
+- **Malicious Version(s)**: 
+- **Downloads at Time of Detection**: 
+- **Package URL**: 
+
+---
+
+## 🚩 Indicators of Compromise (IOCs)
+- **Malicious URL(s)**: 
+- **Exfiltrated Data Types**: [Environment variables, ~/.ssh/id_rsa, /etc/shadow, etc.]
+- **Exfiltration Method**: [DNS tunneling, HTTP POST to C2, etc.]
+- **C2 IP/Domain**: 
+
+---
+
+## 🛠️ Analysis Summary
+- **Primary Mechanism**: [Typosquatting / Dependency Confusion / Maintainer Takeover]
+- **Behavior Description**: 
+  - [Example: Installs a postinstall script that exfiltrates environment variables.]
+  - [Example: Patches `setup.py` to download a secondary payload.]
+
+---
+
+## 🔍 Evidence Registry
+| Evidence ID | Type | Source | Description |
+|-------------|------|--------|-------------|
+| EV-XXXX     | ioc  | NPM    | Package install script snapshot |
+| EV-YYYY     | web_archive | Wayback | Historical version comparison |
+
+---
+
+## 🛡️ Recommended Mitigations
+1. [ ] Unpublish/Report the package to the registry.
+2. [ ] Audit `package-lock.json` or `requirements.txt` across all projects.
+3. [ ] Rotate secrets exfiltrated via environment variables.
+4. [ ] Pin specific hashes (SHASUM) for mission-critical dependencies.
@@ -27,25 +27,16 @@ from pathlib import Path
 import fire
 import yaml

-# Load .env from ~/.hermes/.env first, then project root as dev fallback
-from dotenv import load_dotenv
-
+# Load .env from ~/.hermes/.env first, then project root as dev fallback.
+# User-managed env files should override stale shell exports on restart.
 _hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
-_user_env = _hermes_home / ".env"
 _project_env = Path(__file__).parent / '.env'

-if _user_env.exists():
-    try:
-        load_dotenv(dotenv_path=_user_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_user_env, encoding="latin-1")
-    print(f"✅ Loaded environment variables from {_user_env}")
-elif _project_env.exists():
-    try:
-        load_dotenv(dotenv_path=_project_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_project_env, encoding="latin-1")
-    print(f"✅ Loaded environment variables from {_project_env}")
+from hermes_cli.env_loader import load_hermes_dotenv
+
+_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
+for _env_path in _loaded_env_paths:
+    print(f"✅ Loaded environment variables from {_env_path}")

 # Set terminal working directory to tinker-atropos submodule
 # This ensures terminal commands run in the right context for RL work
@@ -21,6 +21,8 @@ Usage:
 """

 import atexit
+import asyncio
+import base64
 import concurrent.futures
 import copy
 import hashlib
@@ -31,6 +33,7 @@ import os
 import random
 import re
 import sys
+import tempfile
 import time
 import threading
 import weakref
@@ -42,24 +45,16 @@ import fire
 from datetime import datetime
 from pathlib import Path

-# Load .env from ~/.hermes/.env first, then project root as dev fallback
-from dotenv import load_dotenv
+# Load .env from ~/.hermes/.env first, then project root as dev fallback.
+# User-managed env files should override stale shell exports on restart.
+from hermes_cli.env_loader import load_hermes_dotenv

 _hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
-_user_env = _hermes_home / ".env"
 _project_env = Path(__file__).parent / '.env'
-if _user_env.exists():
-    try:
-        load_dotenv(dotenv_path=_user_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_user_env, encoding="latin-1")
-    logger.info("Loaded environment variables from %s", _user_env)
-elif _project_env.exists():
-    try:
-        load_dotenv(dotenv_path=_project_env, encoding="utf-8")
-    except UnicodeDecodeError:
-        load_dotenv(dotenv_path=_project_env, encoding="latin-1")
-    logger.info("Loaded environment variables from %s", _project_env)
+_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
+if _loaded_env_paths:
+    for _env_path in _loaded_env_paths:
+        logger.info("Loaded environment variables from %s", _env_path)
 else:
    logger.info("No .env file found. Using system environment variables.")

@@ -95,6 +90,7 @@ from agent.display import (
    KawaiiSpinner, build_tool_preview as _build_tool_preview,
    get_cute_tool_message as _get_cute_tool_message_impl,
    _detect_tool_failure,
+    get_tool_emoji as _get_tool_emoji,
 )
 from agent.trajectory import (
    convert_scratchpad_to_think, has_incomplete_scratchpad,
@@ -209,6 +205,33 @@ _NEVER_PARALLEL_TOOLS = frozenset({"clarify"})
 # Maximum number of concurrent worker threads for parallel tool execution.
 _MAX_TOOL_WORKERS = 8

+# Command verbs that suggest a terminal command may modify or delete files.
+_DESTRUCTIVE_PATTERNS = re.compile(
+    r"""(?:^|\s|&&|\|\||;|`)(?:
+        rm\s|rmdir\s|
+        mv\s|
+        sed\s+-i|
+        truncate\s|
+        dd\s|
+        shred\s|
+        git\s+(?:reset|clean|checkout)\s
+    )""",
+    re.VERBOSE,
+)
+# A single ">" redirect (overwrite); the append form ">>" is not matched.
+_REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]')
+
+
+def _is_destructive_command(cmd: str) -> bool:
+    """Best-effort check for terminal commands that may alter or remove files.
+
+    Returns True when *cmd* contains one of the known destructive verbs or
+    an overwriting ``>`` redirect.  Empty input is never destructive.
+    """
+    if not cmd:
+        return False
+    detectors = (_DESTRUCTIVE_PATTERNS, _REDIRECT_OVERWRITE)
+    return any(pattern.search(cmd) is not None for pattern in detectors)
+

 def _inject_honcho_turn_context(content, turn_context: str):
    """Append Honcho recall to the current-turn user message without mutating history.
@@ -504,6 +527,11 @@ class AIAgent:
        self._persist_user_message_idx = None
        self._persist_user_message_override = None

+        # Cache anthropic image-to-text fallbacks per image payload/URL so a
+        # single tool loop does not repeatedly re-run auxiliary vision on the
+        # same image history.
+        self._anthropic_image_fallback_cache: Dict[str, str] = {}
+
        # Initialize LLM client via centralized provider router.
        # The router handles auth resolution, base URL, headers, and
        # Codex/Anthropic wrapping for all known providers.
@@ -3034,13 +3062,156 @@ class AIAgent:

    # ── End provider fallback ──────────────────────────────────────────────

+    @staticmethod
+    def _content_has_image_parts(content: Any) -> bool:
+        """Report whether *content* is a list holding at least one image part.
+
+        An image part is a dict whose ``type`` is ``image_url`` or
+        ``input_image``.  Anything that is not a list yields False.
+        """
+        image_types = {"image_url", "input_image"}
+        return isinstance(content, list) and any(
+            isinstance(item, dict) and item.get("type") in image_types
+            for item in content
+        )
+
+    @staticmethod
+    def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
+        """Write the base64 payload of a ``data:`` URL to a temporary image file.
+
+        The file suffix is taken from the MIME type in the URL header and
+        falls back to ``.jpg``.  Returns ``(path_as_str, path)``; the file is
+        created with ``delete=False``, so the caller must remove it.
+
+        Raises:
+            binascii.Error: (a ``ValueError`` subclass) if the payload is not
+                valid base64.
+        """
+        header, _, data = str(image_url or "").partition(",")
+        mime = "image/jpeg"
+        if header.startswith("data:"):
+            mime_part = header[len("data:"):].split(";", 1)[0].strip()
+            if mime_part.startswith("image/"):
+                mime = mime_part
+        suffix = {
+            "image/png": ".png",
+            "image/gif": ".gif",
+            "image/webp": ".webp",
+            "image/jpeg": ".jpg",
+            "image/jpg": ".jpg",
+        }.get(mime, ".jpg")
+        # Decode before creating the file: with delete=False a decode error
+        # after creation would leave an orphaned empty temp file behind.
+        payload = base64.b64decode(data)
+        with tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False) as tmp:
+            tmp.write(payload)
+        path = Path(tmp.name)
+        return str(path), path
+
+    def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
+        cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
+        cached = self._anthropic_image_fallback_cache.get(cache_key)
+        if cached:
+            return cached
+
+        role_label = {
+            "assistant": "assistant",
+            "tool": "tool result",
+        }.get(role, "user")
+        analysis_prompt = (
+            "Describe everything visible in this image in thorough detail. "
+            "Include any text, code, UI, data, objects, people, layout, colors, "
+            "and any other notable visual information."
+        )
+
+        vision_source = str(image_url or "")
+        cleanup_path: Optional[Path] = None
+        if vision_source.startswith("data:"):
+            vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
+
+        description = ""
+        try:
+            from tools.vision_tools import vision_analyze_tool
+
+            result_json = asyncio.run(
+                vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
+            )
+            result = json.loads(result_json) if isinstance(result_json, str) else {}
+            description = (result.get("analysis") or "").strip()
+        except Exception as e:
+            description = f"Image analysis failed: {e}"
+        finally:
+            if cleanup_path and cleanup_path.exists():
+                try:
+                    cleanup_path.unlink()
+                except OSError:
+                    pass
+
+        if not description:
+            description = "Image analysis failed."
+
+        note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
+        if vision_source and not str(image_url or "").startswith("data:"):
+            note += (
+                f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
+            )
+
+        self._anthropic_image_fallback_cache[cache_key] = note
+        return note
+
+    def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
+        """Flatten multimodal *content* into one text string.
+
+        Content without image parts is returned unchanged.  Otherwise each
+        image part is replaced by a text note and the result is all image
+        notes (joined by blank lines) followed by all text parts (joined by
+        newlines).  A fixed placeholder string is returned when nothing
+        usable remains.
+        """
+        if not self._content_has_image_parts(content):
+            return content
+
+        text_parts: List[str] = []
+        image_notes: List[str] = []
+        for part in content:
+            # Bare strings count as text; blank ones are dropped.
+            if isinstance(part, str):
+                if part.strip():
+                    text_parts.append(part.strip())
+                continue
+            if not isinstance(part, dict):
+                continue
+
+            ptype = part.get("type")
+            if ptype in {"text", "input_text"}:
+                text = str(part.get("text", "") or "").strip()
+                if text:
+                    text_parts.append(text)
+                continue
+
+            if ptype in {"image_url", "input_image"}:
+                # "image_url" may be a dict with a "url" key or a plain value.
+                image_data = part.get("image_url", {})
+                image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
+                if image_url:
+                    image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
+                else:
+                    image_notes.append("[An image was attached but no image source was available.]")
+                continue
+
+            # Any other part type: keep its "text" field if it has one.
+            text = str(part.get("text", "") or "").strip()
+            if text:
+                text_parts.append(text)
+
+        # Image notes go first, then the accompanying text.
+        prefix = "\n\n".join(note for note in image_notes if note).strip()
+        suffix = "\n".join(text for text in text_parts if text).strip()
+        if prefix and suffix:
+            return f"{prefix}\n\n{suffix}"
+        if prefix:
+            return prefix
+        if suffix:
+            return suffix
+        return "[A multimodal message was converted to text for Anthropic compatibility.]"
+
+    def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
+        """Return *api_messages* with image content converted to text.
+
+        When no message carries image parts the input list is returned as-is.
+        Otherwise a deep copy is made and every dict message in the copy has
+        its ``content`` passed through ``_preprocess_anthropic_content``; the
+        caller's list is left untouched.
+        """
+        needs_rewrite = False
+        for candidate in api_messages:
+            if isinstance(candidate, dict) and self._content_has_image_parts(candidate.get("content")):
+                needs_rewrite = True
+                break
+        if not needs_rewrite:
+            return api_messages
+
+        converted = copy.deepcopy(api_messages)
+        for entry in converted:
+            if isinstance(entry, dict):
+                role = str(entry.get("role", "user") or "user")
+                entry["content"] = self._preprocess_anthropic_content(entry.get("content"), role)
+        return converted
+
    def _build_api_kwargs(self, api_messages: list) -> dict:
        """Build the keyword arguments dict for the active API mode."""
        if self.api_mode == "anthropic_messages":
            from agent.anthropic_adapter import build_anthropic_kwargs
+            anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
            return build_anthropic_kwargs(
                model=self.model,
-                messages=api_messages,
+                messages=anthropic_messages,
                tools=self.tools,
                max_tokens=self.max_tokens,
                reasoning_config=self.reasoning_config,
@@ -3158,8 +3329,7 @@ class AIAgent:
            extra_body["provider"] = provider_preferences
        _is_nous = "nousresearch" in self.base_url.lower()

-        _is_mistral = "api.mistral.ai" in self.base_url.lower()
-        if (_is_openrouter or _is_nous) and not _is_mistral:
+        if self._supports_reasoning_extra_body():
            if self.reasoning_config is not None:
                rc = dict(self.reasoning_config)
                # Nous Portal requires reasoning enabled — don't send
@@ -3183,6 +3353,32 @@ class AIAgent:

        return api_kwargs

+    def _supports_reasoning_extra_body(self) -> bool:
+        """Decide whether ``reasoning`` may be sent in ``extra_body``.
+
+        OpenRouter forwards unknown extra_body fields to upstream providers,
+        and some of those reject ``reasoning`` with a 400.  The field is
+        therefore allowed only for Nous Portal base URLs and, on OpenRouter,
+        for model IDs from a known reasoning-capable family.
+        """
+        endpoint = (self.base_url or "").lower()
+        if "nousresearch" in endpoint:
+            return True
+        if "openrouter" not in endpoint or "api.mistral.ai" in endpoint:
+            return False
+
+        model_name = (self.model or "").lower()
+        # str.startswith accepts a tuple and is True if any prefix matches.
+        return model_name.startswith((
+            "deepseek/",
+            "anthropic/",
+            "openai/",
+            "x-ai/",
+            "google/gemini-2",
+            "qwen/qwen3",
+        ))
+
    def _build_assistant_message(self, assistant_message, finish_reason: str) -> dict:
        """Build a normalized assistant message dict from an API response message.

@@ -3202,8 +3398,7 @@ class AIAgent:
                reasoning_text = combined or None

        if reasoning_text and self.verbose_logging:
-            preview = reasoning_text[:100] + "..." if len(reasoning_text) > 100 else reasoning_text
-            logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {preview}")
+            logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")

        if reasoning_text and self.reasoning_callback:
            try:
@@ -3622,6 +3817,8 @@ class AIAgent:
            return handle_function_call(
                function_name, function_args, effective_task_id,
                enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
+                honcho_manager=self._honcho,
+                honcho_session_key=self._honcho_session_key,
            )

    def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
@@ -3672,6 +3869,18 @@ class AIAgent:
                except Exception:
                    pass

+            # Checkpoint before destructive terminal commands
+            if function_name == "terminal" and self._checkpoint_mgr.enabled:
+                try:
+                    cmd = function_args.get("command", "")
+                    if _is_destructive_command(cmd):
+                        cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
+                        self._checkpoint_mgr.ensure_checkpoint(
+                            cwd, f"before terminal: {cmd[:60]}"
+                        )
+                except Exception:
+                    pass
+
            parsed_calls.append((tool_call, function_name, function_args))

        # ── Logging / callbacks ──────────────────────────────────────────
@@ -3680,8 +3889,12 @@ class AIAgent:
            print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
            for i, (tc, name, args) in enumerate(parsed_calls, 1):
                args_str = json.dumps(args, ensure_ascii=False)
-                args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
-                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
+                if self.verbose_logging:
+                    print(f"  📞 Tool {i}: {name}({list(args.keys())})")
+                    print(f"     Args: {args_str}")
+                else:
+                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
+                    print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")

        for _, name, args in parsed_calls:
            if self.tool_progress_callback:
@@ -3746,17 +3959,20 @@ class AIAgent:
                    logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)

                if self.verbose_logging:
-                    result_preview = function_result[:200] if len(function_result) > 200 else function_result
                    logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
-                    logging.debug(f"Tool result preview: {result_preview}...")
+                    logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

            # Print cute message per tool
            if self.quiet_mode:
                cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
                print(f"  {cute_msg}")
            elif not self.quiet_mode:
-                response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
-                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")
+                if self.verbose_logging:
+                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
+                    print(f"     Result: {function_result}")
+                else:
+                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
+                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")

            # Truncate oversized results
            MAX_TOOL_RESULT_CHARS = 100_000
@@ -3832,8 +4048,12 @@ class AIAgent:

            if not self.quiet_mode:
                args_str = json.dumps(function_args, ensure_ascii=False)
-                args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
-                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
+                if self.verbose_logging:
+                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
+                    print(f"     Args: {args_str}")
+                else:
+                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
+                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")

            if self.tool_progress_callback:
                try:
@@ -3854,6 +4074,18 @@ class AIAgent:
                except Exception:
                    pass  # never block tool execution

+            # Checkpoint before destructive terminal commands
+            if function_name == "terminal" and self._checkpoint_mgr.enabled:
+                try:
+                    cmd = function_args.get("command", "")
+                    if _is_destructive_command(cmd):
+                        cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
+                        self._checkpoint_mgr.ensure_checkpoint(
+                            cwd, f"before terminal: {cmd[:60]}"
+                        )
+                except Exception:
+                    pass  # never block tool execution
+
            tool_start_time = time.time()

            if function_name == "todo":
@@ -3942,23 +4174,7 @@ class AIAgent:
                        self._vprint(f"  {cute_msg}")
            elif self.quiet_mode and self._stream_callback is None:
                face = random.choice(KawaiiSpinner.KAWAII_WAITING)
-                tool_emoji_map = {
-                    'web_search': '🔍', 'web_extract': '📄', 'web_crawl': '🕸️',
-                    'terminal': '💻', 'process': '⚙️',
-                    'read_file': '📖', 'write_file': '✍️', 'patch': '🔧', 'search_files': '🔎',
-                    'browser_navigate': '🌐', 'browser_snapshot': '📸',
-                    'browser_click': '👆', 'browser_type': '⌨️',
-                    'browser_scroll': '📜', 'browser_back': '◀️',
-                    'browser_press': '⌨️', 'browser_close': '🚪',
-                    'browser_get_images': '🖼️', 'browser_vision': '👁️',
-                    'image_generate': '🎨', 'text_to_speech': '🔊',
-                    'vision_analyze': '👁️', 'mixture_of_agents': '🧠',
-                    'skills_list': '📚', 'skill_view': '📚',
-                    'cronjob': '⏰',
-                    'send_message': '📨', 'todo': '📋', 'memory': '🧠', 'session_search': '🔍',
-                    'clarify': '❓', 'execute_code': '🐍', 'delegate_task': '🔀',
-                }
-                emoji = tool_emoji_map.get(function_name, '⚡')
+                emoji = _get_tool_emoji(function_name)
                preview = _build_tool_preview(function_name, function_args) or function_name
                if len(preview) > 30:
                    preview = preview[:27] + "..."
@@ -3969,6 +4185,8 @@ class AIAgent:
                    function_result = handle_function_call(
                        function_name, function_args, effective_task_id,
                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
+                        honcho_manager=self._honcho,
+                        honcho_session_key=self._honcho_session_key,
                    )
                    _spinner_result = function_result
                except Exception as tool_error:
@@ -3983,13 +4201,17 @@ class AIAgent:
                    function_result = handle_function_call(
                        function_name, function_args, effective_task_id,
                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
+                        honcho_manager=self._honcho,
+                        honcho_session_key=self._honcho_session_key,
                    )
                except Exception as tool_error:
                    function_result = f"Error executing tool '{function_name}': {tool_error}"
                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
                tool_duration = time.time() - tool_start_time

-            result_preview = function_result[:200] if len(function_result) > 200 else function_result
+            result_preview = function_result if self.verbose_logging else (
+                function_result[:200] if len(function_result) > 200 else function_result
+            )

            # Log tool errors to the persistent error log so [error] tags
            # in the UI always have a corresponding detailed entry on disk.
@@ -3999,7 +4221,7 @@ class AIAgent:

            if self.verbose_logging:
                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
-                logging.debug(f"Tool result preview: {result_preview}...")
+                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

            # Guard against tools returning absurdly large content that would
            # blow up the context window. 100K chars ≈ 25K tokens — generous
@@ -4022,8 +4244,12 @@ class AIAgent:
            messages.append(tool_msg)

            if not self.quiet_mode:
-                response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
-                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
+                if self.verbose_logging:
+                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
+                    print(f"     Result: {function_result}")
+                else:
+                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
+                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")

            if self._interrupt_requested and i < len(assistant_message.tool_calls):
                remaining = len(assistant_message.tool_calls) - i
@@ -4121,9 +4347,8 @@ class AIAgent:
                    api_messages.insert(sys_offset + idx, pfm.copy())

            summary_extra_body = {}
-            _is_openrouter = "openrouter" in self.base_url.lower()
            _is_nous = "nousresearch" in self.base_url.lower()
-            if _is_openrouter or _is_nous:
+            if self._supports_reasoning_extra_body():
                if self.reasoning_config is not None:
                    summary_extra_body["reasoning"] = self.reasoning_config
                else:
@@ -4242,6 +4467,7 @@ class AIAgent:
        task_id: str = None,
        stream_callback: Optional[callable] = None,
        persist_user_message: Optional[str] = None,
+        sync_honcho: bool = True,
    ) -> Dict[str, Any]:
        """
        Run a complete conversation with tool calling until completion.
@@ -4257,6 +4483,8 @@ class AIAgent:
            persist_user_message: Optional clean user message to store in
                transcripts/history when user_message contains API-only
                synthetic prefixes.
+            sync_honcho: When False, skip writing the final synthetic turn back
+                to Honcho or queuing follow-up prefetch work.

        Returns:
            Dict: Complete conversation result with final response and message history
@@ -4901,6 +5129,22 @@ class AIAgent:
                        self.session_completion_tokens += completion_tokens
                        self.session_total_tokens += total_tokens
                        self.session_api_calls += 1
+
+                        # Persist token counts to session DB for /insights.
+                        # Gateway sessions persist via session_store.update_session()
+                        # after run_conversation returns, so only persist here for
+                        # CLI (and other non-gateway) platforms to avoid double-counting.
+                        if (self._session_db and self.session_id
+                                and getattr(self, 'platform', None) == 'cli'):
+                            try:
+                                self._session_db.update_token_counts(
+                                    self.session_id,
+                                    input_tokens=prompt_tokens,
+                                    output_tokens=completion_tokens,
+                                    model=self.model,
+                                )
+                            except Exception:
+                                pass  # never block the agent loop
                        
                        if self.verbose_logging:
                            logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
@@ -5275,7 +5519,10 @@ class AIAgent:

                # Handle assistant response
                if assistant_message.content and not self.quiet_mode:
-                    self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
+                    if self.verbose_logging:
+                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content}")
+                    else:
+                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")

                # Notify progress callback of model's thinking (used by subagent
                # delegation to relay the child's reasoning to the parent display).
@@ -5439,6 +5686,12 @@ class AIAgent:
                    invalid_json_args = []
                    for tc in assistant_message.tool_calls:
                        args = tc.function.arguments
+                        if isinstance(args, (dict, list)):
+                            tc.function.arguments = json.dumps(args)
+                            continue
+                        if args is not None and not isinstance(args, str):
+                            tc.function.arguments = str(args)
+                            args = tc.function.arguments
                        # Treat empty/whitespace strings as empty object
                        if not args or not args.strip():
                            tc.function.arguments = "{}"
@@ -5740,7 +5993,7 @@ class AIAgent:
        self._persist_session(messages, conversation_history)

        # Sync conversation to Honcho for user modeling
-        if final_response and not interrupted:
+        if final_response and not interrupted and sync_honcho:
            self._honcho_sync(original_user_message, final_response)
            self._queue_honcho_prefetch(original_user_message)

@@ -0,0 +1,389 @@
+#!/usr/bin/env python3
+"""Discord Voice Doctor — diagnostic tool for voice channel support.
+
+Checks all dependencies, configuration, and bot permissions needed
+for Discord voice mode to work correctly.
+
+Usage:
+    python scripts/discord-voice-doctor.py
+    .venv/bin/python scripts/discord-voice-doctor.py
+"""
+
+import os
+import sys
+import shutil
+from pathlib import Path
+
+# Resolve project root: this script's directory and its parent, and put the
+# parent first on the import search path.
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+# Hermes state directory: $HERMES_HOME when set, otherwise ~/.hermes.
+HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
+# dotenv file read by check_env_vars() before inspecting the environment.
+ENV_FILE = HERMES_HOME / ".env"
+
+# Status glyphs wrapped in ANSI colour escapes (92/91/93) and a reset (0):
+# a check mark, a ballot X and an exclamation mark respectively.
+OK = "\033[92m\u2713\033[0m"
+FAIL = "\033[91m\u2717\033[0m"
+WARN = "\033[93m!\033[0m"
+
+# Track whether discord.py is available for later sections
+# (set by check_packages(), read by check_system_tools()).
+_discord_available = False
+
+
+def mask(value):
+    """Return *value* with everything after its first four characters starred out.
+
+    Empty values and values shorter than eight characters are replaced
+    entirely by ``"****"``, so short secrets reveal nothing at all.
+    """
+    if value and len(value) >= 8:
+        visible = value[:4]
+        hidden = "*" * (len(value) - 4)
+        return f"{visible}{hidden}"
+    return "****"
+
+
+def check(label, ok, detail=""):
+    """Print one pass/fail result line and hand *ok* back to the caller.
+
+    The line starts with the OK glyph when *ok* is truthy and the FAIL glyph
+    otherwise; a non-empty *detail* is appended in parentheses.
+
+    Returns:
+        *ok*, unchanged, so callers can report and test in one expression.
+    """
+    pieces = [f"  {OK if ok else FAIL} {label}"]
+    if detail:
+        pieces.append(f"  ({detail})")
+    print("".join(pieces))
+    return ok
+
+
+def warn(label, detail=""):
+    """Print a warning line for *label*, with *detail* in parentheses when given."""
+    suffix = f"  ({detail})" if detail else ""
+    print(f"  {WARN} {label}{suffix}")
+
+
+def section(title):
+    """Print *title* in bold, preceded by a blank line, as a section header."""
+    bold, reset = "\033[1m", "\033[0m"
+    print(f"\n{bold}{title}{reset}")
+
+
+def check_packages():
+    """Report which Python packages used by voice support can be imported.
+
+    discord.py, PyNaCl (including a constructible ``nacl.secret.Aead``) and
+    davey are treated as critical: a failure on any of them makes the
+    function return False.  faster-whisper, edge-tts and elevenlabs are
+    optional and only produce a warning when they cannot be imported.
+
+    Side effect: sets the module-level ``_discord_available`` flag to True
+    when ``import discord`` succeeds.
+
+    Returns:
+        True if every critical dependency check passed, otherwise False.
+    """
+    global _discord_available
+    section("Python Packages")
+    critical_ok = True
+
+    # discord.py
+    try:
+        import discord
+    except ImportError:
+        check("discord.py", False, "pip install discord.py[voice]")
+        critical_ok = False
+    else:
+        _discord_available = True
+        check("discord.py", True, f"v{discord.__version__}")
+
+    # PyNaCl
+    try:
+        import nacl
+    except ImportError:
+        check("PyNaCl", False, "pip install PyNaCl>=1.5.0")
+        critical_ok = False
+    else:
+        nacl_version = getattr(nacl, "__version__", "unknown")
+        try:
+            # Building an Aead box is the actual capability probe; any
+            # failure here is reported as an outdated PyNaCl.
+            import nacl.secret
+            nacl.secret.Aead(bytes(32))
+            check("PyNaCl", True, f"v{nacl_version}")
+        except Exception:
+            check("PyNaCl (Aead)", False, f"v{nacl_version} — need >=1.5.0")
+            critical_ok = False
+
+    # davey (DAVE E2EE)
+    try:
+        import davey
+    except ImportError:
+        check("davey (DAVE E2EE)", False, "pip install davey")
+        critical_ok = False
+    else:
+        check("davey (DAVE E2EE)", True, f"v{getattr(davey, '__version__', '?')}")
+
+    # Optional packages (local STT and TTS providers): absence is only a warning.
+    optional_packages = (
+        ("faster_whisper", "faster-whisper (local STT)", "not installed — local STT unavailable"),
+        ("edge_tts", "edge-tts", "not installed — edge TTS unavailable"),
+        ("elevenlabs", "elevenlabs SDK", "not installed — premium TTS unavailable"),
+    )
+    for module_name, label, missing_note in optional_packages:
+        try:
+            __import__(module_name)
+        except ImportError:
+            warn(label, missing_note)
+        else:
+            check(label, True)
+
+    return critical_ok
+
+
+def check_system_tools():
+    """Verify the native tools checked for voice mode: the Opus codec and ffmpeg.
+
+    The Opus check is only attempted when discord.py was importable (as
+    recorded in ``_discord_available``); otherwise it is reported as skipped
+    and does not affect the result.  If Opus is not already loaded, the
+    library is looked up with ``ctypes.util.find_library`` and then in a list
+    of fixed install paths, and loaded through ``discord.opus.load_opus``.
+    ffmpeg is located on PATH with ``shutil.which``.
+
+    Returns:
+        True if no check failed, otherwise False.
+    """
+    section("System Tools")
+    failures = 0
+
+    # Opus codec
+    if not _discord_available:
+        warn("Opus codec", "skipped — discord.py not installed")
+    else:
+        try:
+            import discord
+            loaded = discord.opus.is_loaded()
+            if not loaded:
+                import ctypes.util
+                # Platform-specific fallback paths
+                fallback_paths = (
+                    "/opt/homebrew/lib/libopus.dylib",   # macOS Apple Silicon
+                    "/usr/local/lib/libopus.dylib",      # macOS Intel
+                    "/usr/lib/x86_64-linux-gnu/libopus.so.0",  # Debian/Ubuntu x86
+                    "/usr/lib/aarch64-linux-gnu/libopus.so.0", # Debian/Ubuntu ARM
+                    "/usr/lib/libopus.so",               # Arch Linux
+                    "/usr/lib64/libopus.so",             # RHEL/Fedora
+                )
+                # Prefer the system lookup; fall back to the first existing path.
+                library = ctypes.util.find_library("opus") or next(
+                    (path for path in fallback_paths if os.path.isfile(path)),
+                    None,
+                )
+                if library:
+                    discord.opus.load_opus(library)
+                    loaded = discord.opus.is_loaded()
+            if loaded:
+                check("Opus codec", True)
+            else:
+                check("Opus codec", False, "brew install opus / apt install libopus0")
+                failures += 1
+        except Exception as exc:
+            check("Opus codec", False, str(exc))
+            failures += 1
+
+    # ffmpeg
+    ffmpeg_location = shutil.which("ffmpeg")
+    if not ffmpeg_location:
+        check("ffmpeg", False, "brew install ffmpeg / apt install ffmpeg")
+        failures += 1
+    else:
+        check("ffmpeg", True, ffmpeg_location)
+
+    return failures == 0
+
+
+def check_env_vars():
+    """Report on the environment variables read by this diagnostic.
+
+    Values from ``ENV_FILE`` are loaded first when python-dotenv is
+    importable and the file exists.  Only a missing DISCORD_BOT_TOKEN counts
+    as a failure; every other unset variable produces a warning.  Secret
+    values are printed through ``mask``.
+
+    For each all-digit entry in DISCORD_ALLOWED_USERS the Discord REST API
+    is queried (when a token is available) so the username can be shown next
+    to the masked id; any error during that lookup is ignored and the
+    masked id is shown on its own.
+
+    Returns:
+        Tuple ``(ok, token, groq_key, eleven_key)``: ``ok`` is False only
+        when the bot token is missing; the other items are the raw variable
+        values ("" when unset).
+    """
+    section("Environment Variables")
+
+    # Load .env
+    try:
+        from dotenv import load_dotenv
+        if ENV_FILE.exists():
+            load_dotenv(ENV_FILE)
+    except ImportError:
+        pass
+
+    token = os.getenv("DISCORD_BOT_TOKEN", "")
+    token_present = bool(token)
+    if token_present:
+        check("DISCORD_BOT_TOKEN", True, mask(token))
+    else:
+        check("DISCORD_BOT_TOKEN", False, "not set")
+
+    def describe_user(uid):
+        # Best-effort username lookup; any failure leaves just the masked id.
+        masked = mask(uid)
+        if not (token and uid.isdigit()):
+            return masked
+        try:
+            import requests
+            response = requests.get(
+                f"https://discord.com/api/v10/users/{uid}",
+                headers={"Authorization": f"Bot {token}"},
+                timeout=3,
+            )
+            if response.status_code == 200:
+                return f"{response.json().get('username', '?')} ({masked})"
+        except Exception:
+            pass
+        return masked
+
+    # Allowed users — resolve usernames if possible
+    allowed = os.getenv("DISCORD_ALLOWED_USERS", "")
+    if not allowed:
+        warn("DISCORD_ALLOWED_USERS", "not set — all users can use voice")
+    else:
+        users = [entry.strip() for entry in allowed.split(",") if entry.strip()]
+        user_labels = [describe_user(uid) for uid in users]
+        check("DISCORD_ALLOWED_USERS", True, f"{len(users)} user(s): {', '.join(user_labels)}")
+
+    groq_key = os.getenv("GROQ_API_KEY", "")
+    eleven_key = os.getenv("ELEVENLABS_API_KEY", "")
+
+    if not groq_key:
+        warn("GROQ_API_KEY", "not set — Groq STT unavailable")
+    else:
+        check("GROQ_API_KEY (STT)", True, mask(groq_key))
+
+    if not eleven_key:
+        warn("ELEVENLABS_API_KEY", "not set — ElevenLabs TTS unavailable")
+    else:
+        check("ELEVENLABS_API_KEY (TTS)", True, mask(eleven_key))
+
+    return token_present, token, groq_key, eleven_key
+
+
+def check_config(groq_key, eleven_key):
+    """Report the STT/TTS providers from config.yaml and the saved voice-mode state.
+
+    Purely informational: nothing is returned and no outcome here marks the
+    overall run as failed.
+
+    config.yaml handling:
+      * a missing file or missing PyYAML produces a warning;
+      * a document, ``stt`` section or ``tts`` section that is empty or not
+        a mapping is treated as "no settings", so the displayed providers
+        fall back to "local" / "edge" instead of being reported as a parse
+        error;
+      * a provider of "groq" / "elevenlabs" without the matching API key
+        produces a warning.
+
+    gateway_voice_mode.json handling: counts entries whose value is "all"
+    or "off"; an unreadable or non-object file produces a warning.
+
+    Args:
+        groq_key: Value of GROQ_API_KEY ("" when unset).
+        eleven_key: Value of ELEVENLABS_API_KEY ("" when unset).
+    """
+    section("Configuration")
+
+    def _mapping(value):
+        # YAML yields None for an empty section (e.g. a bare "stt:") and can
+        # yield scalars/lists; only real mappings carry settings.
+        return value if isinstance(value, dict) else {}
+
+    config_path = HERMES_HOME / "config.yaml"
+    if config_path.exists():
+        try:
+            import yaml
+        except ImportError:
+            # Distinguish "cannot inspect" from a genuinely malformed file.
+            warn("config.yaml", "PyYAML not installed — cannot inspect")
+        else:
+            try:
+                # Explicit encoding: do not depend on the locale default.
+                with open(config_path, encoding="utf-8") as f:
+                    cfg = _mapping(yaml.safe_load(f))
+
+                stt_provider = _mapping(cfg.get("stt")).get("provider", "local")
+                tts_provider = _mapping(cfg.get("tts")).get("provider", "edge")
+                check("STT provider", True, stt_provider)
+                check("TTS provider", True, tts_provider)
+
+                if stt_provider == "groq" and not groq_key:
+                    warn("STT config says groq but GROQ_API_KEY is missing")
+                if tts_provider == "elevenlabs" and not eleven_key:
+                    warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing")
+            except Exception as e:
+                # Best-effort diagnostic: report and carry on with the other checks.
+                warn("config.yaml", f"parse error: {e}")
+    else:
+        warn("config.yaml", "not found — using defaults")
+
+    # Voice mode state
+    voice_mode_path = HERMES_HOME / "gateway_voice_mode.json"
+    if voice_mode_path.exists():
+        try:
+            import json
+            modes = json.loads(voice_mode_path.read_text(encoding="utf-8"))
+            if not isinstance(modes, dict):
+                raise ValueError("expected a JSON object")
+            off_count = sum(1 for v in modes.values() if v == "off")
+            all_count = sum(1 for v in modes.values() if v == "all")
+            check("Voice mode state", True, f"{all_count} on, {off_count} off, {len(modes)} total")
+        except Exception:
+            warn("Voice mode state", "parse error")
+    else:
+        check("Voice mode state", True, "no saved state (fresh)")
+
+
+def check_bot_permissions(token):
+    """Check the bot's login and per-guild voice permissions via the Discord REST API.
+
+    Calls ``/users/@me`` to validate the token and ``/users/@me/guilds`` to
+    read the permission integer reported for each guild.  At most the first
+    ``MAX_GUILDS`` guilds are inspected; when more exist a warning says so,
+    so a passing result is not mistaken for full coverage.  A guild with the
+    Administrator bit set passes outright; otherwise every name in
+    ``REQUIRED_PERMS`` must be present.
+
+    Args:
+        token: Discord bot token; a falsy value skips the whole section.
+
+    Returns:
+        False when the login request fails or a checked guild lacks a
+        required permission.  True otherwise, including when the check is
+        skipped (no token, ``requests`` missing) or cut short by a network
+        or API error, which is reported as a warning only.
+    """
+    section("Bot Permissions")
+
+    if not token:
+        warn("Bot permissions", "no token — skipping")
+        return True
+
+    try:
+        import requests
+    except ImportError:
+        warn("Bot permissions", "requests not installed — skipping")
+        return True
+
+    # Permission name -> bit position tested in the guild permission integer.
+    VOICE_PERMS = {
+        "Priority Speaker":      8,
+        "Stream":                9,
+        "View Channel":         10,
+        "Send Messages":        11,
+        "Embed Links":          14,
+        "Attach Files":         15,
+        "Read Message History": 16,
+        "Connect":              20,
+        "Speak":                21,
+        "Mute Members":         22,
+        "Deafen Members":       23,
+        "Move Members":         24,
+        "Use VAD":              25,
+        "Send Voice Messages":  46,
+    }
+    REQUIRED_PERMS = {"Connect", "Speak", "View Channel", "Send Messages"}
+    MAX_GUILDS = 5  # cap on how many guilds are inspected and printed
+    ok = True
+
+    try:
+        headers = {"Authorization": f"Bot {token}"}
+        r = requests.get("https://discord.com/api/v10/users/@me", headers=headers, timeout=5)
+
+        if r.status_code == 401:
+            check("Bot login", False, "invalid token (401)")
+            return False
+        if r.status_code != 200:
+            check("Bot login", False, f"HTTP {r.status_code}")
+            return False
+
+        bot = r.json()
+        bot_name = bot.get("username", "?")
+        # Show only the first three characters of the bot name.
+        check("Bot login", True, f"{bot_name[:3]}{'*' * (len(bot_name) - 3)}")
+
+        # Check guilds
+        r2 = requests.get("https://discord.com/api/v10/users/@me/guilds", headers=headers, timeout=5)
+        if r2.status_code != 200:
+            warn("Guilds", f"HTTP {r2.status_code}")
+            return ok
+
+        guilds = r2.json()
+        check("Guilds", True, f"{len(guilds)} guild(s)")
+        if len(guilds) > MAX_GUILDS:
+            warn("Guilds", f"only the first {MAX_GUILDS} are checked")
+
+        for g in guilds[:MAX_GUILDS]:
+            # Tolerate a missing name or a null permissions field so one odd
+            # entry does not abort the checks for the remaining guilds.
+            guild_name = g.get("name", "?")
+            perms = int(g.get("permissions") or 0)
+            is_admin = bool(perms & (1 << 3))
+
+            if is_admin:
+                print(f"    {OK} {guild_name}: Administrator (all permissions)")
+                continue
+
+            has = []
+            missing = []
+            for name, bit in sorted(VOICE_PERMS.items(), key=lambda x: x[1]):
+                if perms & (1 << bit):
+                    has.append(name)
+                elif name in REQUIRED_PERMS:
+                    missing.append(name)
+
+            if missing:
+                print(f"    {FAIL} {guild_name}: missing {', '.join(missing)}")
+                ok = False
+            else:
+                print(f"    {OK} {guild_name}: {', '.join(has)}")
+
+    except requests.exceptions.Timeout:
+        warn("Bot permissions", "Discord API timeout")
+    except requests.exceptions.ConnectionError:
+        warn("Bot permissions", "cannot reach Discord API")
+    except Exception as e:
+        warn("Bot permissions", f"check failed: {e}")
+
+    return ok
+
+
+def main():
+    """Run every diagnostic section in order and print an overall verdict.
+
+    The verdict combines the results of the package, system-tool,
+    environment-variable and bot-permission checks; the configuration
+    section is informational and does not contribute to it.
+    """
+    banner_rule = "\033[1m" + "=" * 50 + "\033[0m"
+    print()
+    print(banner_rule)
+    print("\033[1m  Discord Voice Doctor\033[0m")
+    print(banner_rule)
+
+    # Every section runs regardless of earlier failures.
+    packages_ok = check_packages()
+    tools_ok = check_system_tools()
+    env_ok, token, groq_key, eleven_key = check_env_vars()
+    check_config(groq_key, eleven_key)
+    permissions_ok = check_bot_permissions(token)
+
+    # Summary
+    print()
+    print("\033[1m" + "-" * 50 + "\033[0m")
+    if packages_ok and tools_ok and env_ok and permissions_ok:
+        print(f"  {OK} \033[92mAll checks passed — voice mode ready!\033[0m")
+    else:
+        print(f"  {FAIL} \033[91mSome checks failed — fix issues above.\033[0m")
+    print()
+
+
+# Run the diagnostics only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -114,6 +114,7 @@ curl -s "https://export.arxiv.org/api/query?id_list=2402.03300,2401.12345,2403.0

 After fetching metadata for a paper, generate a BibTeX entry:

+{% raw %}
 ```bash
 curl -s "https://export.arxiv.org/api/query?id_list=1706.03762" | python3 -c "
 import sys, xml.etree.ElementTree as ET
@@ -139,6 +140,7 @@ print(f'  url       = {{https://arxiv.org/abs/{raw_id}}}')
 print('}')
 "
 ```
+{% endraw %}

 ## Reading Paper Content

@@ -215,6 +215,7 @@ def generate_citation_key(bibtex: str) -> str:

 ### Complete Citation Manager Class

+{% raw %}
 ```python
 """
 Citation Manager - Verified citation workflow for ML papers.
@@ -377,6 +378,7 @@ if __name__ == "__main__":
    if bibtex:
        print(bibtex)
 ```
+{% endraw %}

 ### Quick Functions

@@ -195,7 +195,7 @@ class TestGetTextAuxiliaryClient:
        with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
             patch("agent.auxiliary_client.OpenAI") as mock_openai:
            client, model = get_text_auxiliary_client()
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
        # Returns a CodexAuxiliaryClient wrapper, not a raw OpenAI client
        from agent.auxiliary_client import CodexAuxiliaryClient
        assert isinstance(client, CodexAuxiliaryClient)
@@ -288,7 +288,7 @@ class TestVisionClientFallback:
            client, model = get_vision_auxiliary_client()
        from agent.auxiliary_client import CodexAuxiliaryClient
        assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"

    def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch):
        """Custom endpoint is used as fallback in vision auto mode.
@@ -371,7 +371,7 @@ class TestVisionClientFallback:
            client, model = get_vision_auxiliary_client()
        from agent.auxiliary_client import CodexAuxiliaryClient
        assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"


 class TestGetAuxiliaryProvider:
@@ -489,7 +489,7 @@ class TestResolveForcedProvider:
            client, model = _resolve_forced_provider("main")
        from agent.auxiliary_client import CodexAuxiliaryClient
        assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"

    def test_forced_codex(self, codex_auth_dir, monkeypatch):
        with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
@@ -497,7 +497,7 @@ class TestResolveForcedProvider:
            client, model = _resolve_forced_provider("codex")
        from agent.auxiliary_client import CodexAuxiliaryClient
        assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"

    def test_forced_codex_no_token(self, monkeypatch):
        with patch("agent.auxiliary_client._read_codex_access_token", return_value=None):
@@ -0,0 +1,100 @@
+"""Tests for get_tool_emoji in agent/display.py — skin + registry integration."""
+
+import sys
+from unittest.mock import patch as mock_patch, MagicMock
+
+from agent.display import get_tool_emoji
+
+
+def _stub_registry(emoji):
+    """Return a patcher that plants a stub ``tools.registry`` in ``sys.modules``.
+
+    ``get_tool_emoji`` reaches the registry through a lazy import, so the stub
+    is installed in ``sys.modules`` rather than patched as an attribute of
+    ``agent.display``.  The stub registry's ``get_emoji`` always returns *emoji*.
+    """
+    stub_module = MagicMock()
+    stub_module.registry.get_emoji.return_value = emoji
+    return mock_patch.dict(sys.modules, {"tools.registry": stub_module})
+
+
+class TestGetToolEmoji:
+    """Verify the skin → registry → fallback resolution chain."""
+
+    def test_returns_registry_emoji_when_no_skin(self):
+        """Registry-registered emoji is used when no skin is active."""
+        with mock_patch("agent.display._get_skin", return_value=None), \
+             _stub_registry("📖"):
+            assert get_tool_emoji("read_file") == "📖"
+
+    def test_skin_override_takes_precedence(self):
+        """Skin tool_emojis override registry defaults."""
+        skin = MagicMock()
+        skin.tool_emojis = {"terminal": "⚔"}
+        with mock_patch("agent.display._get_skin", return_value=skin):
+            assert get_tool_emoji("terminal") == "⚔"
+
+    def test_skin_empty_dict_falls_through(self):
+        """Empty skin tool_emojis falls through to registry."""
+        skin = MagicMock()
+        skin.tool_emojis = {}
+        with mock_patch("agent.display._get_skin", return_value=skin), \
+             _stub_registry("💻"):
+            assert get_tool_emoji("terminal") == "💻"
+
+    def test_fallback_default(self):
+        """When neither skin nor registry has an emoji, use the default."""
+        skin = MagicMock()
+        skin.tool_emojis = {}
+        with mock_patch("agent.display._get_skin", return_value=skin), \
+             _stub_registry(""):
+            assert get_tool_emoji("unknown_tool") == "⚡"
+
+    def test_custom_default(self):
+        """Custom default is returned when nothing matches."""
+        with mock_patch("agent.display._get_skin", return_value=None), \
+             _stub_registry(""):
+            assert get_tool_emoji("x", default="⚙️") == "⚙️"
+
+    def test_skin_override_only_for_matching_tool(self):
+        """Skin override for one tool doesn't affect others."""
+        skin = MagicMock()
+        skin.tool_emojis = {"terminal": "⚔"}
+        with mock_patch("agent.display._get_skin", return_value=skin), \
+             _stub_registry("🔍"):
+            assert get_tool_emoji("terminal") == "⚔"  # skin override
+            assert get_tool_emoji("web_search") == "🔍"  # registry fallback
+
+
+class TestSkinConfigToolEmojis:
+    """Verify SkinConfig handles tool_emojis field correctly."""
+
+    def test_skin_config_has_tool_emojis_field(self):
+        """tool_emojis is an empty dict when not supplied."""
+        from hermes_cli.skin_engine import SkinConfig
+        skin = SkinConfig(name="test")
+        assert skin.tool_emojis == {}
+
+    def test_skin_config_accepts_tool_emojis(self):
+        """A tool_emojis mapping passed to the constructor is stored as given."""
+        from hermes_cli.skin_engine import SkinConfig
+        emojis = {"terminal": "⚔", "web_search": "🔮"}
+        skin = SkinConfig(name="test", tool_emojis=emojis)
+        assert skin.tool_emojis == emojis
+
+    def test_build_skin_config_includes_tool_emojis(self):
+        """_build_skin_config carries the tool_emojis mapping from raw skin data."""
+        from hermes_cli.skin_engine import _build_skin_config
+        data = {
+            "name": "custom",
+            "tool_emojis": {"terminal": "🗡️", "patch": "⚒️"},
+        }
+        skin = _build_skin_config(data)
+        assert skin.tool_emojis == {"terminal": "🗡️", "patch": "⚒️"}
+
+    def test_build_skin_config_empty_tool_emojis_default(self):
+        """_build_skin_config yields an empty mapping when the key is absent."""
+        from hermes_cli.skin_engine import _build_skin_config
+        data = {"name": "minimal"}
+        skin = _build_skin_config(data)
+        assert skin.tool_emojis == {}
@@ -0,0 +1,72 @@
+"""Tests for cheap-vs-strong turn routing in agent/smart_model_routing.py."""
+
+from agent.smart_model_routing import choose_cheap_model_route
+
+
+_BASE_CONFIG = {
+    "enabled": True,
+    "cheap_model": {
+        "provider": "openrouter",
+        "model": "google/gemini-2.5-flash",
+    },
+}
+
+
+def test_returns_none_when_disabled():
+    """A config with ``enabled: False`` never yields a cheap route."""
+    cfg = {**_BASE_CONFIG, "enabled": False}
+    assert choose_cheap_model_route("what time is it in tokyo?", cfg) is None
+
+
+def test_routes_short_simple_prompt():
+    """A short, plain question is routed to the configured cheap model."""
+    result = choose_cheap_model_route("what time is it in tokyo?", _BASE_CONFIG)
+    assert result is not None
+    assert result["provider"] == "openrouter"
+    assert result["model"] == "google/gemini-2.5-flash"
+    assert result["routing_reason"] == "simple_turn"
+
+
+def test_skips_long_prompt():
+    """A long prompt is not routed to the cheap model."""
+    prompt = "please summarize this carefully " * 20
+    assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
+
+
+def test_skips_code_like_prompt():
+    """A prompt containing a fenced code block is not routed to the cheap model."""
+    prompt = "debug this traceback: ```python\nraise ValueError('bad')\n```"
+    assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
+
+
+def test_skips_tool_heavy_prompt_keywords():
+    """A prompt made of implementation/debugging wording is not routed to the cheap model."""
+    prompt = "implement a patch for this docker error"
+    assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
+
+
+def test_resolve_turn_route_falls_back_to_primary_when_route_runtime_cannot_be_resolved(monkeypatch):
+    """When resolving the cheap route's runtime raises, the primary route is returned."""
+    from agent.smart_model_routing import resolve_turn_route
+
+    def _fail_to_resolve(**kwargs):
+        raise RuntimeError("bad route")
+
+    monkeypatch.setattr(
+        "hermes_cli.runtime_provider.resolve_runtime_provider",
+        _fail_to_resolve,
+    )
+    result = resolve_turn_route(
+        "what time is it in tokyo?",
+        _BASE_CONFIG,
+        {
+            "model": "anthropic/claude-sonnet-4",
+            "provider": "openrouter",
+            "base_url": "https://openrouter.ai/api/v1",
+            "api_mode": "chat_completions",
+            "api_key": "sk-primary",
+        },
+    )
+    assert result["model"] == "anthropic/claude-sonnet-4"
+    assert result["runtime"]["provider"] == "openrouter"
+    assert result["label"] is None
@@ -83,6 +83,14 @@ class TestSessionResetPolicy:
        assert policy.at_hour == 4
        assert policy.idle_minutes == 1440

+    def test_from_dict_treats_null_values_as_defaults(self):
+        restored = SessionResetPolicy.from_dict(
+            {"mode": None, "at_hour": None, "idle_minutes": None}  # keys present, values explicitly null
+        )
+        assert restored.mode == "both"  # each null must be replaced by the policy default
+        assert restored.at_hour == 4
+        assert restored.idle_minutes == 1440
+

 class TestGatewayConfigRoundtrip:
    def test_full_roundtrip(self):
@@ -96,6 +104,7 @@ class TestGatewayConfigRoundtrip:
            },
            reset_triggers=["/new"],
            quick_commands={"limits": {"type": "exec", "command": "echo ok"}},
+            group_sessions_per_user=False,
        )
        d = config.to_dict()
        restored = GatewayConfig.from_dict(d)
@@ -104,6 +113,7 @@ class TestGatewayConfigRoundtrip:
        assert restored.platforms[Platform.TELEGRAM].token == "tok_123"
        assert restored.reset_triggers == ["/new"]
        assert restored.quick_commands == {"limits": {"type": "exec", "command": "echo ok"}}
+        assert restored.group_sessions_per_user is False


 class TestLoadGatewayConfig:
@@ -125,6 +135,18 @@ class TestLoadGatewayConfig:

        assert config.quick_commands == {"limits": {"type": "exec", "command": "echo ok"}}

+    def test_bridges_group_sessions_per_user_from_config_yaml(self, tmp_path, monkeypatch):
+        hermes_home = tmp_path / ".hermes"
+        hermes_home.mkdir()
+        config_path = hermes_home / "config.yaml"
+        config_path.write_text("group_sessions_per_user: false\n", encoding="utf-8")
+        # Make the temporary directory the Hermes home for the duration of the test.
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        # Called with no arguments; the YAML file above is the only input this test provides.
+        config = load_gateway_config()
+        # The top-level YAML key must surface on the loaded gateway config.
+        assert config.group_sessions_per_user is False
+
    def test_invalid_quick_commands_in_config_yaml_are_ignored(self, tmp_path, monkeypatch):
        hermes_home = tmp_path / ".hermes"
        hermes_home.mkdir()
@@ -252,3 +252,109 @@ async def test_discord_dms_ignore_mention_requirement(adapter, monkeypatch):
    event = adapter.handle_message.await_args.args[0]
    assert event.text == "dm without mention"
    assert event.source.chat_type == "dm"
+
+
+@pytest.mark.asyncio
+async def test_discord_auto_thread_enabled_by_default(adapter, monkeypatch):
+    """Auto-threading should be enabled by default (DISCORD_AUTO_THREAD defaults to 'true')."""
+    monkeypatch.delenv("DISCORD_AUTO_THREAD", raising=False)  # unset, so the default applies
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")  # take the mention gate out of the picture
+
+    # Patch _auto_create_thread to return a fake thread
+    fake_thread = FakeThread(channel_id=999, name="auto-thread")
+    adapter._auto_create_thread = AsyncMock(return_value=fake_thread)
+
+    message = make_message(channel=FakeTextChannel(channel_id=123), content="hello")
+
+    await adapter._handle_message(message)
+
+    adapter._auto_create_thread.assert_awaited_once()
+    adapter.handle_message.assert_awaited_once()
+    event = adapter.handle_message.await_args.args[0]
+    assert event.source.chat_type == "thread"  # dispatched event is sourced from the new thread...
+    assert event.source.thread_id == "999"  # ...identified by the fake thread's channel_id
+
+
+@pytest.mark.asyncio
+async def test_discord_auto_thread_can_be_disabled(adapter, monkeypatch):
+    """Setting DISCORD_AUTO_THREAD=false skips thread creation; the event stays a group message."""
+    monkeypatch.setenv("DISCORD_AUTO_THREAD", "false")
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")  # take the mention gate out of the picture
+
+    adapter._auto_create_thread = AsyncMock()  # spy only; it must never be awaited
+
+    message = make_message(channel=FakeTextChannel(channel_id=123), content="hello")
+
+    await adapter._handle_message(message)
+
+    adapter._auto_create_thread.assert_not_awaited()
+    adapter.handle_message.assert_awaited_once()
+    event = adapter.handle_message.await_args.args[0]
+    assert event.source.chat_type == "group"  # still sourced from the text channel
+
+
+@pytest.mark.asyncio
+async def test_discord_bot_thread_skips_mention_requirement(adapter, monkeypatch):
+    """Messages in a thread the bot has participated in should not require @mention."""
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "true")  # mention gate is active for this test
+    monkeypatch.delenv("DISCORD_FREE_RESPONSE_CHANNELS", raising=False)
+    monkeypatch.setenv("DISCORD_AUTO_THREAD", "false")
+
+    # Simulate bot having previously participated in thread 456
+    adapter._bot_participated_threads.add("456")  # stored as a string, matching the thread's channel_id below
+
+    thread = FakeThread(channel_id=456, name="existing thread")
+    message = make_message(channel=thread, content="follow-up without mention")
+
+    await adapter._handle_message(message)
+
+    adapter.handle_message.assert_awaited_once()  # dispatched despite the active mention gate
+    event = adapter.handle_message.await_args.args[0]
+    assert event.text == "follow-up without mention"
+    assert event.source.chat_type == "thread"
+
+
+@pytest.mark.asyncio
+async def test_discord_unknown_thread_still_requires_mention(adapter, monkeypatch):
+    """Messages in a thread the bot hasn't participated in should still require @mention."""
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "true")  # mention gate is active for this test
+    monkeypatch.delenv("DISCORD_FREE_RESPONSE_CHANNELS", raising=False)
+    monkeypatch.setenv("DISCORD_AUTO_THREAD", "false")
+
+    # Bot has NOT participated in thread 789
+    thread = FakeThread(channel_id=789, name="some thread")
+    message = make_message(channel=thread, content="hello from unknown thread")
+
+    await adapter._handle_message(message)
+
+    adapter.handle_message.assert_not_awaited()  # nothing was dispatched for the un-mentioned message
+
+
+@pytest.mark.asyncio
+async def test_discord_auto_thread_tracks_participation(adapter, monkeypatch):
+    """Auto-created threads should be tracked for future mention-free replies."""
+    monkeypatch.delenv("DISCORD_AUTO_THREAD", raising=False)  # unset, so the default applies
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")
+
+    fake_thread = FakeThread(channel_id=555, name="auto-thread")
+    adapter._auto_create_thread = AsyncMock(return_value=fake_thread)  # thread creation is faked
+
+    message = make_message(channel=FakeTextChannel(channel_id=123), content="start a thread")
+
+    await adapter._handle_message(message)
+
+    assert "555" in adapter._bot_participated_threads  # recorded under the thread id as a string
+
+
+@pytest.mark.asyncio
+async def test_discord_thread_participation_tracked_on_dispatch(adapter, monkeypatch):
+    """When the bot processes a message in a thread, it tracks participation."""
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")
+    monkeypatch.setenv("DISCORD_AUTO_THREAD", "false")  # the thread already exists; none is auto-created
+
+    thread = FakeThread(channel_id=777, name="manually created thread")
+    message = make_message(channel=thread, content="hello in thread")
+
+    await adapter._handle_message(message)
+
+    assert "777" in adapter._bot_participated_threads  # recorded under the thread id as a string
@@ -363,11 +363,37 @@ async def test_auto_thread_creates_thread_and_redirects(adapter, monkeypatch):


@pytest.mark.asyncio
-async def test_auto_thread_disabled_by_default(adapter, monkeypatch):
-    """Without DISCORD_AUTO_THREAD, messages stay in the channel."""
+async def test_auto_thread_enabled_by_default_slash_commands(adapter, monkeypatch):
+    """Without DISCORD_AUTO_THREAD env var, auto-threading is enabled (default: true)."""
    monkeypatch.delenv("DISCORD_AUTO_THREAD", raising=False)
    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")

+    fake_thread = _FakeThreadChannel(channel_id=999, name="auto-thread")
+    adapter._auto_create_thread = AsyncMock(return_value=fake_thread)  # thread creation is faked
+
+    captured_events = []
+
+    async def capture_handle(event):  # stand-in for handle_message that records each dispatched event
+        captured_events.append(event)
+
+    adapter.handle_message = capture_handle
+
+    msg = _fake_message(_FakeTextChannel())
+
+    await adapter._handle_message(msg)
+
+    adapter._auto_create_thread.assert_awaited_once()
+    assert len(captured_events) == 1  # exactly one event reached the handler
+    assert captured_events[0].source.chat_id == "999"  # redirected to thread
+    assert captured_events[0].source.chat_type == "thread"
+
+
+@pytest.mark.asyncio
+async def test_auto_thread_can_be_disabled(adapter, monkeypatch):
+    """Setting DISCORD_AUTO_THREAD=false keeps messages in the channel."""
+    monkeypatch.setenv("DISCORD_AUTO_THREAD", "false")
+    monkeypatch.setenv("DISCORD_REQUIRE_MENTION", "false")
+
    adapter._auto_create_thread = AsyncMock()

    captured_events = []
@@ -0,0 +1,106 @@
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from gateway.config import GatewayConfig, Platform, PlatformConfig
+from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
+from gateway.run import GatewayRunner
+from gateway.session import SessionSource, build_session_key
+
+
+class StubAdapter(BasePlatformAdapter):  # minimal concrete adapter; every platform call returns a canned value
+    def __init__(self):
+        super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)  # registered as Telegram
+
+    async def connect(self):
+        return True  # always reports a successful connection
+
+    async def disconnect(self):
+        return None
+
+    async def send(self, chat_id, content, reply_to=None, metadata=None):
+        return SendResult(success=True, message_id="1")  # nothing is sent; success is reported unconditionally
+
+    async def send_typing(self, chat_id, metadata=None):
+        return None
+
+    async def get_chat_info(self, chat_id):
+        return {"id": chat_id}  # echoes the id back as the only chat detail
+
+
+def _source(chat_id="123456", chat_type="dm"):  # builds a Telegram SessionSource; defaults describe a DM
+    return SessionSource(
+        platform=Platform.TELEGRAM,
+        chat_id=chat_id,
+        chat_type=chat_type,
+    )
+
+
+@pytest.mark.asyncio
+async def test_cancel_background_tasks_cancels_inflight_message_processing():
+    adapter = StubAdapter()
+    release = asyncio.Event()  # never set in this test, so the handler below stays blocked
+
+    async def block_forever(_event):
+        await release.wait()
+        return None
+
+    adapter.set_message_handler(block_forever)
+    event = MessageEvent(text="work", source=_source(), message_id="1")
+
+    await adapter.handle_message(event)
+    await asyncio.sleep(0)  # yield to the event loop once before inspecting adapter state
+
+    session_key = build_session_key(event.source)
+    assert session_key in adapter._active_sessions  # precondition: the message is in flight
+    assert adapter._background_tasks
+
+    await adapter.cancel_background_tasks()
+    # After cancellation every piece of per-session bookkeeping must be empty.
+    assert adapter._background_tasks == set()
+    assert adapter._active_sessions == {}
+    assert adapter._pending_messages == {}
+
+
+@pytest.mark.asyncio
+async def test_gateway_stop_interrupts_running_agents_and_cancels_adapter_tasks():
+    runner = object.__new__(GatewayRunner)  # skip __init__; only the attributes assigned below exist
+    runner.config = GatewayConfig(platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")})
+    runner._running = True
+    runner._shutdown_event = asyncio.Event()
+    runner._exit_reason = None
+    runner._pending_messages = {"session": "pending text"}  # non-empty so the final emptiness check is meaningful
+    runner._pending_approvals = {"session": {"command": "rm -rf /tmp/x"}}  # likewise non-empty on purpose
+    runner._shutdown_all_gateway_honcho = lambda: None  # replaced with a no-op for this test
+
+    adapter = StubAdapter()
+    release = asyncio.Event()  # never set in this test, so the handler below stays blocked
+
+    async def block_forever(_event):
+        await release.wait()
+        return None
+
+    adapter.set_message_handler(block_forever)
+    event = MessageEvent(text="work", source=_source(), message_id="1")
+    await adapter.handle_message(event)
+    await asyncio.sleep(0)  # yield to the event loop once before stopping the runner
+
+    disconnect_mock = AsyncMock()
+    adapter.disconnect = disconnect_mock  # lets the test observe that stop() disconnects the adapter
+
+    session_key = build_session_key(event.source)
+    running_agent = MagicMock()
+    runner._running_agents = {session_key: running_agent}
+    runner.adapters = {Platform.TELEGRAM: adapter}
+
+    with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):  # status helpers mocked out
+        await runner.stop()
+
+    running_agent.interrupt.assert_called_once_with("Gateway shutting down")
+    disconnect_mock.assert_awaited_once()
+    assert runner.adapters == {}
+    assert runner._running_agents == {}
+    assert runner._pending_messages == {}
+    assert runner._pending_approvals == {}
+    assert runner._shutdown_event.is_set() is True
@@ -90,6 +90,7 @@ class TestGatewayHonchoLifecycle:
        runner = _make_runner()
        event = _make_event()
        runner._shutdown_gateway_honcho = MagicMock()
+        runner._async_flush_memories = AsyncMock()
        runner.session_store = MagicMock()
        runner.session_store._generate_session_key.return_value = "gateway-key"
        runner.session_store._entries = {
@@ -100,4 +101,31 @@ class TestGatewayHonchoLifecycle:
        result = await runner._handle_reset_command(event)

        runner._shutdown_gateway_honcho.assert_called_once_with("gateway-key")
+        runner._async_flush_memories.assert_called_once_with("old-session", "gateway-key")
        assert "Session reset" in result
+
+    def test_flush_memories_reuses_gateway_session_key_and_skips_honcho_sync(self):
+        runner = _make_runner()
+        runner.session_store = MagicMock()
+        runner.session_store.load_transcript.return_value = [  # stored transcript of four messages
+            {"role": "user", "content": "a"},
+            {"role": "assistant", "content": "b"},
+            {"role": "user", "content": "c"},
+            {"role": "assistant", "content": "d"},
+        ]
+        tmp_agent = MagicMock()
+        # Stub out runtime/model resolution and the agent class so no real agent is built.
+        with (
+            patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
+            patch("gateway.run._resolve_gateway_model", return_value="model-name"),
+            patch("run_agent.AIAgent", return_value=tmp_agent) as mock_agent_cls,
+        ):
+            runner._flush_memories_for_session("old-session", "gateway-key")
+        # The agent is built for the old session id but keyed to the gateway's Honcho session.
+        mock_agent_cls.assert_called_once()
+        _, kwargs = mock_agent_cls.call_args
+        assert kwargs["session_id"] == "old-session"
+        assert kwargs["honcho_session_key"] == "gateway-key"
+        tmp_agent.run_conversation.assert_called_once()
+        _, run_kwargs = tmp_agent.run_conversation.call_args
+        assert run_kwargs["sync_honcho"] is False  # the flush run must pass sync_honcho=False
@@ -11,7 +11,7 @@ import asyncio
 import pytest

 from gateway.config import Platform, PlatformConfig
-from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
+from gateway.platforms.base import BasePlatformAdapter, MessageEvent, MessageType, SendResult
 from gateway.session import SessionSource, build_session_key


@@ -50,11 +50,11 @@ class TestInterruptKeyConsistency:
    """Ensure adapter interrupt methods are queried with session_key, not chat_id."""

    def test_session_key_differs_from_chat_id_for_dm(self):
-        """Session key for a DM is NOT the same as chat_id."""
+        """Session key for a DM is namespaced and includes the DM chat_id."""
        source = _source("123456", "dm")
        session_key = build_session_key(source)
        assert session_key != source.chat_id
-        assert session_key == "agent:main:telegram:dm"
+        assert session_key == "agent:main:telegram:dm:123456"

    def test_session_key_differs_from_chat_id_for_group(self):
        """Session key for a group chat includes prefix, unlike raw chat_id."""
@@ -122,3 +122,29 @@ class TestInterruptKeyConsistency:

        # Interrupt event was set
        assert adapter._active_sessions[session_key].is_set()
+
+    @pytest.mark.asyncio
+    async def test_photo_followup_is_queued_without_interrupt(self):
+        """Photo follow-ups should queue behind the active run instead of interrupting it."""
+        adapter = StubAdapter()
+        adapter.set_message_handler(lambda event: asyncio.sleep(0, result=None))  # handler that completes immediately
+
+        source = _source("-1001234", "group")
+        session_key = build_session_key(source)
+        interrupt_event = asyncio.Event()
+        adapter._active_sessions[session_key] = interrupt_event  # mark the session as already active
+
+        event = MessageEvent(
+            text="caption",
+            source=source,
+            message_type=MessageType.PHOTO,
+            message_id="2",
+            media_urls=["/tmp/photo-a.jpg"],
+            media_types=["image/jpeg"],
+        )
+        await adapter.handle_message(event)
+        # The very same event object must be parked as pending, and no interrupt raised.
+        queued = adapter._pending_messages[session_key]
+        assert queued is event
+        assert queued.media_urls == ["/tmp/photo-a.jpg"]
+        assert interrupt_event.is_set() is False
@@ -199,3 +199,28 @@ class TestHandleResumeCommand:

        assert real_key not in runner._running_agents
        db.close()
+
+    @pytest.mark.asyncio
+    async def test_resume_flushes_memories_with_gateway_session_key(self, tmp_path):
+        """Resume should preserve the gateway session key for Honcho flushes."""
+        from hermes_state import SessionDB
+
+        db = SessionDB(db_path=tmp_path / "state.db")
+        db.create_session("old_session", "telegram")
+        db.set_session_title("old_session", "Old Work")  # the title is what /resume looks up below
+        db.create_session("current_session_001", "telegram")
+
+        event = _make_event(text="/resume Old Work")
+        runner = _make_runner(
+            session_db=db,
+            current_session_id="current_session_001",
+            event=event,
+        )
+
+        await runner._handle_resume_command(event)
+        # The session being left is flushed, paired with the gateway session key for this event.
+        runner._async_flush_memories.assert_called_once_with(
+            "current_session_001",
+            _session_key_for_event(event),
+        )
+        db.close()
@@ -338,7 +338,7 @@ class TestSessionStoreRewriteTranscript:

 class TestWhatsAppDMSessionKeyConsistency:
    """Regression: all session-key construction must go through build_session_key
-    so WhatsApp DMs include chat_id while other DMs do not."""
+    so DMs are isolated by chat_id across platforms."""

    @pytest.fixture()
    def store(self, tmp_path):
@@ -369,15 +369,72 @@ class TestWhatsAppDMSessionKeyConsistency:
        )
        assert store._generate_session_key(source) == build_session_key(source)

-    def test_telegram_dm_omits_chat_id(self):
-        """Non-WhatsApp DMs should still omit chat_id (single owner DM)."""
+    def test_store_creates_distinct_group_sessions_per_user(self, store):
+        first = SessionSource(
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="alice",
+            user_name="Alice",
+        )
+        second = SessionSource(  # same platform and chat as `first`; only the user differs
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="bob",
+            user_name="Bob",
+        )
+        # No config override here: the store fixture's own setting is in effect.
+        first_entry = store.get_or_create_session(first)
+        second_entry = store.get_or_create_session(second)
+        # Each user gets a key suffixed with their user_id, and therefore a separate session.
+        assert first_entry.session_key == "agent:main:discord:group:guild-123:alice"
+        assert second_entry.session_key == "agent:main:discord:group:guild-123:bob"
+        assert first_entry.session_id != second_entry.session_id
+
+    def test_store_shares_group_sessions_when_disabled_in_config(self, store):
+        store.config.group_sessions_per_user = False  # turn per-user isolation off on the store's config
+
+        first = SessionSource(
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="alice",
+            user_name="Alice",
+        )
+        second = SessionSource(  # same platform and chat as `first`; only the user differs
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="bob",
+            user_name="Bob",
+        )
+
+        first_entry = store.get_or_create_session(first)
+        second_entry = store.get_or_create_session(second)
+        # Both users resolve to the same key (no user_id suffix) and the same session.
+        assert first_entry.session_key == "agent:main:discord:group:guild-123"
+        assert second_entry.session_key == "agent:main:discord:group:guild-123"
+        assert first_entry.session_id == second_entry.session_id
+
+    def test_telegram_dm_includes_chat_id(self):
+        """Non-WhatsApp DMs should also include chat_id to separate users."""
        source = SessionSource(
            platform=Platform.TELEGRAM,
            chat_id="99",
            chat_type="dm",
        )
        key = build_session_key(source)
-        assert key == "agent:main:telegram:dm"
+        assert key == "agent:main:telegram:dm:99"
+
+    def test_distinct_dm_chat_ids_get_distinct_session_keys(self):
+        """Different DM chats must not collapse into one shared session."""
+        first = SessionSource(platform=Platform.TELEGRAM, chat_id="99", chat_type="dm")
+        second = SessionSource(platform=Platform.TELEGRAM, chat_id="100", chat_type="dm")  # differs only in chat_id
+        # Each key ends with its own chat_id, so the two can never be equal.
+        assert build_session_key(first) == "agent:main:telegram:dm:99"
+        assert build_session_key(second) == "agent:main:telegram:dm:100"
+        assert build_session_key(first) != build_session_key(second)

    def test_discord_group_includes_chat_id(self):
        """Group/channel keys include chat_type and chat_id."""
@@ -389,6 +446,41 @@ class TestWhatsAppDMSessionKeyConsistency:
        key = build_session_key(source)
        assert key == "agent:main:discord:group:guild-123"

+    def test_group_sessions_are_isolated_per_user_when_user_id_present(self):
+        first = SessionSource(
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="alice",
+        )
+        second = SessionSource(  # same platform and chat as `first`; only user_id differs
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="bob",
+        )
+        # Called without group_sessions_per_user, so the function's default is exercised.
+        assert build_session_key(first) == "agent:main:discord:group:guild-123:alice"
+        assert build_session_key(second) == "agent:main:discord:group:guild-123:bob"
+        assert build_session_key(first) != build_session_key(second)
+
+    def test_group_sessions_can_be_shared_when_isolation_disabled(self):
+        first = SessionSource(
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="alice",
+        )
+        second = SessionSource(  # same platform and chat as `first`; only user_id differs
+            platform=Platform.DISCORD,
+            chat_id="guild-123",
+            chat_type="group",
+            user_id="bob",
+        )
+        # With isolation off the user_id suffix is dropped, so both users share one key.
+        assert build_session_key(first, group_sessions_per_user=False) == "agent:main:discord:group:guild-123"
+        assert build_session_key(second, group_sessions_per_user=False) == "agent:main:discord:group:guild-123"
+
    def test_group_thread_includes_thread_id(self):
        """Forum-style threads need a distinct session key within one group."""
        source = SessionSource(
@@ -400,6 +492,17 @@ class TestWhatsAppDMSessionKeyConsistency:
        key = build_session_key(source)
        assert key == "agent:main:telegram:group:-1002285219667:17585"

+    def test_group_thread_sessions_are_isolated_per_user(self):
+        source = SessionSource(
+            platform=Platform.TELEGRAM,
+            chat_id="-1002285219667",
+            chat_type="group",
+            thread_id="17585",
+            user_id="42",
+        )
+        key = build_session_key(source)
+        assert key == "agent:main:telegram:group:-1002285219667:17585:42"  # chat_id, then thread_id, then user_id
+

 class TestSessionStoreEntriesAttribute:
    """Regression: /reset must access _entries, not _sessions."""
@@ -0,0 +1,60 @@
+"""Tests for the HERMES_SESSION_* environment helpers on GatewayRunner."""
+
+import os
+
+from gateway.config import Platform
+from gateway.run import GatewayRunner
+from gateway.session import SessionContext, SessionSource
+
+
+_SESSION_ENV_VARS = (
+    "HERMES_SESSION_PLATFORM",
+    "HERMES_SESSION_CHAT_ID",
+    "HERMES_SESSION_CHAT_NAME",
+    "HERMES_SESSION_THREAD_ID",
+)
+
+
+def test_set_session_env_includes_thread_id(monkeypatch):
+    """_set_session_env exports platform, chat id, chat name and thread id."""
+    runner = object.__new__(GatewayRunner)  # skip __init__; the method under test needs no other state here
+    source = SessionSource(
+        platform=Platform.TELEGRAM,
+        chat_id="-1001",
+        chat_name="Group",
+        chat_type="group",
+        thread_id="17585",
+    )
+    context = SessionContext(source=source, connected_platforms=[], home_channels={})
+
+    for name in _SESSION_ENV_VARS:
+        # delenv(raising=False) records nothing for a variable that is already
+        # absent, so whatever _set_session_env writes would outlive this test.
+        # Setting first makes monkeypatch remember the pre-test state and
+        # restore it on teardown.
+        monkeypatch.setenv(name, "")
+        monkeypatch.delenv(name)
+
+    runner._set_session_env(context)
+
+    assert os.getenv("HERMES_SESSION_PLATFORM") == "telegram"
+    assert os.getenv("HERMES_SESSION_CHAT_ID") == "-1001"
+    assert os.getenv("HERMES_SESSION_CHAT_NAME") == "Group"
+    assert os.getenv("HERMES_SESSION_THREAD_ID") == "17585"
+
+
+def test_clear_session_env_removes_thread_id(monkeypatch):
+    """_clear_session_env removes the four session variables, thread id included."""
+    runner = object.__new__(GatewayRunner)  # skip __init__; the method under test needs no other state here
+
+    monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
+    monkeypatch.setenv("HERMES_SESSION_CHAT_ID", "-1001")
+    monkeypatch.setenv("HERMES_SESSION_CHAT_NAME", "Group")
+    monkeypatch.setenv("HERMES_SESSION_THREAD_ID", "17585")
+
+    runner._clear_session_env()
+
+    assert os.getenv("HERMES_SESSION_PLATFORM") is None
+    assert os.getenv("HERMES_SESSION_CHAT_ID") is None
+    assert os.getenv("HERMES_SESSION_CHAT_NAME") is None
+    assert os.getenv("HERMES_SESSION_THREAD_ID") is None
@@ -0,0 +1,81 @@
+"""Tests for SSL certificate auto-detection in gateway/run.py."""
+
+import importlib
+import os
+from unittest.mock import patch, MagicMock
+
+
+def _load_ensure_ssl():
+    """Return a standalone replica of ``_ensure_ssl_certs`` for testing.
+
+    The function body is defined inline below and executed in a throwaway
+    module, so ``gateway/run.py`` (which has heavy deps) is never imported.
+
+    NOTE(review): because this is a copy, the tests in this file exercise the
+    replica rather than the function in ``gateway/run.py`` — confirm the two
+    stay in sync.
+    """
+    # Replicate the logic in a controlled way instead of importing the real
+    # module, to avoid its import-time side effects.  ``_ssl`` is not
+    # referenced in this helper, hence the F401 suppression.
+    from types import ModuleType
+    import textwrap, ssl as _ssl  # noqa: F401
+
+    code = textwrap.dedent("""\
+    import os, ssl
+
+    def _ensure_ssl_certs():
+        if "SSL_CERT_FILE" in os.environ:
+            return
+        paths = ssl.get_default_verify_paths()
+        for candidate in (paths.cafile, paths.openssl_cafile):
+            if candidate and os.path.exists(candidate):
+                os.environ["SSL_CERT_FILE"] = candidate
+                return
+        try:
+            import certifi
+            os.environ["SSL_CERT_FILE"] = certifi.where()
+            return
+        except ImportError:
+            pass
+        for candidate in (
+            "/etc/ssl/certs/ca-certificates.crt",
+            "/etc/ssl/cert.pem",
+        ):
+            if os.path.exists(candidate):
+                os.environ["SSL_CERT_FILE"] = candidate
+                return
+    """)
+    # exec() runs only the fixed literal above (no external input) inside a
+    # fresh module namespace, from which the function object is returned.
+    mod = ModuleType("_ssl_helper")
+    exec(code, mod.__dict__)
+    return mod._ensure_ssl_certs
+
+
+class TestEnsureSslCerts:
+    """Tests for the SSL_CERT_FILE auto-detection logic from ``_load_ensure_ssl``."""
+
+    def test_respects_existing_env_var(self):
+        """An SSL_CERT_FILE that is already set must be left untouched."""
+        fn = _load_ensure_ssl()
+        with patch.dict(os.environ, {"SSL_CERT_FILE": "/custom/ca.pem"}):
+            fn()
+            assert os.environ["SSL_CERT_FILE"] == "/custom/ca.pem"
+
+    def test_sets_from_ssl_default_paths(self, tmp_path):
+        """An existing ``cafile`` from ssl's default verify paths is exported."""
+        fn = _load_ensure_ssl()
+        cert = tmp_path / "ca.crt"
+        cert.write_text("FAKE CERT")
+
+        mock_paths = MagicMock()
+        mock_paths.cafile = str(cert)
+        mock_paths.openssl_cafile = None
+
+        # Run with a copy of the environment that lacks SSL_CERT_FILE so the
+        # early-return branch is not taken.
+        env = {k: v for k, v in os.environ.items() if k != "SSL_CERT_FILE"}
+        with patch.dict(os.environ, env, clear=True), \
+             patch("ssl.get_default_verify_paths", return_value=mock_paths):
+            fn()
+            assert os.environ.get("SSL_CERT_FILE") == str(cert)
+
+    def test_no_op_when_nothing_found(self):
+        """With no candidate path and no certifi, the variable stays unset."""
+        fn = _load_ensure_ssl()
+        mock_paths = MagicMock()
+        mock_paths.cafile = None
+        mock_paths.openssl_cafile = None
+
+        env = {k: v for k, v in os.environ.items() if k != "SSL_CERT_FILE"}
+        # os.path.exists -> False disables every filesystem candidate, and a
+        # None entry in sys.modules makes ``import certifi`` raise ImportError.
+        with patch.dict(os.environ, env, clear=True), \
+             patch("ssl.get_default_verify_paths", return_value=mock_paths), \
+             patch("os.path.exists", return_value=False), \
+             patch.dict("sys.modules", {"certifi": None}):
+            fn()
+            assert "SSL_CERT_FILE" not in os.environ
@@ -26,6 +26,22 @@ class TestGatewayPidState:
        assert status.get_running_pid() is None
        assert not pid_path.exists()

+    def test_get_running_pid_accepts_gateway_metadata_when_cmdline_unavailable(self, tmp_path, monkeypatch):
+        """The PID-file metadata is accepted when the process cmdline cannot be read."""
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        pid_path = tmp_path / "gateway.pid"
+        pid_path.write_text(json.dumps({
+            "pid": os.getpid(),
+            "kind": "hermes-gateway",
+            "argv": ["python", "-m", "hermes_cli.main", "gateway"],
+            "start_time": 123,
+        }))
+
+        # os.kill is stubbed to a no-op, the reported start time matches the
+        # recorded one (123), and the cmdline lookup returns None.
+        monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
+        monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
+
+        assert status.get_running_pid() == os.getpid()
+

 class TestGatewayRuntimeStatus:
    def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
@@ -0,0 +1,133 @@
+"""Tests for gateway /status behavior and token persistence."""
+
+from datetime import datetime
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from gateway.config import GatewayConfig, Platform, PlatformConfig
+from gateway.platforms.base import MessageEvent
+from gateway.session import SessionEntry, SessionSource, build_session_key
+
+
+def _make_source() -> SessionSource:
+    """Build the Telegram DM ``SessionSource`` used throughout this module."""
+    fields = dict(
+        platform=Platform.TELEGRAM,
+        user_id="u1",
+        chat_id="c1",
+        user_name="tester",
+        chat_type="dm",
+    )
+    return SessionSource(**fields)
+
+
+def _make_event(text: str) -> MessageEvent:
+    """Wrap *text* in a ``MessageEvent`` coming from the shared test source."""
+    origin = _make_source()
+    return MessageEvent(text=text, source=origin, message_id="m1")
+
+
+def _make_runner(session_entry: SessionEntry):
+    """Build a ``GatewayRunner`` with stubbed collaborators for these tests.
+
+    The runner is created without running ``__init__``; every attribute the
+    tests rely on is assigned by hand.  The mocked session store always
+    returns *session_entry* from ``get_or_create_session``.
+    """
+    # Imported lazily so gateway.run is only loaded when a runner is needed.
+    from gateway.run import GatewayRunner
+
+    runner = object.__new__(GatewayRunner)
+    runner.config = GatewayConfig(
+        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")}
+    )
+    adapter = MagicMock()
+    adapter.send = AsyncMock()
+    runner.adapters = {Platform.TELEGRAM: adapter}
+    runner._voice_mode = {}
+    runner.hooks = SimpleNamespace(emit=AsyncMock(), loaded_hooks=False)
+    # Session store: an empty transcript by default; tests override as needed.
+    runner.session_store = MagicMock()
+    runner.session_store.get_or_create_session.return_value = session_entry
+    runner.session_store.load_transcript.return_value = []
+    runner.session_store.has_any_sessions.return_value = True
+    runner.session_store.append_to_transcript = MagicMock()
+    runner.session_store.rewrite_transcript = MagicMock()
+    runner.session_store.update_session = MagicMock()
+    runner._running_agents = {}
+    runner._pending_messages = {}
+    runner._pending_approvals = {}
+    runner._session_db = None
+    runner._reasoning_config = None
+    runner._provider_routing = {}
+    runner._fallback_model = None
+    runner._show_reasoning = False
+    # Behavioural hooks replaced with inert stand-ins: every user is
+    # authorized, no session env is exported, and no voice reply is sent.
+    runner._is_user_authorized = lambda _source: True
+    runner._set_session_env = lambda _context: None
+    runner._should_send_voice_reply = lambda *_args, **_kwargs: False
+    runner._send_voice_reply = AsyncMock()
+    runner._capture_gateway_honcho_if_configured = lambda *args, **kwargs: None
+    runner._emit_gateway_run_progress = AsyncMock()
+    return runner
+
+
+@pytest.mark.asyncio
+async def test_status_command_reports_running_agent_without_interrupt(monkeypatch):
+    """``/status`` reports tokens and the running agent without interrupting it."""
+    session_entry = SessionEntry(
+        session_key=build_session_key(_make_source()),
+        session_id="sess-1",
+        created_at=datetime.now(),
+        updated_at=datetime.now(),
+        platform=Platform.TELEGRAM,
+        chat_type="dm",
+        total_tokens=321,
+    )
+    runner = _make_runner(session_entry)
+    # Register an agent as running under the same session key as the event.
+    running_agent = MagicMock()
+    runner._running_agents[build_session_key(_make_source())] = running_agent
+
+    result = await runner._handle_message(_make_event("/status"))
+
+    assert "**Tokens:** 321" in result
+    assert "**Agent Running:** Yes ⚡" in result
+    # The command must neither interrupt the agent nor queue a pending message.
+    running_agent.interrupt.assert_not_called()
+    assert runner._pending_messages == {}
+
+
+@pytest.mark.asyncio
+async def test_handle_message_persists_agent_token_counts(monkeypatch):
+    """Token counts and model returned by the agent run are saved to the session."""
+    import gateway.run as gateway_run
+
+    session_entry = SessionEntry(
+        session_key=build_session_key(_make_source()),
+        session_id="sess-1",
+        created_at=datetime.now(),
+        updated_at=datetime.now(),
+        platform=Platform.TELEGRAM,
+        chat_type="dm",
+    )
+    runner = _make_runner(session_entry)
+    runner.session_store.load_transcript.return_value = [{"role": "user", "content": "earlier"}]
+    # Replace the agent run with a canned result carrying the token figures
+    # that the final assertion expects to be forwarded to update_session().
+    runner._run_agent = AsyncMock(
+        return_value={
+            "final_response": "ok",
+            "messages": [],
+            "tools": [],
+            "history_offset": 0,
+            "last_prompt_tokens": 80,
+            "input_tokens": 120,
+            "output_tokens": 45,
+            "model": "openai/test-model",
+        }
+    )
+
+    # Stub runtime credential resolution and the context-length lookup.
+    monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"})
+    monkeypatch.setattr(
+        "agent.model_metadata.get_model_context_length",
+        lambda *_args, **_kwargs: 100000,
+    )
+
+    result = await runner._handle_message(_make_event("hello"))
+
+    assert result == "ok"
+    runner.session_store.update_session.assert_called_once_with(
+        session_entry.session_key,
+        input_tokens=120,
+        output_tokens=45,
+        last_prompt_tokens=80,
+        model="openai/test-model",
+    )
@@ -51,3 +51,27 @@ async def test_enrich_message_with_transcription_skips_when_stt_disabled():

    assert "transcription is disabled" in result.lower()
    assert "caption" in result
+
+
+@pytest.mark.asyncio
+async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
+    """A backend key error yields the generic failure note, not "no provider"."""
+    from gateway.run import GatewayRunner
+
+    runner = GatewayRunner.__new__(GatewayRunner)
+    runner.config = GatewayConfig(stt_enabled=True)
+
+    # Transcription fails with a missing-key error while no STT model is
+    # configured.
+    with patch(
+        "tools.transcription_tools.transcribe_audio",
+        return_value={"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"},
+    ), patch(
+        "tools.transcription_tools.get_stt_model_from_config",
+        return_value=None,
+    ):
+        result = await runner._enrich_message_with_transcription(
+            "caption",
+            ["/tmp/voice.ogg"],
+        )
+
+    assert "No STT provider is configured" not in result
+    assert "trouble transcribing" in result
+    # The user's original caption must survive in the enriched message.
+    assert "caption" in result
@@ -12,6 +12,7 @@ import asyncio
 import importlib
 import os
 import sys
+from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock, patch

 import pytest
@@ -351,6 +352,26 @@ class TestDocumentDownloadBlock:
 # ---------------------------------------------------------------------------

 class TestMediaGroups:
+    @pytest.mark.asyncio
+    async def test_non_album_photo_burst_is_buffered_and_combined(self, adapter):
+        """Two photo messages sent back to back are delivered as one event."""
+        first_photo = _make_photo(_make_file_obj(b"first"))
+        second_photo = _make_photo(_make_file_obj(b"second"))
+
+        # Only the first message carries a caption.
+        msg1 = _make_message(caption="two images", photo=[first_photo])
+        msg2 = _make_message(photo=[second_photo])
+
+        with patch("gateway.platforms.telegram.cache_image_from_bytes", side_effect=["/tmp/burst-one.jpg", "/tmp/burst-two.jpg"]):
+            await adapter._handle_media_message(_make_update(msg1), MagicMock())
+            await adapter._handle_media_message(_make_update(msg2), MagicMock())
+            # Nothing is dispatched until the buffering window has elapsed.
+            assert adapter.handle_message.await_count == 0
+            await asyncio.sleep(adapter.MEDIA_GROUP_WAIT_SECONDS + 0.05)
+
+        adapter.handle_message.assert_awaited_once()
+        event = adapter.handle_message.await_args.args[0]
+        assert event.text == "two images"
+        assert event.media_urls == ["/tmp/burst-one.jpg", "/tmp/burst-two.jpg"]
+        assert len(event.media_types) == 2
+
    @pytest.mark.asyncio
    async def test_photo_album_is_buffered_and_combined(self, adapter):
        first_photo = _make_photo(_make_file_obj(b"first"))
@@ -537,6 +558,51 @@ class TestSendDocument:
        assert call_kwargs["reply_to_message_id"] == 50


+class TestTelegramPhotoBatching:
+    """Bookkeeping of pending photo batches and their flush tasks."""
+
+    @pytest.mark.asyncio
+    async def test_flush_photo_batch_does_not_drop_newer_scheduled_task(self, adapter):
+        """A flush run by an older task leaves the newer registered task in place."""
+        old_task = MagicMock()
+        new_task = MagicMock()
+        batch_key = "session:photo-burst"
+        adapter._pending_photo_batch_tasks[batch_key] = new_task
+        adapter._pending_photo_batches[batch_key] = MessageEvent(
+            text="",
+            message_type=MessageType.PHOTO,
+            source=SimpleNamespace(channel_id="chat-1"),
+            media_urls=["/tmp/a.jpg"],
+            media_types=["image/jpeg"],
+        )
+
+        # current_task() reports old_task while new_task is the registered
+        # one; sleep is mocked so the flush does not actually wait.
+        with (
+            patch("gateway.platforms.telegram.asyncio.current_task", return_value=old_task),
+            patch("gateway.platforms.telegram.asyncio.sleep", new=AsyncMock()),
+        ):
+            await adapter._flush_photo_batch(batch_key)
+
+        assert adapter._pending_photo_batch_tasks[batch_key] is new_task
+
+    @pytest.mark.asyncio
+    async def test_disconnect_cancels_pending_photo_batch_tasks(self, adapter):
+        """``disconnect`` cancels unfinished flush tasks and clears both maps."""
+        task = MagicMock()
+        task.done.return_value = False
+        adapter._pending_photo_batch_tasks["session:photo-burst"] = task
+        adapter._pending_photo_batches["session:photo-burst"] = MessageEvent(
+            text="",
+            message_type=MessageType.PHOTO,
+            source=SimpleNamespace(channel_id="chat-1"),
+        )
+        # Give the adapter an application object whose shutdown calls are awaitable.
+        adapter._app = MagicMock()
+        adapter._app.updater.stop = AsyncMock()
+        adapter._app.stop = AsyncMock()
+        adapter._app.shutdown = AsyncMock()
+
+        await adapter.disconnect()
+
+        task.cancel.assert_called_once()
+        assert adapter._pending_photo_batch_tasks == {}
+        assert adapter._pending_photo_batches == {}
+
+
 # ---------------------------------------------------------------------------
 # TestSendVideo — outbound video delivery
 # ---------------------------------------------------------------------------
@@ -7,7 +7,7 @@ or corrupt user-visible content.

 import re
 import sys
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock

 import pytest

@@ -392,3 +392,27 @@ class TestStripMdv2:

    def test_empty_string(self):
        assert _strip_mdv2("") == ""
+
+
+@pytest.mark.asyncio
+async def test_send_escapes_chunk_indicator_for_markdownv2(adapter):
+    """Each chunk of a split message ends with a backslash-escaped ``(i/n)`` marker."""
+    # A small limit forces the content below to be split into several chunks.
+    adapter.MAX_MESSAGE_LENGTH = 80
+    adapter._bot = MagicMock()
+
+    sent_texts = []
+
+    async def _fake_send_message(**kwargs):
+        # Record the outgoing text and return a stub message with an id.
+        sent_texts.append(kwargs["text"])
+        msg = MagicMock()
+        msg.message_id = len(sent_texts)
+        return msg
+
+    adapter._bot.send_message = AsyncMock(side_effect=_fake_send_message)
+
+    content = ("**bold** chunk content " * 12).strip()
+    result = await adapter.send("123", content)
+
+    assert result.success is True
+    assert len(sent_texts) > 1
+    # The pattern requires literal backslashes before both parentheses,
+    # i.e. a trailing " \(1/3\)"-style suffix.
+    assert re.search(r" \\\([0-9]+/[0-9]+\\\)$", sent_texts[0])
+    assert re.search(r" \\\([0-9]+/[0-9]+\\\)$", sent_texts[-1])
@@ -0,0 +1,49 @@
+import asyncio
+from unittest.mock import MagicMock
+
+import pytest
+
+from gateway.config import GatewayConfig, Platform, PlatformConfig
+from gateway.platforms.base import MessageEvent, MessageType
+from gateway.session import SessionSource, build_session_key
+from gateway.run import GatewayRunner
+
+
+class _PendingAdapter:
+    """Minimal adapter stand-in exposing only a ``_pending_messages`` dict."""
+
+    def __init__(self):
+        # Keyed by session key in the test below.
+        self._pending_messages = {}
+
+
+def _make_runner():
+    """Build a ``GatewayRunner`` without ``__init__``, wired to a ``_PendingAdapter``."""
+    runner = object.__new__(GatewayRunner)
+    runner.config = GatewayConfig(platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")})
+    runner.adapters = {Platform.TELEGRAM: _PendingAdapter()}
+    runner._running_agents = {}
+    runner._pending_messages = {}
+    runner._pending_approvals = {}
+    runner._voice_mode = {}
+    # Authorize every sender so the message reaches the handling logic.
+    runner._is_user_authorized = lambda _source: True
+    return runner
+
+
+@pytest.mark.asyncio
+async def test_handle_message_does_not_priority_interrupt_photo_followup():
+    """A photo arriving while an agent runs is queued instead of interrupting it."""
+    runner = _make_runner()
+    source = SessionSource(platform=Platform.TELEGRAM, chat_id="12345", chat_type="dm")
+    session_key = build_session_key(source)
+    # Mark an agent as already running for this session.
+    running_agent = MagicMock()
+    runner._running_agents[session_key] = running_agent
+
+    event = MessageEvent(
+        text="caption",
+        message_type=MessageType.PHOTO,
+        source=source,
+        media_urls=["/tmp/photo-a.jpg"],
+        media_types=["image/jpeg"],
+    )
+
+    result = await runner._handle_message(event)
+
+    assert result is None
+    running_agent.interrupt.assert_not_called()
+    # The event is parked on the adapter's pending map under the session key.
+    assert runner.adapters[Platform.TELEGRAM]._pending_messages[session_key] is event
@@ -1,5 +1,6 @@
 """Tests for the /voice command and auto voice reply in the gateway."""

+import importlib.util
 import json
 import os
 import queue
@@ -206,9 +207,11 @@ class TestAutoVoiceReply:
      2. gateway _send_voice_reply: fires based on voice_mode setting

    To prevent double audio, _send_voice_reply is skipped when voice input
-    already triggered base adapter auto-TTS (skip_double = is_voice_input).
-    Exception: Discord voice channel — both auto-TTS and Discord play_tts
-    override skip, so the runner must handle it via play_in_voice_channel.
+    already triggered base adapter auto-TTS.
+
+    For Discord voice channels, the base adapter now routes play_tts directly
+    into VC playback, so the runner should still skip voice-input follow-ups to
+    avoid double playback.
    """

    @pytest.fixture
@@ -292,14 +295,14 @@ class TestAutoVoiceReply:

    # -- Discord VC exception: runner must handle --------------------------

-    def test_discord_vc_voice_input_runner_fires(self, runner):
-        """Discord VC + voice input: base play_tts skips (VC override),
-        so runner must handle via play_in_voice_channel."""
-        assert self._call(runner, "all", MessageType.VOICE, in_voice_channel=True) is True
+    def test_discord_vc_voice_input_base_handles(self, runner):
+        """Discord VC + voice input: base adapter play_tts plays in VC,
+        so runner skips to avoid double playback."""
+        assert self._call(runner, "all", MessageType.VOICE, in_voice_channel=True) is False

-    def test_discord_vc_voice_only_runner_fires(self, runner):
-        """Discord VC + voice_only + voice: runner must handle."""
-        assert self._call(runner, "voice_only", MessageType.VOICE, in_voice_channel=True) is True
+    def test_discord_vc_voice_only_base_handles(self, runner):
+        """Discord VC + voice_only + voice: base adapter handles."""
+        assert self._call(runner, "voice_only", MessageType.VOICE, in_voice_channel=True) is False

    # -- Edge cases --------------------------------------------------------

@@ -422,17 +425,23 @@ class TestDiscordPlayTtsSkip:
        return adapter

    @pytest.mark.asyncio
-    async def test_play_tts_skipped_when_in_vc(self):
+    async def test_play_tts_plays_in_vc_when_connected(self):
        adapter = self._make_discord_adapter()
        # Simulate bot in voice channel for guild 111, text channel 123
        mock_vc = MagicMock()
        mock_vc.is_connected.return_value = True
+        mock_vc.is_playing.return_value = False
        adapter._voice_clients[111] = mock_vc
        adapter._voice_text_channels[111] = 123

+        # Mock play_in_voice_channel to avoid actual ffmpeg call
+        async def fake_play(gid, path):
+            return True
+        adapter.play_in_voice_channel = fake_play
+
        result = await adapter.play_tts(chat_id="123", audio_path="/tmp/test.ogg")
+        # play_tts now plays in VC instead of being a no-op
        assert result.success is True
-        # send_voice should NOT have been called (no client, would fail)

    @pytest.mark.asyncio
    async def test_play_tts_not_skipped_when_not_in_vc(self):
@@ -728,6 +737,24 @@ class TestVoiceChannelCommands:
        result = await runner._handle_voice_channel_join(event)
        assert "failed" in result.lower()

+    @pytest.mark.asyncio
+    async def test_join_missing_voice_dependencies(self, runner):
+        """Missing PyNaCl/davey should return a user-actionable install hint."""
+        mock_channel = MagicMock()
+        mock_channel.name = "General"
+        mock_adapter = AsyncMock()
+        # Joining raises a RuntimeError whose message mentions PyNaCl.
+        mock_adapter.join_voice_channel = AsyncMock(
+            side_effect=RuntimeError("PyNaCl library needed in order to use voice")
+        )
+        mock_adapter.get_user_voice_channel = AsyncMock(return_value=mock_channel)
+        event = self._make_discord_event()
+        runner.adapters[event.source.platform] = mock_adapter
+
+        result = await runner._handle_voice_channel_join(event)
+
+        # The reply must name the problem and the package extra to install.
+        assert "voice dependencies are missing" in result.lower()
+        assert "hermes-agent[messaging]" in result
+
    # -- _handle_voice_channel_leave --

    @pytest.mark.asyncio
@@ -2031,3 +2058,534 @@ class TestDisconnectVoiceCleanup:
        assert len(adapter._voice_receivers) == 0
        assert len(adapter._voice_listen_tasks) == 0
        assert len(adapter._voice_timeout_tasks) == 0
+
+
+# =====================================================================
+# Discord Voice Channel Flow Tests
+# =====================================================================
+
+
+@pytest.mark.skipif(
+    importlib.util.find_spec("nacl") is None,
+    reason="PyNaCl not installed",
+)
+class TestVoiceReception:
+    """Audio reception: SSRC mapping, DAVE passthrough, buffer lifecycle."""
+
+    @staticmethod
+    def _make_receiver(allowed_ids=None, members=None, dave=False, bot_id=9999):
+        """Create a ``VoiceReceiver`` around a mocked voice client.
+
+        ``allowed_ids`` is passed through as ``allowed_user_ids``; ``members``
+        becomes the channel member list (empty when omitted); ``dave=True``
+        installs a mock DAVE session, otherwise it is ``None``; ``bot_id`` is
+        used both as the connection SSRC and as the bot's user id.
+        """
+        from gateway.platforms.discord import VoiceReceiver
+        vc = MagicMock()
+        vc._connection.secret_key = [0] * 32
+        vc._connection.dave_session = MagicMock() if dave else None
+        vc._connection.ssrc = bot_id
+        vc._connection.add_socket_listener = MagicMock()
+        vc._connection.remove_socket_listener = MagicMock()
+        vc._connection.hook = None
+        vc.user = SimpleNamespace(id=bot_id)
+        vc.channel = MagicMock()
+        vc.channel.members = members or []
+        receiver = VoiceReceiver(vc, allowed_user_ids=allowed_ids)
+        return receiver
+
+    @staticmethod
+    def _fill_buffer(receiver, ssrc, duration_s=1.0, age_s=3.0):
+        """Seed *receiver* with silent PCM for *ssrc*, last seen *age_s* seconds ago.
+
+        48kHz stereo 16-bit audio is 192000 bytes per second, so the buffer
+        holds ``duration_s`` seconds of zeroed samples.
+        """
+        byte_count = int(192000 * duration_s)
+        silence = bytearray(byte_count)
+        receiver._buffers[ssrc] = silence
+        last_seen = time.monotonic() - age_s
+        receiver._last_packet_time[ssrc] = last_seen
+
+    # -- Known SSRC (normal flow) --
+
+    def test_known_ssrc_returns_completed(self):
+        """A mapped SSRC with aged audio is returned once and its buffer emptied."""
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver.map_ssrc(100, 42)
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        # The first element of each completed entry is the mapped user id.
+        assert completed[0][0] == 42
+        assert len(receiver._buffers[100]) == 0  # cleared
+
+    def test_known_ssrc_short_buffer_ignored(self):
+        """A 0.1 s utterance from a mapped SSRC is not reported as completed."""
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver.map_ssrc(100, 42)
+        self._fill_buffer(receiver, 100, duration_s=0.1)  # too short
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    def test_known_ssrc_recent_audio_waits(self):
+        """Audio whose last packet just arrived is not yet reported as completed."""
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver.map_ssrc(100, 42)
+        self._fill_buffer(receiver, 100, age_s=0.0)  # just arrived
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    # -- Unknown SSRC + DAVE passthrough --
+
+    def test_unknown_ssrc_no_automap_no_completed(self):
+        """Unknown SSRC, no members to infer — buffer cleared, not returned."""
+        receiver = self._make_receiver(dave=True, members=[])
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+        assert len(receiver._buffers[100]) == 0
+
+    def test_unknown_ssrc_late_speaking_event(self):
+        """Audio buffered before SPEAKING → SPEAKING maps → next check returns it."""
+        receiver = self._make_receiver(dave=True)
+        receiver.start()
+        self._fill_buffer(receiver, 100, age_s=0.0)  # still receiving
+        # No user yet
+        assert receiver.check_silence() == []
+        # SPEAKING event arrives
+        receiver.map_ssrc(100, 42)
+        # Silence kicks in
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+    # -- SSRC auto-mapping --
+
+    def test_automap_single_allowed_user(self):
+        """An unmapped SSRC is attributed to the only allowed non-bot member."""
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        # The allow-list holds string ids while member ids are ints.
+        receiver = self._make_receiver(allowed_ids={"42"}, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+        # The inferred mapping is stored on the receiver.
+        assert receiver._ssrc_to_user[100] == 42
+
+    def test_automap_multiple_allowed_users_no_map(self):
+        """With two allowed members present, an unmapped SSRC yields nothing."""
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+            SimpleNamespace(id=43, name="Bob"),
+        ]
+        receiver = self._make_receiver(allowed_ids={"42", "43"}, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    def test_automap_no_allowlist_single_member(self):
+        """No allowed_user_ids → sole non-bot member inferred."""
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = self._make_receiver(allowed_ids=None, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+    def test_automap_unallowed_user_rejected(self):
+        """User in channel but not in allowed list — not mapped."""
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = self._make_receiver(allowed_ids={"99"}, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    def test_automap_only_bot_in_channel(self):
+        """Only bot in channel — no one to map to."""
+        members = [SimpleNamespace(id=9999, name="Bot")]
+        receiver = self._make_receiver(allowed_ids=None, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    def test_automap_persists_across_calls(self):
+        """Auto-mapped SSRC stays mapped for subsequent checks."""
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = self._make_receiver(allowed_ids={"42"}, members=members)
+        receiver.start()
+        self._fill_buffer(receiver, 100)
+        receiver.check_silence()
+        assert receiver._ssrc_to_user[100] == 42
+        # Second utterance — should use cached mapping
+        self._fill_buffer(receiver, 100)
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+    # -- Stale buffer cleanup --
+
+    def test_stale_unknown_buffer_discarded(self):
+        """Buffer with no user and very old timestamp is discarded."""
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver._buffers[200] = bytearray(b"\x00" * 100)
+        receiver._last_packet_time[200] = time.monotonic() - 10.0
+        receiver.check_silence()
+        assert 200 not in receiver._buffers
+
+    # -- Pause / resume (echo prevention) --
+
+    def test_paused_receiver_ignores_packets(self):
+        """While paused, ``_on_packet`` must not create any buffer."""
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver.pause()
+        receiver._on_packet(b"\x00" * 100)
+        assert len(receiver._buffers) == 0
+
+    def test_resumed_receiver_accepts_packets(self):
+        """``resume`` after ``pause`` clears the receiver's paused flag.
+
+        NOTE(review): despite the name, no packet is fed in here; only the
+        ``_paused`` flag is checked.
+        """
+        receiver = self._make_receiver()
+        receiver.start()
+        receiver.pause()
+        receiver.resume()
+        assert receiver._paused is False
+
+    # -- _on_packet DAVE passthrough behavior --
+
+    def _make_receiver_with_nacl(self, dave_session=None, mapped_ssrcs=None):
+        """Create a receiver that can process _on_packet with mocked NaCl + Opus.
+
+        ``dave_session`` is installed on the mocked connection as-is (``None``
+        means no DAVE).  ``mapped_ssrcs`` is an optional ``{ssrc: user_id}``
+        dict registered through ``map_ssrc``.  The receiver is returned
+        already started, with bot SSRC/user id 9999 and no channel members.
+        """
+        from gateway.platforms.discord import VoiceReceiver
+        vc = MagicMock()
+        vc._connection.secret_key = [0] * 32
+        vc._connection.dave_session = dave_session
+        vc._connection.ssrc = 9999
+        vc._connection.add_socket_listener = MagicMock()
+        vc._connection.remove_socket_listener = MagicMock()
+        vc._connection.hook = None
+        vc.user = SimpleNamespace(id=9999)
+        vc.channel = MagicMock()
+        vc.channel.members = []
+        receiver = VoiceReceiver(vc)
+        receiver.start()
+        # Pre-map SSRCs if provided
+        if mapped_ssrcs:
+            for ssrc, uid in mapped_ssrcs.items():
+                receiver.map_ssrc(ssrc, uid)
+        return receiver
+
+    @staticmethod
+    def _build_rtp_packet(ssrc=100, seq=1, timestamp=960):
+        """Assemble a 36-byte fake RTP datagram to feed into ``_on_packet``.
+
+        Layout: 12-byte RTP header, 20 placeholder payload bytes, then a
+        4-byte nonce.  The payload is all zeros because NaCl decryption is
+        mocked in the tests, so its content never matters.
+        """
+        import struct
+
+        # Header fields: 0x80 (version 2, no extension, no CSRC), payload
+        # type 0x78, then sequence number, timestamp and SSRC.
+        rtp_header = struct.pack(">BBHII", 0x80, 0x78, seq, timestamp, ssrc)
+        placeholder_ciphertext = bytes(20)
+        nonce = (1).to_bytes(4, "big")
+        return rtp_header + placeholder_ciphertext + nonce
+
+    def _inject_mock_decoder(self, receiver, ssrc):
+        """Pre-inject a mock Opus decoder for the given SSRC.
+
+        Returns the mock so callers can inspect its calls.
+        """
+        mock_decoder = MagicMock()
+        # 3840 zero bytes per decode call, i.e. 20 ms at the 192000 bytes/sec
+        # rate quoted in _fill_buffer.
+        mock_decoder.decode.return_value = b"\x00" * 3840
+        receiver._decoders[ssrc] = mock_decoder
+        return mock_decoder
+
+    def test_on_packet_dave_known_user_decrypt_ok(self):
+        """Known SSRC + DAVE decrypt success → audio buffered."""
+        dave = MagicMock()
+        dave.decrypt.return_value = b"\xf8\xff\xfe"
+        receiver = self._make_receiver_with_nacl(
+            dave_session=dave, mapped_ssrcs={100: 42}
+        )
+        self._inject_mock_decoder(receiver, 100)
+
+        with patch("nacl.secret.Aead") as mock_aead:
+            mock_aead.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            receiver._on_packet(self._build_rtp_packet(ssrc=100))
+
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+        dave.decrypt.assert_called_once()
+
+    def test_on_packet_dave_unknown_ssrc_passthrough(self):
+        """Unknown SSRC + DAVE → skip DAVE, attempt Opus decode (passthrough)."""
+        dave = MagicMock()
+        receiver = self._make_receiver_with_nacl(dave_session=dave)
+        self._inject_mock_decoder(receiver, 100)
+
+        with patch("nacl.secret.Aead") as mock_aead:
+            mock_aead.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            receiver._on_packet(self._build_rtp_packet(ssrc=100))
+
+        dave.decrypt.assert_not_called()
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_on_packet_dave_unencrypted_error_passthrough(self):
+        """DAVE decrypt 'Unencrypted' error → use data as-is, don't drop."""
+        dave = MagicMock()
+        dave.decrypt.side_effect = Exception(
+            "Failed to decrypt: DecryptionFailed(UnencryptedWhenPassthroughDisabled)"
+        )
+        receiver = self._make_receiver_with_nacl(
+            dave_session=dave, mapped_ssrcs={100: 42}
+        )
+        self._inject_mock_decoder(receiver, 100)
+
+        with patch("nacl.secret.Aead") as mock_aead:
+            mock_aead.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            receiver._on_packet(self._build_rtp_packet(ssrc=100))
+
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_on_packet_dave_other_error_drops(self):
+        """DAVE decrypt non-Unencrypted error → packet dropped (nothing buffered for SSRC 100)."""
+        dave = MagicMock()
+        dave.decrypt.side_effect = Exception("KeyRotationFailed")
+        receiver = self._make_receiver_with_nacl(dave_session=dave, mapped_ssrcs={100: 42})
+        # Inject the decoder like the sibling tests, so a packet that is NOT dropped would fill the buffer.
+        self._inject_mock_decoder(receiver, 100)
+
+        with patch("nacl.secret.Aead") as mock_aead:
+            mock_aead.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            receiver._on_packet(self._build_rtp_packet(ssrc=100))
+
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+    def test_on_packet_no_dave_direct_decode(self):
+        """With no DAVE session the packet is decoded directly and lands in the SSRC's buffer."""
+        rx = self._make_receiver_with_nacl(dave_session=None)
+        self._inject_mock_decoder(rx, 100)
+
+        with patch("nacl.secret.Aead") as aead_cls:
+            aead_cls.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            packet = self._build_rtp_packet(ssrc=100)
+            rx._on_packet(packet)
+        assert 100 in rx._buffers
+        assert len(rx._buffers[100]) > 0
+
+    def test_on_packet_bot_own_ssrc_ignored(self):
+        """Packets on the bot's own SSRC (9999 here) are discarded to prevent echo; nothing is buffered."""
+        rx = self._make_receiver_with_nacl()
+        with patch("nacl.secret.Aead"):
+            rx._on_packet(self._build_rtp_packet(ssrc=9999))
+        assert len(rx._buffers) == 0
+
+    def test_on_packet_multiple_ssrcs_separate_buffers(self):
+        """Packets from two SSRCs each get their own entry in the receiver's buffers."""
+        rx = self._make_receiver_with_nacl(dave_session=None)
+        ssrcs = (100, 200)
+        for ssrc in ssrcs:
+            self._inject_mock_decoder(rx, ssrc)
+        with patch("nacl.secret.Aead") as aead_cls:
+            aead_cls.return_value.decrypt.return_value = b"\xf8\xff\xfe"
+            for ssrc in ssrcs:
+                rx._on_packet(self._build_rtp_packet(ssrc=ssrc))
+
+        assert 100 in rx._buffers
+        assert 200 in rx._buffers
+
+
+class TestVoiceTTSPlayback:
+    """Discord play_tts routing (voice channel vs. send_voice fallback) and the runner's voice-reply decision."""
+
+    @staticmethod
+    def _make_discord_adapter():  # bare DiscordAdapter: __init__ is bypassed, only the attributes below are set
+        from gateway.platforms.discord import DiscordAdapter
+        from gateway.config import PlatformConfig, Platform
+        config = PlatformConfig(enabled=True, extra={})
+        config.token = "fake-token"
+        adapter = object.__new__(DiscordAdapter)
+        adapter.platform = Platform.DISCORD
+        adapter.config = config
+        adapter._voice_clients = {}
+        adapter._voice_text_channels = {}
+        adapter._voice_receivers = {}
+        return adapter
+
+    # -- play_tts behavior --
+
+    @pytest.mark.asyncio
+    async def test_play_tts_plays_in_vc(self):
+        """play_tts calls play_in_voice_channel when bot is in VC."""
+        adapter = self._make_discord_adapter()
+        mock_vc = MagicMock()
+        mock_vc.is_connected.return_value = True
+        adapter._voice_clients[111] = mock_vc
+        adapter._voice_text_channels[111] = 123  # guild 111 is linked to text channel 123
+
+        played = []
+        async def fake_play(gid, path):
+            played.append((gid, path))
+            return True
+        adapter.play_in_voice_channel = fake_play
+
+        result = await adapter.play_tts(chat_id="123", audio_path="/tmp/tts.ogg")
+        assert result.success is True
+        assert played == [(111, "/tmp/tts.ogg")]
+
+    @pytest.mark.asyncio
+    async def test_play_tts_fallback_when_not_in_vc(self):
+        """play_tts sends as file attachment when bot is not in VC."""
+        adapter = self._make_discord_adapter()
+        from gateway.platforms.base import SendResult
+        adapter.send_voice = AsyncMock(return_value=SendResult(success=False, error="no client"))
+        result = await adapter.play_tts(chat_id="123", audio_path="/tmp/tts.ogg")
+        assert result.success is False
+        adapter.send_voice.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_play_tts_wrong_channel_no_match(self):
+        """play_tts falls back to send_voice if chat_id is for a different channel."""
+        adapter = self._make_discord_adapter()
+        mock_vc = MagicMock()
+        mock_vc.is_connected.return_value = True
+        adapter._voice_clients[111] = mock_vc
+        adapter._voice_text_channels[111] = 123
+        from gateway.platforms.base import SendResult
+        adapter.send_voice = AsyncMock(return_value=SendResult(success=True))
+        # Different chat_id — shouldn't match VC
+        result = await adapter.play_tts(chat_id="999", audio_path="/tmp/tts.ogg")
+        adapter.send_voice.assert_called_once()
+        assert result.success is True  # the send_voice result is what the caller gets back
+
+    # -- Runner dedup --
+
+    @staticmethod
+    def _make_runner():  # bare GatewayRunner: __init__ is bypassed, only the attributes below are set
+        from gateway.run import GatewayRunner
+        runner = object.__new__(GatewayRunner)
+        runner._voice_mode = {}
+        runner.adapters = {}
+        return runner
+
+    def _call_should_reply(self, runner, voice_mode, msg_type, response="Hello", agent_msgs=None):
+        from gateway.platforms.base import MessageEvent, SessionSource
+        from gateway.config import Platform
+        runner._voice_mode["ch1"] = voice_mode  # voice mode is keyed by chat id
+        source = SessionSource(
+            platform=Platform.DISCORD, chat_id="ch1",
+            user_id="1", user_name="test", chat_type="channel",
+        )
+        event = MessageEvent(source=source, text="test", message_type=msg_type)
+        return runner._should_send_voice_reply(event, response, agent_msgs or [])
+
+    def test_voice_input_runner_skips(self):
+        """Voice input: runner skips — base adapter handles via play_tts."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.VOICE) is False
+
+    def test_text_input_voice_all_runner_fires(self):
+        """Text input + voice_mode=all: runner generates TTS."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.TEXT) is True
+
+    def test_text_input_voice_off_no_tts(self):
+        """Text input + voice_mode=off: no TTS."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "off", MessageType.TEXT) is False
+
+    def test_text_input_voice_only_no_tts(self):
+        """Text input + voice_mode=voice_only: no TTS for text."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "voice_only", MessageType.TEXT) is False
+
+    def test_error_response_no_tts(self):
+        """Error response: no TTS even with voice_mode=all."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.TEXT, response="Error: boom") is False
+
+    def test_empty_response_no_tts(self):
+        """Empty response: no TTS."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        assert self._call_should_reply(runner, "all", MessageType.TEXT, response="") is False
+
+    def test_agent_tts_tool_dedup(self):
+        """Agent already called text_to_speech tool: runner skips."""
+        from gateway.platforms.base import MessageType
+        runner = self._make_runner()
+        agent_msgs = [{"role": "assistant", "tool_calls": [
+            {"id": "1", "type": "function", "function": {"name": "text_to_speech", "arguments": "{}"}}
+        ]}]
+        assert self._call_should_reply(runner, "all", MessageType.TEXT, agent_msgs=agent_msgs) is False
+
+
+class TestUDPKeepalive:
+    """Tests for the keepalive silence frame sent by the Discord adapter's voice listen loop."""
+
+    def test_keepalive_interval_is_reasonable(self):  # sanity-bounds the class-level interval (seconds, per the message)
+        from gateway.platforms.discord import DiscordAdapter
+        interval = DiscordAdapter._KEEPALIVE_INTERVAL
+        assert 5 <= interval <= 30, f"Keepalive interval {interval}s should be between 5-30s"
+
+    @pytest.mark.asyncio
+    async def test_keepalive_sends_silence_frame(self):
+        """Listen loop sends the silence frame b'\\xf8\\xff\\xfe' via send_packet once the interval elapses."""
+        from gateway.platforms.discord import DiscordAdapter
+        from gateway.config import PlatformConfig, Platform
+
+        config = PlatformConfig(enabled=True, extra={})
+        config.token = "fake"
+        adapter = object.__new__(DiscordAdapter)  # bypass __init__; only the attributes set below exist
+        adapter.platform = Platform.DISCORD
+        adapter.config = config
+        adapter._voice_clients = {}
+        adapter._voice_text_channels = {}
+        adapter._voice_receivers = {}
+        adapter._voice_listen_tasks = {}
+
+        # Mock voice client for guild 111; its _connection is the object the final assertion inspects
+        mock_vc = MagicMock()
+        mock_vc.is_connected.return_value = True
+        mock_conn = MagicMock()
+        adapter._voice_clients[111] = mock_vc
+        mock_vc._connection = mock_conn
+
+        from gateway.platforms.discord import VoiceReceiver
+        mock_receiver_vc = MagicMock()  # separate mock client backing the real VoiceReceiver
+        mock_receiver_vc._connection.secret_key = [0] * 32
+        mock_receiver_vc._connection.dave_session = None
+        mock_receiver_vc._connection.ssrc = 9999
+        mock_receiver_vc._connection.add_socket_listener = MagicMock()
+        mock_receiver_vc._connection.remove_socket_listener = MagicMock()
+        mock_receiver_vc._connection.hook = None
+        receiver = VoiceReceiver(mock_receiver_vc)
+        receiver.start()
+        adapter._voice_receivers[111] = receiver
+
+        # Shorten the class-level keepalive interval for the test; restored in the finally block
+        original_interval = DiscordAdapter._KEEPALIVE_INTERVAL
+        DiscordAdapter._KEEPALIVE_INTERVAL = 0.1
+
+        try:
+            # Run listen loop briefly
+            import asyncio
+            loop_task = asyncio.create_task(adapter._voice_listen_loop(111))
+            await asyncio.sleep(0.3)  # three times the shortened interval
+            receiver._running = False  # stop loop
+            await asyncio.sleep(0.1)
+            loop_task.cancel()  # cancel in case the loop is still running
+            try:
+                await loop_task
+            except asyncio.CancelledError:
+                pass
+
+            # send_packet should have been called with silence frame
+            mock_conn.send_packet.assert_called_with(b'\xf8\xff\xfe')
+        finally:
+            DiscordAdapter._KEEPALIVE_INTERVAL = original_interval
@@ -0,0 +1,70 @@
+import importlib
+import os
+import sys
+from pathlib import Path
+
+from hermes_cli.env_loader import load_hermes_dotenv
+
+
+def test_user_env_overrides_stale_shell_values(tmp_path, monkeypatch):
+    """A value in <hermes_home>/.env replaces one already present in the process environment."""
+    hermes_home = tmp_path / "hermes"
+    hermes_home.mkdir()
+    user_env = hermes_home / ".env"
+    user_env.write_text("OPENAI_BASE_URL=https://new.example/v1\n", encoding="utf-8")
+    monkeypatch.setenv("OPENAI_BASE_URL", "https://old.example/v1")
+
+    loaded = load_hermes_dotenv(hermes_home=hermes_home)
+
+    assert loaded == [user_env]
+    assert os.getenv("OPENAI_BASE_URL") == "https://new.example/v1"
+
+
+def test_project_env_overrides_stale_shell_values_when_user_env_missing(tmp_path, monkeypatch):
+    """With no user .env (the hermes home is never created), the project .env overrides the shell value."""
+    missing_home = tmp_path / "hermes"
+    project_env = tmp_path / ".env"
+    project_env.write_text("OPENAI_BASE_URL=https://project.example/v1\n", encoding="utf-8")
+    monkeypatch.setenv("OPENAI_BASE_URL", "https://old.example/v1")
+
+    loaded = load_hermes_dotenv(hermes_home=missing_home, project_env=project_env)
+
+    assert loaded == [project_env]
+    assert os.getenv("OPENAI_BASE_URL") == "https://project.example/v1"
+
+
+def test_user_env_takes_precedence_over_project_env(tmp_path, monkeypatch):
+    """User .env wins for keys both files define; the project .env still supplies keys only it defines."""
+    home = tmp_path / "hermes"
+    home.mkdir()
+    user_env = home / ".env"
+    project_env = tmp_path / ".env"
+    user_env.write_text("OPENAI_BASE_URL=https://user.example/v1\n", encoding="utf-8")
+    project_env.write_text("OPENAI_BASE_URL=https://project.example/v1\nOPENAI_API_KEY=project-key\n", encoding="utf-8")
+    monkeypatch.setenv("OPENAI_BASE_URL", "https://old.example/v1")
+    # Set-then-delete registers the key with monkeypatch even when it was absent,
+    monkeypatch.setenv("OPENAI_API_KEY", "placeholder")  # so the loaded value is undone at teardown.
+    monkeypatch.delenv("OPENAI_API_KEY")
+
+    loaded = load_hermes_dotenv(hermes_home=home, project_env=project_env)
+    assert loaded == [user_env, project_env]
+    assert os.getenv("OPENAI_BASE_URL") == "https://user.example/v1" and os.getenv("OPENAI_API_KEY") == "project-key"
+
+
+def test_main_import_applies_user_env_over_shell_values(tmp_path, monkeypatch):
+    """Importing hermes_cli.main afresh applies the user .env over values already in the environment."""
+    hermes_home = tmp_path / "hermes"
+    hermes_home.mkdir()
+    dotenv_body = "OPENAI_BASE_URL=https://new.example/v1\nHERMES_INFERENCE_PROVIDER=custom\n"
+    (hermes_home / ".env").write_text(dotenv_body, encoding="utf-8")
+
+    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+    monkeypatch.setenv("OPENAI_BASE_URL", "https://old.example/v1")
+    monkeypatch.setenv("HERMES_INFERENCE_PROVIDER", "openrouter")
+
+    # Drop any cached module so the import below executes the module body again.
+    sys.modules.pop("hermes_cli.main", None)
+    importlib.import_module("hermes_cli.main")
+
+    assert os.getenv("OPENAI_BASE_URL") == "https://new.example/v1"
+    assert os.getenv("HERMES_INFERENCE_PROVIDER") == "custom"
@@ -39,7 +39,7 @@ def test_systemd_status_warns_when_linger_disabled(monkeypatch, tmp_path, capsys
    monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, ""))

    def fake_run(cmd, capture_output=False, text=False, check=False):
-        if cmd[:4] == ["systemctl", "--user", "status", gateway.SERVICE_NAME]:
+        if cmd[:4] == ["systemctl", "--user", "status", gateway.get_service_name()]:
            return SimpleNamespace(returncode=0, stdout="", stderr="")
        if cmd[:3] == ["systemctl", "--user", "is-active"]:
            return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
@@ -76,7 +76,7 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys):
    assert unit_path.exists()
    assert [cmd for cmd, _ in calls] == [
        ["systemctl", "--user", "daemon-reload"],
-        ["systemctl", "--user", "enable", gateway.SERVICE_NAME],
+        ["systemctl", "--user", "enable", gateway.get_service_name()],
    ]
    assert helper_calls == [True]
    assert "User service installed and enabled" in out
@@ -110,7 +110,7 @@ def test_systemd_install_system_scope_skips_linger_and_uses_systemctl(monkeypatc
    assert unit_path.read_text(encoding="utf-8") == "scope=True user=alice\n"
    assert [cmd for cmd, _ in calls] == [
        ["systemctl", "daemon-reload"],
-        ["systemctl", "enable", gateway.SERVICE_NAME],
+        ["systemctl", "enable", gateway.get_service_name()],
    ]
    assert helper_calls == []
    assert "Configured to run as: alice" not in out  # generated test unit has no User= line
@@ -114,7 +114,7 @@ def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys):
    assert unit_path.exists()
    assert [cmd for cmd, _ in calls] == [
        ["systemctl", "--user", "daemon-reload"],
-        ["systemctl", "--user", "enable", gateway.SERVICE_NAME],
+        ["systemctl", "--user", "enable", gateway.get_service_name()],
    ]
    assert helper_calls == [True]
    assert "User service installed and enabled" in out
@@ -26,7 +26,7 @@ class TestSystemdServiceRefresh:
        assert unit_path.read_text(encoding="utf-8") == "new unit\n"
        assert calls[:2] == [
            ["systemctl", "--user", "daemon-reload"],
-            ["systemctl", "--user", "start", gateway_cli.SERVICE_NAME],
+            ["systemctl", "--user", "start", gateway_cli.get_service_name()],
        ]

    def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch):
@@ -49,7 +49,7 @@ class TestSystemdServiceRefresh:
        assert unit_path.read_text(encoding="utf-8") == "new unit\n"
        assert calls[:2] == [
            ["systemctl", "--user", "daemon-reload"],
-            ["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME],
+            ["systemctl", "--user", "restart", gateway_cli.get_service_name()],
        ]


@@ -92,9 +92,9 @@ class TestGatewayServiceDetection:
        )

        def fake_run(cmd, capture_output=True, text=True, **kwargs):
-            if cmd == ["systemctl", "--user", "is-active", gateway_cli.SERVICE_NAME]:
+            if cmd == ["systemctl", "--user", "is-active", gateway_cli.get_service_name()]:
                return SimpleNamespace(returncode=0, stdout="inactive\n", stderr="")
-            if cmd == ["systemctl", "is-active", gateway_cli.SERVICE_NAME]:
+            if cmd == ["systemctl", "is-active", gateway_cli.get_service_name()]:
                return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
            raise AssertionError(f"Unexpected command: {cmd}")

@@ -7,6 +7,7 @@ from hermes_cli.models import (
    fetch_api_models,
    normalize_provider,
    parse_model_input,
+    probe_api_models,
    provider_label,
    provider_model_ids,
    validate_requested_model,
@@ -26,7 +27,15 @@ FAKE_API_MODELS = [

 def _validate(model, provider="openrouter", api_models=FAKE_API_MODELS, **kw):
    """Shortcut: call validate_requested_model with mocked API."""
-    with patch("hermes_cli.models.fetch_api_models", return_value=api_models):
+    probe_payload = {
+        "models": api_models,
+        "probed_url": "http://localhost:11434/v1/models",
+        "resolved_base_url": kw.get("base_url", "") or "http://localhost:11434/v1",
+        "suggested_base_url": None,
+        "used_fallback": False,
+    }
+    with patch("hermes_cli.models.fetch_api_models", return_value=api_models), \
+         patch("hermes_cli.models.probe_api_models", return_value=probe_payload):
        return validate_requested_model(model, provider, **kw)


@@ -147,6 +156,33 @@ class TestFetchApiModels:
        with patch("hermes_cli.models.urllib.request.urlopen", side_effect=Exception("timeout")):
            assert fetch_api_models("key", "https://example.com/v1") is None

+    def test_probe_api_models_tries_v1_fallback(self):
+        """probe_api_models retries under /v1 when <base>/models fails, and reports the fallback."""
+        class _StubResponse:
+            def __enter__(self):
+                return self
+
+            def __exit__(self, exc_type, exc, tb):
+                return False
+
+            def read(self):
+                return b'{"data": [{"id": "local-model"}]}'
+
+        requested_urls = []
+
+        def _urlopen_stub(req, timeout=5.0):
+            requested_urls.append(req.full_url)
+            if not req.full_url.endswith("/v1/models"):
+                raise Exception("404")
+            return _StubResponse()
+
+        with patch("hermes_cli.models.urllib.request.urlopen", side_effect=_urlopen_stub):
+            outcome = probe_api_models("key", "http://localhost:8000")
+        assert requested_urls == ["http://localhost:8000/models", "http://localhost:8000/v1/models"]
+        assert outcome["models"] == ["local-model"]
+        assert outcome["resolved_base_url"] == "http://localhost:8000/v1"
+        assert outcome["used_fallback"] is True
+

 # -- validate — format checks -----------------------------------------------

@@ -191,6 +227,7 @@ class TestValidateApiFound:
        )
        assert result["accepted"] is True
        assert result["persist"] is True
+        assert result["recognized"] is True


 # -- validate — API not found ------------------------------------------------
@@ -232,3 +269,26 @@ class TestValidateApiFallback:
        result = _validate("some-model", provider="totally-unknown", api_models=None)
        assert result["accepted"] is True
        assert result["persist"] is True
+
+    def test_custom_endpoint_warns_with_probed_url_and_v1_hint(self):
+        """A custom endpoint whose probe returns no models is still accepted; the message names the URLs."""
+        failed_probe = {
+            "models": None,
+            "probed_url": "http://localhost:8000/v1/models",
+            "resolved_base_url": "http://localhost:8000",
+            "suggested_base_url": "http://localhost:8000/v1",
+            "used_fallback": False,
+        }
+
+        with patch("hermes_cli.models.probe_api_models", return_value=failed_probe):
+            result = validate_requested_model(
+                "qwen3", "custom", api_key="local-key", base_url="http://localhost:8000"
+            )
+
+        assert result["accepted"] is True
+        assert result["persist"] is True
+
+        message = result["message"]
+        assert "http://localhost:8000/v1/models" in message
+        # NOTE(review): implied by the assert above (it is a substring of the probed URL).
+        assert "http://localhost:8000/v1" in message
@@ -1,6 +1,6 @@
 """Tests for the hermes_cli models module."""

-from hermes_cli.models import OPENROUTER_MODELS, menu_labels, model_ids
+from hermes_cli.models import OPENROUTER_MODELS, menu_labels, model_ids, detect_provider_for_model


 class TestModelIds:
@@ -54,3 +54,66 @@ class TestOpenRouterModels:
    def test_at_least_5_models(self):
        """Sanity check that the models list hasn't been accidentally truncated."""
        assert len(OPENROUTER_MODELS) >= 5
+
+
+class TestFindOpenrouterSlug:  # behaviour of hermes_cli.models._find_openrouter_slug
+    def test_exact_match(self):
+        from hermes_cli import models
+        assert models._find_openrouter_slug("anthropic/claude-opus-4.6") == "anthropic/claude-opus-4.6"
+
+    def test_bare_name_match(self):
+        from hermes_cli import models
+        slug = models._find_openrouter_slug("claude-opus-4.6")
+        assert slug == "anthropic/claude-opus-4.6"
+
+    def test_case_insensitive(self):
+        from hermes_cli import models
+        slug = models._find_openrouter_slug("Anthropic/Claude-Opus-4.6")
+        assert slug is not None
+
+    def test_unknown_returns_none(self):
+        from hermes_cli import models
+        assert models._find_openrouter_slug("totally-fake-model-xyz") is None
+
+
+class TestDetectProviderForModel:
+    def test_anthropic_model_detected(self):
+        """claude-opus-4-6 requested while on openai-codex resolves to the anthropic provider."""
+        detected = detect_provider_for_model("claude-opus-4-6", "openai-codex")
+        assert detected is not None
+        assert detected[0] == "anthropic"
+
+    def test_deepseek_model_detected(self):
+        """deepseek-chat resolves to either the deepseek or the openrouter provider."""
+        detected = detect_provider_for_model("deepseek-chat", "openai-codex")
+        assert detected is not None
+        # Provider is deepseek (direct) or openrouter (fallback) depending on creds
+        assert detected[0] in ("deepseek", "openrouter")
+
+    def test_current_provider_model_returns_none(self):
+        """A model of the current provider yields None, i.e. no provider switch."""
+        assert detect_provider_for_model("gpt-5.3-codex", "openai-codex") is None
+
+    def test_openrouter_slug_match(self):
+        """A full OpenRouter slug resolves to openrouter with the slug unchanged."""
+        detected = detect_provider_for_model("anthropic/claude-opus-4.6", "openai-codex")
+        assert detected is not None
+        assert detected[0] == "openrouter"
+        assert detected[1] == "anthropic/claude-opus-4.6"
+
+    def test_bare_name_gets_openrouter_slug(self):
+        """A bare model name is mapped to its full OpenRouter slug."""
+        detected = detect_provider_for_model("claude-opus-4.6", "openai-codex")
+        assert detected is not None
+        # Only the model slug is checked here, not the provider
+        assert detected[1] == "anthropic/claude-opus-4.6"
+
+    def test_unknown_model_returns_none(self):
+        """A model name that matches nothing yields None."""
+        assert detect_provider_for_model("nonexistent-model-xyz", "openai-codex") is None
+
+    def test_aggregator_not_suggested(self):
+        """nous is not suggested as the target provider for claude-opus-4-6."""
+        detected = detect_provider_for_model("claude-opus-4-6", "openai-codex")
+        assert detected is not None
+        assert detected[0] != "nous"  # nous has claude models but shouldn't be suggested
@@ -0,0 +1,64 @@
+import sys
+
+
+def test_sessions_delete_accepts_unique_id_prefix(monkeypatch, capsys):
+    """`sessions delete` resolves an id prefix to the full session id before deleting it."""
+    import hermes_cli.main as main_mod
+    import hermes_state
+
+    prefix = "20260315_092437_c9a6"
+    full_id = "20260315_092437_c9a6ff"
+    seen = {}
+
+    class StubSessionDB:
+        def resolve_session_id(self, session_id):
+            seen["resolved_from"] = session_id
+            return full_id
+
+        def delete_session(self, session_id):
+            seen["deleted"] = session_id
+            return True
+
+        def close(self):
+            seen["closed"] = True
+
+    monkeypatch.setattr(hermes_state, "SessionDB", lambda: StubSessionDB())
+    cli_args = ["hermes", "sessions", "delete", prefix, "--yes"]
+    monkeypatch.setattr(sys, "argv", cli_args)
+
+    main_mod.main()
+
+    printed = capsys.readouterr().out
+    assert seen == {
+        "resolved_from": prefix,
+        "deleted": full_id,
+        "closed": True,
+    }
+    assert "Deleted session '20260315_092437_c9a6ff'." in printed
+
+
+def test_sessions_delete_reports_not_found_when_prefix_is_unknown(monkeypatch, capsys):
+    """When the id cannot be resolved, `sessions delete` prints 'not found' and never deletes."""
+    import hermes_cli.main as main_mod
+    import hermes_state
+
+    # resolve_session_id returning None stands in for an unknown prefix.
+    class StubSessionDB:
+        def resolve_session_id(self, session_id):
+            return None
+
+        def delete_session(self, session_id):
+            raise AssertionError("delete_session should not be called when resolution fails")
+
+        def close(self):
+            pass
+
+    monkeypatch.setattr(hermes_state, "SessionDB", lambda: StubSessionDB())
+    monkeypatch.setattr(
+        sys, "argv", ["hermes", "sessions", "delete", "missing-prefix", "--yes"],
+    )
+
+    main_mod.main()
+
+    printed = capsys.readouterr().out
+    assert "Session 'missing-prefix' not found." in printed
@@ -115,3 +115,13 @@ class TestConfigYamlRouting:
        set_config_value("terminal.docker_image", "python:3.12")
        config = _read_config(_isolated_hermes_home)
        assert "python:3.12" in config
+
+    def test_terminal_docker_cwd_mount_flag_goes_to_config_and_env(self, _isolated_hermes_home):
+        """Setting terminal.docker_mount_cwd_to_workspace shows up in both the config and the env content."""
+        set_config_value("terminal.docker_mount_cwd_to_workspace", "true")
+        config_text = _read_config(_isolated_hermes_home)
+        env_text = _read_env(_isolated_hermes_home)
+        config_forms = ("docker_mount_cwd_to_workspace: 'true'", "docker_mount_cwd_to_workspace: true")
+        env_forms = ("TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE=true", "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE=True")
+        assert any(form in config_text for form in config_forms)
+        assert any(form in env_text for form in env_forms)
@@ -75,6 +75,58 @@ def test_setup_keep_current_custom_from_config_does_not_fall_through(tmp_path, m
    assert calls["count"] == 1


+def test_setup_custom_endpoint_saves_working_v1_base_url(tmp_path, monkeypatch):
+    """Setup stores the probe's resolved /v1 base URL, not the bare URL typed at the prompt."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _clear_provider_env(monkeypatch)
+
+    config = load_config()
+
+    def fake_prompt_choice(question, choices, default=0):
+        if question == "Configure vision:":
+            return len(choices) - 1  # Skip
+        if question != "Select your inference provider:":
+            raise AssertionError(f"Unexpected prompt_choice call: {question}")
+        return 3  # Custom endpoint
+
+    def fake_prompt(message, current=None, **kwargs):
+        # First matching fragment wins; any other prompt gets an empty answer.
+        canned = (
+            ("API base URL", "http://localhost:8000"),
+            ("API key", "local-key"),
+            ("Model name", "llm"),
+        )
+        return next((reply for needle, reply in canned if needle in message), "")
+
+    probe_result = {
+        "models": ["llm"],
+        "probed_url": "http://localhost:8000/v1/models",
+        "resolved_base_url": "http://localhost:8000/v1",
+        "suggested_base_url": "http://localhost:8000/v1",
+        "used_fallback": True,
+    }
+
+    monkeypatch.setattr("hermes_cli.setup.prompt_choice", fake_prompt_choice)
+    monkeypatch.setattr("hermes_cli.setup.prompt", fake_prompt)
+    monkeypatch.setattr("hermes_cli.setup.prompt_yes_no", lambda *args, **kwargs: False)
+    monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
+    monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
+    monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: [])
+    monkeypatch.setattr("hermes_cli.models.probe_api_models", lambda api_key, base_url: dict(probe_result))
+
+    setup_model_provider(config)
+    save_config(config)
+
+    env = _read_env(tmp_path)
+    reloaded = load_config()
+
+    assert env.get("OPENAI_BASE_URL") == "http://localhost:8000/v1"
+    assert env.get("OPENAI_API_KEY") == "local-key"
+    assert reloaded["model"]["provider"] == "custom"
+    assert reloaded["model"]["base_url"] == "http://localhost:8000/v1"
+    assert reloaded["model"]["default"] == "llm"
+
+
 def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tmp_path, monkeypatch):
    """Keep-current should respect config-backed providers, not fall back to OpenRouter."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
@@ -0,0 +1,29 @@
+from hermes_cli import setup as setup_mod
+
+
+def test_prompt_choice_uses_curses_helper(monkeypatch):
+    """prompt_choice returns the index reported by the curses helper."""
+    monkeypatch.setattr(setup_mod, "_curses_prompt_choice", lambda question, choices, default=0: 1)
+    picked = setup_mod.prompt_choice("Pick one", ["a", "b", "c"], default=0)
+
+    assert picked == 1
+
+
+def test_prompt_choice_falls_back_to_numbered_input(monkeypatch):
+    """When the curses helper returns -1, the 1-based number typed at input() picks the choice."""
+    monkeypatch.setattr(setup_mod, "_curses_prompt_choice", lambda question, choices, default=0: -1)
+    monkeypatch.setattr("builtins.input", lambda _prompt="": "2")
+    picked = setup_mod.prompt_choice("Pick one", ["a", "b", "c"], default=0)
+
+    assert picked == 1
+
+
+def test_prompt_checklist_uses_shared_curses_checklist(monkeypatch):
+    """prompt_checklist returns the picks of curses_ui.curses_checklist as a list."""
+    def stub_checklist(title, items, selected, cancel_returns=None):
+        return {0, 2}
+
+    monkeypatch.setattr("hermes_cli.curses_ui.curses_checklist", stub_checklist)
+    chosen = setup_mod.prompt_checklist("Pick tools", ["one", "two", "three"], pre_selected=[1])
+
+    assert chosen == [0, 2]
@@ -1,6 +1,13 @@
 """Tests for hermes_cli.tools_config platform tool persistence."""

-from hermes_cli.tools_config import _get_platform_tools, _platform_toolset_summary, _toolset_has_keys
+from unittest.mock import patch
+
+from hermes_cli.tools_config import (
+    _get_platform_tools,
+    _platform_toolset_summary,
+    _save_platform_tools,
+    _toolset_has_keys,
+)


 def test_get_platform_tools_uses_default_when_platform_not_configured():
@@ -31,7 +38,7 @@ def test_platform_toolset_summary_uses_explicit_platform_list():
 def test_toolset_has_keys_for_vision_accepts_codex_auth(tmp_path, monkeypatch):
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / "auth.json").write_text(
-        '{"active_provider":"openai-codex","providers":{"openai-codex":{"tokens":{"access_token":"codex-access-token","refresh_token":"codex-refresh-token"}}}}'
+        '{"active_provider":"openai-codex","providers":{"openai-codex":{"tokens":{"access_token": "codex-...oken","refresh_token": "codex-...oken"}}}}'
    )
    monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
    monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
@@ -40,3 +47,56 @@ def test_toolset_has_keys_for_vision_accepts_codex_auth(tmp_path, monkeypatch):
    monkeypatch.delenv("CONTEXT_VISION_PROVIDER", raising=False)

    assert _toolset_has_keys("vision") is True
+
+
+def test_save_platform_tools_preserves_mcp_server_names():
+    """Ensure MCP server names are preserved when saving platform tools.
+
+    Regression test for https://github.com/NousResearch/hermes-agent/issues/1247
+    """
+    config = {
+        "platform_toolsets": {
+            "cli": ["web", "terminal", "time", "github", "custom-mcp-server"]
+        }
+    }
+
+    new_selection = {"web", "browser"}
+
+    with patch("hermes_cli.tools_config.save_config"):
+        _save_platform_tools(config, "cli", new_selection)
+
+    saved_toolsets = config["platform_toolsets"]["cli"]
+
+    assert "time" in saved_toolsets
+    assert "github" in saved_toolsets
+    assert "custom-mcp-server" in saved_toolsets
+    assert "web" in saved_toolsets
+    assert "browser" in saved_toolsets
+    assert "terminal" not in saved_toolsets
+
+
+def test_save_platform_tools_handles_empty_existing_config():
+    """Saving platform tools works when the config has no platform_toolsets key yet."""
+    config = {}
+
+    with patch("hermes_cli.tools_config.save_config"):
+        _save_platform_tools(config, "telegram", {"web", "terminal"})
+
+    saved_toolsets = config["platform_toolsets"]["telegram"]
+    assert "web" in saved_toolsets
+    assert "terminal" in saved_toolsets
+
+
+def test_save_platform_tools_handles_invalid_existing_config():
+    """Saving platform tools works when existing config is not a list."""
+    config = {
+        "platform_toolsets": {
+            "cli": "invalid-string-value"
+        }
+    }
+
+    with patch("hermes_cli.tools_config.save_config"):
+        _save_platform_tools(config, "cli", {"web"})
+
+    saved_toolsets = config["platform_toolsets"]["cli"]
+    assert "web" in saved_toolsets
@@ -134,6 +134,16 @@ def test_restore_stashed_changes_applies_without_prompt_when_disabled(monkeypatc



+def test_print_stash_cleanup_guidance_with_selector(capsys):
+    hermes_main._print_stash_cleanup_guidance("abc123", "stash@{2}")
+
+    out = capsys.readouterr().out
+    assert "Check `git status` first" in out
+    assert "git stash list --format='%gd %H %s'" in out
+    assert "git stash drop stash@{2}" in out
+
+
+
 def test_restore_stashed_changes_keeps_going_when_stash_entry_cannot_be_resolved(monkeypatch, tmp_path, capsys):
    calls = []

@@ -157,6 +167,8 @@ def test_restore_stashed_changes_keeps_going_when_stash_entry_cannot_be_resolved
    out = capsys.readouterr().out
    assert "couldn't find the stash entry to drop" in out
    assert "stash was left in place" in out
+    assert "Check `git status` first" in out
+    assert "git stash list --format='%gd %H %s'" in out
    assert "Look for commit abc123" in out


@@ -183,6 +195,8 @@ def test_restore_stashed_changes_keeps_going_when_drop_fails(monkeypatch, tmp_pa
    out = capsys.readouterr().out
    assert "couldn't drop the saved stash entry" in out
    assert "drop failed" in out
+    assert "Check `git status` first" in out
+    assert "git stash list --format='%gd %H %s'" in out
    assert "git stash drop stash@{0}" in out


@@ -0,0 +1,611 @@
+"""Integration tests for Discord voice channel audio flow.
+
+Uses real NaCl encryption and Opus codec (no mocks for crypto/codec).
+Does NOT require a Discord connection — tests the VoiceReceiver
+packet processing pipeline end-to-end.
+
+Requires: PyNaCl>=1.5.0, discord.py[voice] (opus codec)
+"""
+
+import struct
+import time
+import pytest
+
+pytestmark = pytest.mark.integration
+
+# Skip entire module if voice deps are missing
+pytest.importorskip("nacl.secret", reason="PyNaCl required for voice integration tests")
+discord = pytest.importorskip("discord", reason="discord.py required for voice integration tests")
+
+import nacl.secret
+
+try:
+    if not discord.opus.is_loaded():
+        import ctypes.util
+        opus_path = ctypes.util.find_library("opus")
+        if not opus_path:
+            import sys
+            for p in ("/opt/homebrew/lib/libopus.dylib", "/usr/local/lib/libopus.dylib"):
+                import os
+                if os.path.isfile(p):
+                    opus_path = p
+                    break
+        if opus_path:
+            discord.opus.load_opus(opus_path)
+    OPUS_AVAILABLE = discord.opus.is_loaded()
+except Exception:
+    OPUS_AVAILABLE = False
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+from gateway.platforms.discord import VoiceReceiver
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_secret_key():
+    """Generate a random 32-byte key."""
+    import os
+    return os.urandom(32)
+
+
+def _build_encrypted_rtp_packet(secret_key, opus_payload, ssrc=100, seq=1, timestamp=960):
+    """Build a real NaCl-encrypted RTP packet matching Discord's format.
+
+    Format: RTP header (12 bytes) + encrypted(opus) + 4-byte nonce
+    Encryption: aead_xchacha20_poly1305 with RTP header as AAD.
+    """
+    # RTP header: version=2, payload_type=0x78, no extension, no CSRC
+    header = struct.pack(">BBHII", 0x80, 0x78, seq, timestamp, ssrc)
+
+    # Encrypt with NaCl AEAD
+    box = nacl.secret.Aead(secret_key)
+    nonce_counter = struct.pack(">I", seq)  # 4-byte counter as nonce seed
+    # Full 24-byte nonce: counter in first 4 bytes, rest zeros
+    full_nonce = nonce_counter + b'\x00' * 20
+
+    enc_msg = box.encrypt(opus_payload, header, full_nonce)
+    ciphertext = enc_msg.ciphertext  # without nonce prefix
+
+    # Discord format: header + ciphertext + 4-byte nonce
+    return header + ciphertext + nonce_counter
+
+
+def _make_voice_receiver(secret_key, dave_session=None, bot_ssrc=9999,
+                         allowed_user_ids=None, members=None):
+    """Create and start a VoiceReceiver on a mocked voice client with a real secret key."""
+    vc = MagicMock()
+    vc._connection.secret_key = list(secret_key)
+    vc._connection.dave_session = dave_session
+    vc._connection.ssrc = bot_ssrc
+    vc._connection.add_socket_listener = MagicMock()
+    vc._connection.remove_socket_listener = MagicMock()
+    vc._connection.hook = None
+    vc.user = SimpleNamespace(id=bot_ssrc)
+    vc.channel = MagicMock()
+    vc.channel.members = members or []
+    receiver = VoiceReceiver(vc, allowed_user_ids=allowed_user_ids)
+    receiver.start()
+    return receiver
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestRealNaClDecrypt:
+    """End-to-end: real NaCl encrypt → _on_packet decrypt → buffer."""
+
+    def test_valid_encrypted_packet_buffered(self):
+        """Real NaCl encrypted packet → decrypted → buffered."""
+        key = _make_secret_key()
+        opus_silence = b'\xf8\xff\xfe'
+        receiver = _make_voice_receiver(key)
+
+        packet = _build_encrypted_rtp_packet(key, opus_silence, ssrc=100)
+        receiver._on_packet(packet)
+
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_wrong_key_packet_dropped(self):
+        """Packet encrypted with wrong key → NaCl fails → not buffered."""
+        real_key = _make_secret_key()
+        wrong_key = _make_secret_key()
+        opus_silence = b'\xf8\xff\xfe'
+        receiver = _make_voice_receiver(real_key)
+
+        packet = _build_encrypted_rtp_packet(wrong_key, opus_silence, ssrc=100)
+        receiver._on_packet(packet)
+
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+    def test_bot_ssrc_ignored(self):
+        """Packet from bot's own SSRC → ignored."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key, bot_ssrc=9999)
+
+        packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=9999)
+        receiver._on_packet(packet)
+
+        assert len(receiver._buffers) == 0
+
+    def test_multiple_packets_accumulate(self):
+        """Multiple valid packets → buffer grows."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+
+        for seq in range(1, 6):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        assert 100 in receiver._buffers
+        buf_size = len(receiver._buffers[100])
+        assert buf_size > 0, "Multiple packets should accumulate in buffer"
+
+    def test_different_ssrcs_separate_buffers(self):
+        """Packets from different SSRCs → separate buffers."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+
+        for ssrc in [100, 200, 300]:
+            packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=ssrc)
+            receiver._on_packet(packet)
+
+        assert len(receiver._buffers) == 3
+        for ssrc in [100, 200, 300]:
+            assert ssrc in receiver._buffers
+
+
+class TestRealNaClWithDAVE:
+    """NaCl decrypt + DAVE passthrough scenarios with real crypto."""
+
+    def test_dave_unknown_ssrc_passthrough(self):
+        """DAVE enabled but SSRC unknown → skip DAVE, buffer audio."""
+        key = _make_secret_key()
+        dave = MagicMock()  # DAVE session present but SSRC not mapped
+        receiver = _make_voice_receiver(key, dave_session=dave)
+
+        packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=100)
+        receiver._on_packet(packet)
+
+        # DAVE decrypt not called (SSRC unknown)
+        dave.decrypt.assert_not_called()
+        # Audio still buffered via passthrough
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_dave_unencrypted_error_passthrough(self):
+        """DAVE raises 'Unencrypted' → use NaCl-decrypted data as-is."""
+        key = _make_secret_key()
+        dave = MagicMock()
+        dave.decrypt.side_effect = Exception(
+            "DecryptionFailed(UnencryptedWhenPassthroughDisabled)"
+        )
+        receiver = _make_voice_receiver(key, dave_session=dave)
+        receiver.map_ssrc(100, 42)
+
+        packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=100)
+        receiver._on_packet(packet)
+
+        # DAVE was called but failed → passthrough
+        dave.decrypt.assert_called_once()
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_dave_real_error_drops(self):
+        """DAVE raises non-Unencrypted error → packet dropped."""
+        key = _make_secret_key()
+        dave = MagicMock()
+        dave.decrypt.side_effect = Exception("KeyRotationFailed")
+        receiver = _make_voice_receiver(key, dave_session=dave)
+        receiver.map_ssrc(100, 42)
+
+        packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=100)
+        receiver._on_packet(packet)
+
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+
+class TestFullVoiceFlow:
+    """End-to-end: encrypt → receive → buffer → silence detect → complete."""
+
+    def test_single_utterance_flow(self):
+        """Encrypt packets → buffer → silence → check_silence returns utterance."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(100, 42)
+
+        # Send enough packets to exceed MIN_SPEECH_DURATION (0.5s).
+        # At 48kHz stereo 16-bit, each 960-sample Opus frame decodes to 3840 bytes,
+        # so 0.5s = 96000 bytes = 25 frames; the 29 packets sent below clear that.
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        # Simulate silence by setting last_packet_time in the past
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        user_id, pcm_data = completed[0]
+        assert user_id == 42
+        assert len(pcm_data) > 0
+
+    def test_utterance_with_ssrc_automap(self):
+        """No SPEAKING event → auto-map sole allowed user → utterance processed."""
+        key = _make_secret_key()
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = _make_voice_receiver(
+            key, allowed_user_ids={"42"}, members=members
+        )
+        # No map_ssrc call — simulating missing SPEAKING event
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42  # auto-mapped to sole allowed user
+
+    def test_pause_blocks_during_playback(self):
+        """Pause receiver → packets ignored → resume → packets accepted."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+
+        # Pause (echo prevention during TTS playback)
+        receiver.pause()
+        packet = _build_encrypted_rtp_packet(key, b'\xf8\xff\xfe', ssrc=100)
+        receiver._on_packet(packet)
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+        # Resume
+        receiver.resume()
+        receiver._on_packet(packet)
+        assert 100 in receiver._buffers
+        assert len(receiver._buffers[100]) > 0
+
+    def test_corrupted_packet_ignored(self):
+        """Corrupted/truncated packet → silently ignored."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+
+        # Too short
+        receiver._on_packet(b"\x00" * 5)
+        assert len(receiver._buffers) == 0
+
+        # Wrong RTP version
+        bad_header = struct.pack(">BBHII", 0x00, 0x78, 1, 960, 100)
+        receiver._on_packet(bad_header + b"\x00" * 20)
+        assert len(receiver._buffers) == 0
+
+        # Wrong payload type
+        bad_pt = struct.pack(">BBHII", 0x80, 0x00, 1, 960, 100)
+        receiver._on_packet(bad_pt + b"\x00" * 20)
+        assert len(receiver._buffers) == 0
+
+    def test_stop_cleans_everything(self):
+        """stop() clears all state cleanly."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(100, 42)
+
+        for seq in range(1, 10):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        assert len(receiver._buffers[100]) > 0
+
+        receiver.stop()
+        assert receiver._running is False
+        assert len(receiver._buffers) == 0
+        assert len(receiver._ssrc_to_user) == 0
+        assert len(receiver._decoders) == 0
+
+
+class TestSPEAKINGHook:
+    """SPEAKING event hook correctly maps SSRC to user_id."""
+
+    def test_speaking_hook_installed(self):
+        """start() installs speaking hook on connection."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        conn = receiver._vc._connection
+        # hook should be set (wrapped)
+        assert conn.hook is not None
+
+    def test_map_ssrc_via_speaking(self):
+        """map_ssrc() stores the SSRC → user_id mapping (called directly; no SPEAKING event is sent)."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(500, 12345)
+        assert receiver._ssrc_to_user[500] == 12345
+
+    def test_map_ssrc_overwrites(self):
+        """A second map_ssrc() call for the same SSRC overwrites the old mapping."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(500, 111)
+        receiver.map_ssrc(500, 222)
+        assert receiver._ssrc_to_user[500] == 222
+
+    def test_speaking_mapped_audio_processed(self):
+        """After SSRC is mapped, audio from that SSRC gets correct user_id."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(100, 42)
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+
+class TestAuthFiltering:
+    """Only allowed users' audio should be processed."""
+
+    def test_allowed_user_audio_processed(self):
+        """Allowed user's utterance is returned by check_silence."""
+        key = _make_secret_key()
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = _make_voice_receiver(
+            key, allowed_user_ids={"42"}, members=members,
+        )
+        receiver.map_ssrc(100, 42)
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+    def test_automap_rejects_unallowed_user(self):
+        """Auto-map refuses to map SSRC to user not in allowed list."""
+        key = _make_secret_key()
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = _make_voice_receiver(
+            key, allowed_user_ids={"99"},  # Alice not allowed
+            members=members,
+        )
+        # No map_ssrc — SSRC unknown, auto-map should reject
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        assert len(completed) == 0
+
+    def test_empty_allowlist_allows_all(self):
+        """allowed_user_ids=None means no restriction; SSRC auto-maps to the sole non-bot member."""
+        key = _make_secret_key()
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+        receiver = _make_voice_receiver(
+            key, allowed_user_ids=None, members=members,
+        )
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        # Auto-mapped to sole non-bot member
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+
+class TestRejoinFlow:
+    """Leave and rejoin: state cleanup and fresh receiver."""
+
+    def test_stop_then_new_receiver_clean_state(self):
+        """After stop(), a new receiver starts with empty state."""
+        key = _make_secret_key()
+        receiver1 = _make_voice_receiver(key)
+        receiver1.map_ssrc(100, 42)
+
+        for seq in range(1, 10):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver1._on_packet(packet)
+
+        assert len(receiver1._buffers[100]) > 0
+        receiver1.stop()
+
+        # New receiver (simulates rejoin)
+        receiver2 = _make_voice_receiver(key)
+        assert len(receiver2._buffers) == 0
+        assert len(receiver2._ssrc_to_user) == 0
+        assert len(receiver2._decoders) == 0
+
+    def test_rejoin_new_ssrc_works(self):
+        """After rejoin, user may get new SSRC — still works."""
+        key = _make_secret_key()
+        receiver1 = _make_voice_receiver(key)
+        receiver1.map_ssrc(100, 42)  # old SSRC
+        receiver1.stop()
+
+        receiver2 = _make_voice_receiver(key)
+        receiver2.map_ssrc(200, 42)  # new SSRC after rejoin
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=200, seq=seq, timestamp=960 * seq
+            )
+            receiver2._on_packet(packet)
+
+        receiver2._last_packet_time[200] = time.monotonic() - 3.0
+        completed = receiver2.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+    def test_rejoin_without_speaking_event_automap(self):
+        """Rejoin without SPEAKING event — auto-map sole allowed user."""
+        key = _make_secret_key()
+        members = [
+            SimpleNamespace(id=9999, name="Bot"),
+            SimpleNamespace(id=42, name="Alice"),
+        ]
+
+        # First session
+        receiver1 = _make_voice_receiver(
+            key, allowed_user_ids={"42"}, members=members,
+        )
+        receiver1.stop()
+
+        # Rejoin — new key (Discord may assign new secret_key)
+        new_key = _make_secret_key()
+        receiver2 = _make_voice_receiver(
+            new_key, allowed_user_ids={"42"}, members=members,
+        )
+        # No map_ssrc — simulating missing SPEAKING event
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                new_key, b'\xf8\xff\xfe', ssrc=300, seq=seq, timestamp=960 * seq
+            )
+            receiver2._on_packet(packet)
+
+        receiver2._last_packet_time[300] = time.monotonic() - 3.0
+        completed = receiver2.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
+
+
+class TestMultiGuildIsolation:
+    """Each guild has independent voice state."""
+
+    def test_separate_receivers_independent(self):
+        """Two receivers (different guilds) don't interfere."""
+        key1 = _make_secret_key()
+        key2 = _make_secret_key()
+
+        receiver1 = _make_voice_receiver(key1, bot_ssrc=1111)
+        receiver2 = _make_voice_receiver(key2, bot_ssrc=2222)
+
+        receiver1.map_ssrc(100, 42)
+        receiver2.map_ssrc(200, 99)
+
+        # Send to receiver1
+        for seq in range(1, 10):
+            packet = _build_encrypted_rtp_packet(
+                key1, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver1._on_packet(packet)
+
+        # receiver2 should be empty
+        assert len(receiver2._buffers) == 0
+        assert 100 in receiver1._buffers
+
+    def test_stop_one_doesnt_affect_other(self):
+        """Stopping one receiver doesn't affect another."""
+        key1 = _make_secret_key()
+        key2 = _make_secret_key()
+
+        receiver1 = _make_voice_receiver(key1)
+        receiver2 = _make_voice_receiver(key2)
+
+        receiver1.map_ssrc(100, 42)
+        receiver2.map_ssrc(200, 99)
+
+        for seq in range(1, 10):
+            packet = _build_encrypted_rtp_packet(
+                key2, b'\xf8\xff\xfe', ssrc=200, seq=seq, timestamp=960 * seq
+            )
+            receiver2._on_packet(packet)
+
+        receiver1.stop()
+
+        # receiver2 still has data
+        assert receiver2._running is True
+        assert len(receiver2._buffers[200]) > 0
+
+
+class TestEchoPreventionFlow:
+    """Receiver pause/resume during TTS playback prevents echo."""
+
+    def test_audio_during_pause_ignored(self):
+        """Audio arriving while paused is completely ignored."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(100, 42)
+        receiver.pause()
+
+        for seq in range(1, 30):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+    def test_audio_after_resume_processed(self):
+        """Audio arriving after resume is processed normally."""
+        key = _make_secret_key()
+        receiver = _make_voice_receiver(key)
+        receiver.map_ssrc(100, 42)
+
+        # Pause → send packets → resume → send more packets
+        receiver.pause()
+        for seq in range(1, 5):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+        assert len(receiver._buffers.get(100, b"")) == 0
+
+        receiver.resume()
+        for seq in range(5, 35):
+            packet = _build_encrypted_rtp_packet(
+                key, b'\xf8\xff\xfe', ssrc=100, seq=seq, timestamp=960 * seq
+            )
+            receiver._on_packet(packet)
+
+        assert len(receiver._buffers[100]) > 0
+        receiver._last_packet_time[100] = time.monotonic() - 3.0
+        completed = receiver.check_silence()
+        assert len(completed) == 1
+        assert completed[0][0] == 42
@@ -16,126 +16,131 @@ from run_agent import AIAgent, IterationBudget
 from tools.delegate_tool import _run_single_child
 from tools.interrupt import set_interrupt, is_interrupted

-set_interrupt(False)
+def main() -> int:
+    set_interrupt(False)

-# Create parent agent (minimal)
-parent = AIAgent.__new__(AIAgent)
-parent._interrupt_requested = False
-parent._interrupt_message = None
-parent._active_children = []
-parent.quiet_mode = True
-parent.model = "test/model"
-parent.base_url = "http://localhost:1"
-parent.api_key = "test"
-parent.provider = "test"
-parent.api_mode = "chat_completions"
-parent.platform = "cli"
-parent.enabled_toolsets = ["terminal", "file"]
-parent.providers_allowed = None
-parent.providers_ignored = None
-parent.providers_order = None
-parent.provider_sort = None
-parent.max_tokens = None
-parent.reasoning_config = None
-parent.prefill_messages = None
-parent._session_db = None
-parent._delegate_depth = 0
-parent._delegate_spinner = None
-parent.tool_progress_callback = None
-parent.iteration_budget = IterationBudget(max_total=100)
-parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}
+    # Create parent agent (minimal)
+    parent = AIAgent.__new__(AIAgent)
+    parent._interrupt_requested = False
+    parent._interrupt_message = None
+    parent._active_children = []
+    parent.quiet_mode = True
+    parent.model = "test/model"
+    parent.base_url = "http://localhost:1"
+    parent.api_key = "test"
+    parent.provider = "test"
+    parent.api_mode = "chat_completions"
+    parent.platform = "cli"
+    parent.enabled_toolsets = ["terminal", "file"]
+    parent.providers_allowed = None
+    parent.providers_ignored = None
+    parent.providers_order = None
+    parent.provider_sort = None
+    parent.max_tokens = None
+    parent.reasoning_config = None
+    parent.prefill_messages = None
+    parent._session_db = None
+    parent._delegate_depth = 0
+    parent._delegate_spinner = None
+    parent.tool_progress_callback = None
+    parent.iteration_budget = IterationBudget(max_total=100)
+    parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}

-child_started = threading.Event()
-result_holder = [None]
+    child_started = threading.Event()
+    result_holder = [None]

+    def run_delegate():
+        with patch("run_agent.OpenAI") as MockOpenAI:
+            mock_client = MagicMock()

-def run_delegate():
-    with patch("run_agent.OpenAI") as MockOpenAI:
-        mock_client = MagicMock()
+            def slow_create(**kwargs):
+                time.sleep(3)
+                resp = MagicMock()
+                resp.choices = [MagicMock()]
+                resp.choices[0].message.content = "Done"
+                resp.choices[0].message.tool_calls = None
+                resp.choices[0].message.refusal = None
+                resp.choices[0].finish_reason = "stop"
+                resp.usage.prompt_tokens = 100
+                resp.usage.completion_tokens = 10
+                resp.usage.total_tokens = 110
+                resp.usage.prompt_tokens_details = None
+                return resp

-        def slow_create(**kwargs):
-            time.sleep(3)
-            resp = MagicMock()
-            resp.choices = [MagicMock()]
-            resp.choices[0].message.content = "Done"
-            resp.choices[0].message.tool_calls = None
-            resp.choices[0].message.refusal = None
-            resp.choices[0].finish_reason = "stop"
-            resp.usage.prompt_tokens = 100
-            resp.usage.completion_tokens = 10
-            resp.usage.total_tokens = 110
-            resp.usage.prompt_tokens_details = None
-            return resp
+            mock_client.chat.completions.create = slow_create
+            mock_client.close = MagicMock()
+            MockOpenAI.return_value = mock_client

-        mock_client.chat.completions.create = slow_create
-        mock_client.close = MagicMock()
-        MockOpenAI.return_value = mock_client
+            original_init = AIAgent.__init__

-        original_init = AIAgent.__init__
+            def patched_init(self_agent, *a, **kw):
+                original_init(self_agent, *a, **kw)
+                child_started.set()

-        def patched_init(self_agent, *a, **kw):
-            original_init(self_agent, *a, **kw)
-            child_started.set()
+            with patch.object(AIAgent, "__init__", patched_init):
+                try:
+                    result = _run_single_child(
+                        task_index=0,
+                        goal="Test slow task",
+                        context=None,
+                        toolsets=["terminal"],
+                        model="test/model",
+                        max_iterations=5,
+                        parent_agent=parent,
+                        task_count=1,
+                        override_provider="test",
+                        override_base_url="http://localhost:1",
+                        override_api_key="test",
+                        override_api_mode="chat_completions",
+                    )
+                    result_holder[0] = result
+                except Exception as e:
+                    print(f"ERROR in delegate: {e}")
+                    import traceback
+                    traceback.print_exc()

-        with patch.object(AIAgent, "__init__", patched_init):
-            try:
-                result = _run_single_child(
-                    task_index=0,
-                    goal="Test slow task",
-                    context=None,
-                    toolsets=["terminal"],
-                    model="test/model",
-                    max_iterations=5,
-                    parent_agent=parent,
-                    task_count=1,
-                    override_provider="test",
-                    override_base_url="http://localhost:1",
-                    override_api_key="test",
-                    override_api_mode="chat_completions",
-                )
-                result_holder[0] = result
-            except Exception as e:
-                print(f"ERROR in delegate: {e}")
-                import traceback
-                traceback.print_exc()
+    print("Starting agent thread...")
+    agent_thread = threading.Thread(target=run_delegate, daemon=True)
+    agent_thread.start()

+    started = child_started.wait(timeout=10)
+    if not started:
+        print("ERROR: Child never started")
+        set_interrupt(False)
+        return 1

-print("Starting agent thread...")
-agent_thread = threading.Thread(target=run_delegate, daemon=True)
-agent_thread.start()
+    time.sleep(0.5)

-started = child_started.wait(timeout=10)
-if not started:
-    print("ERROR: Child never started")
-    sys.exit(1)
+    print(f"Active children: {len(parent._active_children)}")
+    for i, c in enumerate(parent._active_children):
+        print(f"  Child {i}: _interrupt_requested={c._interrupt_requested}")

-time.sleep(0.5)
+    t0 = time.monotonic()
+    parent.interrupt("User typed a new message")
+    print("Called parent.interrupt()")

-print(f"Active children: {len(parent._active_children)}")
-for i, c in enumerate(parent._active_children):
-    print(f"  Child {i}: _interrupt_requested={c._interrupt_requested}")
+    for i, c in enumerate(parent._active_children):
+        print(f"  Child {i} after interrupt: _interrupt_requested={c._interrupt_requested}")
+    print(f"Global is_interrupted: {is_interrupted()}")

-t0 = time.monotonic()
-parent.interrupt("User typed a new message")
-print(f"Called parent.interrupt()")
+    agent_thread.join(timeout=10)
+    elapsed = time.monotonic() - t0
+    print(f"Agent thread finished in {elapsed:.2f}s")

-for i, c in enumerate(parent._active_children):
-    print(f"  Child {i} after interrupt: _interrupt_requested={c._interrupt_requested}")
-print(f"Global is_interrupted: {is_interrupted()}")
-
-agent_thread.join(timeout=10)
-elapsed = time.monotonic() - t0
-print(f"Agent thread finished in {elapsed:.2f}s")
-
-result = result_holder[0]
-if result:
-    print(f"Status: {result['status']}")
-    print(f"Duration: {result['duration_seconds']}s")
-    if elapsed < 2.0:
-        print("✅ PASS: Interrupt detected quickly!")
+    result = result_holder[0]
+    if result:
+        print(f"Status: {result['status']}")
+        print(f"Duration: {result['duration_seconds']}s")
+        if elapsed < 2.0:
+            print("✅ PASS: Interrupt detected quickly!")
+        else:
+            print(f"❌ FAIL: Took {elapsed:.2f}s — interrupt was too slow or not detected")
    else:
-        print(f"❌ FAIL: Took {elapsed:.2f}s — interrupt was too slow or not detected")
-else:
-    print("❌ FAIL: No result!")
+        print("❌ FAIL: No result!")

-set_interrupt(False)
+    set_interrupt(False)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -495,6 +495,59 @@ class TestConvertMessages:
        assert len(result) == 1
        assert result[0]["role"] == "user"

+    def test_converts_user_image_url_blocks_to_anthropic_image_blocks(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Can you see this?"},
+                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+                ],
+            }
+        ]
+
+        _, result = convert_messages_to_anthropic(messages)
+
+        assert result == [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Can you see this?"},
+                    {"type": "image", "source": {"type": "url", "url": "https://example.com/cat.png"}},
+                ],
+            }
+        ]
+
+    def test_converts_data_url_image_blocks_to_base64_anthropic_image_blocks(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_text", "text": "What is in this screenshot?"},
+                    {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},
+                ],
+            }
+        ]
+
+        _, result = convert_messages_to_anthropic(messages)
+
+        assert result == [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is in this screenshot?"},
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": "AAAA",
+                        },
+                    },
+                ],
+            }
+        ]
+
    def test_converts_tool_calls(self):
        messages = [
            {
@@ -68,6 +68,22 @@ class TestAtomicJsonWrite:
        tmp_files = [f for f in tmp_path.iterdir() if ".tmp" in f.name]
        assert len(tmp_files) == 0

+    def test_cleans_up_temp_file_on_baseexception(self, tmp_path):
+        class SimulatedAbort(BaseException):
+            pass
+
+        target = tmp_path / "data.json"
+        original = {"preserved": True}
+        target.write_text(json.dumps(original), encoding="utf-8")
+
+        with patch("utils.json.dump", side_effect=SimulatedAbort):
+            with pytest.raises(SimulatedAbort):
+                atomic_json_write(target, {"new": True})
+
+        tmp_files = [f for f in tmp_path.iterdir() if ".tmp" in f.name]
+        assert len(tmp_files) == 0
+        assert json.loads(target.read_text(encoding="utf-8")) == original
+
    def test_accepts_string_path(self, tmp_path):
        target = str(tmp_path / "string_path.json")
        atomic_json_write(target, {"string": True})
@@ -0,0 +1,44 @@
+"""Tests for utils.atomic_yaml_write — crash-safe YAML file writes."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+import yaml
+
+from utils import atomic_yaml_write
+
+
+class TestAtomicYamlWrite:
+    def test_writes_valid_yaml(self, tmp_path):
+        target = tmp_path / "data.yaml"
+        data = {"key": "value", "nested": {"a": 1}}
+
+        atomic_yaml_write(target, data)
+
+        assert yaml.safe_load(target.read_text(encoding="utf-8")) == data
+
+    def test_cleans_up_temp_file_on_baseexception(self, tmp_path):
+        class SimulatedAbort(BaseException):
+            pass
+
+        target = tmp_path / "data.yaml"
+        original = {"preserved": True}
+        target.write_text(yaml.safe_dump(original), encoding="utf-8")
+
+        with patch("utils.yaml.dump", side_effect=SimulatedAbort):
+            with pytest.raises(SimulatedAbort):
+                atomic_yaml_write(target, {"new": True})
+
+        tmp_files = [f for f in tmp_path.iterdir() if ".tmp" in f.name]
+        assert len(tmp_files) == 0
+        assert yaml.safe_load(target.read_text(encoding="utf-8")) == original
+
+    def test_appends_extra_content(self, tmp_path):
+        target = tmp_path / "data.yaml"
+
+        atomic_yaml_write(target, {"key": "value"}, extra_content="\n# comment\n")
+
+        text = target.read_text(encoding="utf-8")
+        assert "key: value" in text
+        assert "# comment" in text
@@ -0,0 +1,103 @@
+"""Tests for automatic MCP reload when config.yaml mcp_servers section changes."""
+import time
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+
+def _make_cli(tmp_path, mcp_servers=None):
+    """Return ``(cli, cfg_file)``: a HermesCLI built without ``__init__`` and a config.yaml in *tmp_path*."""
+    import cli as cli_mod
+    obj = object.__new__(cli_mod.HermesCLI)  # bypass __init__; only the attributes set below exist
+    obj.config = {"mcp_servers": mcp_servers or {}}
+    obj._agent_running = False
+    obj._last_config_check = 0.0  # presumably "never checked", so the throttle does not skip — confirm
+    obj._config_mcp_servers = mcp_servers or {}
+
+    cfg_file = tmp_path / "config.yaml"
+    cfg_file.write_text("mcp_servers: {}\n")
+    obj._config_mtime = cfg_file.stat().st_mtime  # in sync with the file; tests set 0.0 to fake an edit
+
+    obj._reload_mcp = MagicMock()  # mocked so tests can assert whether a reload happened
+    obj._busy_command = MagicMock()
+    obj._busy_command.return_value.__enter__ = MagicMock(return_value=None)
+    obj._busy_command.return_value.__exit__ = MagicMock(return_value=False)  # False: do not swallow exceptions
+    obj._slow_command_status = MagicMock(return_value="reloading...")
+
+    return obj, cfg_file
+
+
+class TestMCPConfigWatch:
+
+    def test_no_change_does_not_reload(self, tmp_path):
+        """If mtime and mcp_servers unchanged, _reload_mcp is NOT called."""
+        obj, cfg_file = _make_cli(tmp_path)
+
+        with patch("hermes_cli.config.get_config_path", return_value=cfg_file):
+            obj._check_config_mcp_changes()
+
+        obj._reload_mcp.assert_not_called()
+
+    def test_mtime_change_with_same_mcp_servers_does_not_reload(self, tmp_path):
+        """If file mtime changes but mcp_servers is identical, no reload."""
+        import yaml
+        obj, cfg_file = _make_cli(tmp_path, mcp_servers={"fs": {"command": "npx"}})
+
+        # Write same mcp_servers but touch the file
+        cfg_file.write_text(yaml.dump({"mcp_servers": {"fs": {"command": "npx"}}}))
+        # Force mtime to appear changed
+        obj._config_mtime = 0.0
+
+        with patch("hermes_cli.config.get_config_path", return_value=cfg_file):
+            obj._check_config_mcp_changes()
+
+        obj._reload_mcp.assert_not_called()
+
+    def test_new_mcp_server_triggers_reload(self, tmp_path):
+        """Adding a new MCP server to config triggers auto-reload."""
+        import yaml
+        obj, cfg_file = _make_cli(tmp_path, mcp_servers={})
+
+        # Simulate user adding a new MCP server to config.yaml
+        cfg_file.write_text(yaml.dump({"mcp_servers": {"github": {"url": "https://mcp.github.com"}}}))
+        obj._config_mtime = 0.0  # force stale mtime
+
+        with patch("hermes_cli.config.get_config_path", return_value=cfg_file):
+            obj._check_config_mcp_changes()
+
+        obj._reload_mcp.assert_called_once()
+
+    def test_removed_mcp_server_triggers_reload(self, tmp_path):
+        """Removing an MCP server from config triggers auto-reload."""
+        import yaml
+        obj, cfg_file = _make_cli(tmp_path, mcp_servers={"github": {"url": "https://mcp.github.com"}})
+
+        # Simulate user removing the server
+        cfg_file.write_text(yaml.dump({"mcp_servers": {}}))
+        obj._config_mtime = 0.0
+
+        with patch("hermes_cli.config.get_config_path", return_value=cfg_file):
+            obj._check_config_mcp_changes()
+
+        obj._reload_mcp.assert_called_once()
+
+    def test_interval_throttle_skips_check(self, tmp_path):
+        """If called within CONFIG_WATCH_INTERVAL, stat() is skipped."""
+        obj, cfg_file = _make_cli(tmp_path)
+        obj._last_config_check = time.monotonic()  # just checked
+
+        with patch("hermes_cli.config.get_config_path", return_value=cfg_file), \
+             patch.object(Path, "stat") as mock_stat:
+            obj._check_config_mcp_changes()
+            mock_stat.assert_not_called()
+
+        obj._reload_mcp.assert_not_called()
+
+    def test_missing_config_file_does_not_crash(self, tmp_path):
+        """If config.yaml doesn't exist, _check_config_mcp_changes is a no-op."""
+        obj, cfg_file = _make_cli(tmp_path)
+        missing = tmp_path / "nonexistent.yaml"
+
+        with patch("hermes_cli.config.get_config_path", return_value=missing):
+            obj._check_config_mcp_changes()  # should not raise
+
+        obj._reload_mcp.assert_not_called()
@@ -64,8 +64,8 @@ class TestModelCommand:
            cli_obj.process_command("/model gpt-5.4")

        output = capsys.readouterr().out
-        # Model is accepted (with warning) even if not in API listing
-        assert cli_obj.model == "gpt-5.4"
+        # Auto-detection remaps bare model names to proper OpenRouter slugs
+        assert cli_obj.model == "openai/gpt-5.4"

    def test_validation_crash_falls_back_to_save(self, capsys):
        cli_obj = self._make_cli()
@@ -162,6 +162,57 @@ def test_runtime_resolution_rebuilds_agent_on_routing_change(monkeypatch):
    assert shell.api_mode == "codex_responses"


+def test_cli_turn_routing_uses_primary_when_disabled(monkeypatch):
+    cli = _import_cli()
+    shell = cli.HermesCLI(model="gpt-5", compact=True, max_turns=1)
+    shell.provider = "openrouter"
+    shell.api_mode = "chat_completions"
+    shell.base_url = "https://openrouter.ai/api/v1"
+    shell.api_key = "sk-primary"
+    shell._smart_model_routing = {"enabled": False}
+
+    result = shell._resolve_turn_agent_config("what time is it in tokyo?")
+
+    assert result["model"] == "gpt-5"
+    assert result["runtime"]["provider"] == "openrouter"
+    assert result["label"] is None
+
+
+def test_cli_turn_routing_uses_cheap_model_when_simple(monkeypatch):
+    cli = _import_cli()
+
+    def _runtime_resolve(**kwargs):
+        assert kwargs["requested"] == "zai"
+        return {
+            "provider": "zai",
+            "api_mode": "chat_completions",
+            "base_url": "https://open.z.ai/api/v1",
+            "api_key": "cheap-key",
+            "source": "env/config",
+        }
+
+    monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve)
+
+    shell = cli.HermesCLI(model="anthropic/claude-sonnet-4", compact=True, max_turns=1)
+    shell.provider = "openrouter"
+    shell.api_mode = "chat_completions"
+    shell.base_url = "https://openrouter.ai/api/v1"
+    shell.api_key = "primary-key"
+    shell._smart_model_routing = {
+        "enabled": True,
+        "cheap_model": {"provider": "zai", "model": "glm-5-air"},
+        "max_simple_chars": 160,
+        "max_simple_words": 28,
+    }
+
+    result = shell._resolve_turn_agent_config("what time is it in tokyo?")
+
+    assert result["model"] == "glm-5-air"
+    assert result["runtime"]["provider"] == "zai"
+    assert result["runtime"]["api_key"] == "cheap-key"
+    assert result["label"] is not None
+
+
 def test_cli_prefers_config_provider_over_stale_env_override(monkeypatch):
    cli = _import_cli()

@@ -336,4 +387,42 @@ def test_cmd_model_falls_back_to_auto_on_invalid_provider(monkeypatch, capsys):

    assert "Warning:" in output
    assert "falling back to auto provider detection" in output.lower()
-    assert "No change." in output
+    assert "No change." in output
+
+
+def test_model_flow_custom_saves_verified_v1_base_url(monkeypatch, capsys):
+    """The custom-model flow saves the probe's working ``/v1`` URL, not the URL as typed."""
+    # Blank environment: every lookup (the OPENAI_* keys included) returns "".
+    monkeypatch.setattr("hermes_cli.config.get_env_value", lambda key: "")
+    # Capture what the flow persists so it can be asserted on below.
+    saved_env = {}
+    monkeypatch.setattr("hermes_cli.config.save_env_value", lambda key, value: saved_env.__setitem__(key, value))
+    monkeypatch.setattr("hermes_cli.auth._save_model_choice", lambda model: saved_env.__setitem__("MODEL", model))
+    monkeypatch.setattr("hermes_cli.auth.deactivate_provider", lambda: None)
+    monkeypatch.setattr("hermes_cli.main._save_custom_provider", lambda *args, **kwargs: None)
+    monkeypatch.setattr(
+        "hermes_cli.models.probe_api_models",
+        lambda api_key, base_url: {
+            "models": ["llm"],
+            "probed_url": "http://localhost:8000/v1/models",
+            "resolved_base_url": "http://localhost:8000/v1",
+            "suggested_base_url": "http://localhost:8000/v1",
+            "used_fallback": True,
+        },
+    )
+    monkeypatch.setattr(
+        "hermes_cli.config.load_config",
+        lambda: {"model": {"default": "", "provider": "custom", "base_url": ""}},
+    )
+    monkeypatch.setattr("hermes_cli.config.save_config", lambda cfg: None)
+
+    answers = iter(["http://localhost:8000", "local-key", "llm"])
+    monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers))
+
+    hermes_main._model_flow_custom({})
+    output = capsys.readouterr().out
+
+    assert "Saving the working base URL instead" in output
+    assert saved_env["OPENAI_BASE_URL"] == "http://localhost:8000/v1"
+    assert saved_env["OPENAI_API_KEY"] == "local-key"
+    assert saved_env["MODEL"] == "llm"
@@ -0,0 +1,160 @@
+from datetime import datetime, timedelta
+from types import SimpleNamespace
+
+from cli import HermesCLI
+
+
+def _make_cli(model: str = "anthropic/claude-sonnet-4-20250514"):
+    cli_obj = HermesCLI.__new__(HermesCLI)
+    cli_obj.model = model
+    cli_obj.session_start = datetime.now() - timedelta(minutes=14, seconds=32)
+    cli_obj.conversation_history = [{"role": "user", "content": "hi"}]
+    cli_obj.agent = None
+    return cli_obj
+
+
+def _attach_agent(
+    cli_obj,
+    *,
+    prompt_tokens: int,
+    completion_tokens: int,
+    total_tokens: int,
+    api_calls: int,
+    context_tokens: int,
+    context_length: int,
+    compressions: int = 0,
+):
+    """Set ``cli_obj.agent`` to a stub carrying the given session/context counters; return ``cli_obj``."""
+    compressor = SimpleNamespace(
+        last_prompt_tokens=context_tokens, context_length=context_length, compression_count=compressions
+    )
+    cli_obj.agent = SimpleNamespace(
+        model=cli_obj.model,
+        session_prompt_tokens=prompt_tokens,
+        session_completion_tokens=completion_tokens,
+        session_total_tokens=total_tokens,
+        session_api_calls=api_calls,
+        context_compressor=compressor,
+    )
+    return cli_obj
+
+
+class TestCLIStatusBar:
+    def test_context_style_thresholds(self):
+        cli_obj = _make_cli()
+
+        assert cli_obj._status_bar_context_style(None) == "class:status-bar-dim"
+        assert cli_obj._status_bar_context_style(10) == "class:status-bar-good"
+        assert cli_obj._status_bar_context_style(50) == "class:status-bar-warn"
+        assert cli_obj._status_bar_context_style(81) == "class:status-bar-bad"
+        assert cli_obj._status_bar_context_style(95) == "class:status-bar-critical"
+
+    def test_build_status_bar_text_for_wide_terminal(self):
+        cli_obj = _attach_agent(
+            _make_cli(),
+            prompt_tokens=10_230,
+            completion_tokens=2_220,
+            total_tokens=12_450,
+            api_calls=7,
+            context_tokens=12_450,
+            context_length=200_000,
+        )
+
+        text = cli_obj._build_status_bar_text(width=120)
+
+        assert "claude-sonnet-4-20250514" in text
+        assert "12.4K/200K" in text
+        assert "6%" in text
+        assert "$0.06" in text
+        assert "15m" in text
+
+    def test_build_status_bar_text_collapses_for_narrow_terminal(self):
+        cli_obj = _attach_agent(
+            _make_cli(),
+            prompt_tokens=10_230,
+            completion_tokens=2_220,
+            total_tokens=12_450,
+            api_calls=7,
+            context_tokens=12_450,
+            context_length=200_000,
+        )
+
+        text = cli_obj._build_status_bar_text(width=60)
+
+        assert "⚕" in text
+        assert "$0.06" in text
+        assert "15m" in text
+        assert "200K" not in text
+
+    def test_build_status_bar_text_handles_missing_agent(self):
+        cli_obj = _make_cli()
+
+        text = cli_obj._build_status_bar_text(width=100)
+
+        assert "⚕" in text
+        assert "claude-sonnet-4-20250514" in text
+
+
+class TestCLIUsageReport:
+    def test_show_usage_includes_estimated_cost(self, capsys):
+        cli_obj = _attach_agent(
+            _make_cli(),
+            prompt_tokens=10_230,
+            completion_tokens=2_220,
+            total_tokens=12_450,
+            api_calls=7,
+            context_tokens=12_450,
+            context_length=200_000,
+            compressions=1,
+        )
+        cli_obj.verbose = False
+
+        cli_obj._show_usage()
+        output = capsys.readouterr().out
+
+        assert "Model:" in output
+        assert "Input cost:" in output
+        assert "Output cost:" in output
+        assert "Total cost:" in output
+        assert "$" in output
+        assert "0.064" in output
+        assert "Session duration:" in output
+        assert "Compressions:" in output
+
+    def test_show_usage_marks_unknown_pricing(self, capsys):
+        cli_obj = _attach_agent(
+            _make_cli(model="local/my-custom-model"),
+            prompt_tokens=1_000,
+            completion_tokens=500,
+            total_tokens=1_500,
+            api_calls=1,
+            context_tokens=1_000,
+            context_length=32_000,
+        )
+        cli_obj.verbose = False
+
+        cli_obj._show_usage()
+        output = capsys.readouterr().out
+
+        assert "Total cost:" in output
+        assert "n/a" in output
+        assert "Pricing unknown for local/my-custom-model" in output
+
+    def test_zero_priced_provider_models_stay_unknown(self, capsys):
+        cli_obj = _attach_agent(
+            _make_cli(model="glm-5"),
+            prompt_tokens=1_000,
+            completion_tokens=500,
+            total_tokens=1_500,
+            api_calls=1,
+            context_tokens=1_000,
+            context_length=32_000,
+        )
+        cli_obj.verbose = False
+
+        cli_obj._show_usage()
+        output = capsys.readouterr().out
+
+        assert "Total cost:" in output
+        assert "n/a" in output
+        assert "Pricing unknown for glm-5" in output
@@ -0,0 +1,72 @@
+import json
+from types import SimpleNamespace
+
+
+def _tool_call(name: str, arguments):
+    """Return a fake function tool call (fixed id ``call_1``) for *name* and *arguments*."""
+    fn = SimpleNamespace(name=name, arguments=arguments)
+    call = SimpleNamespace(id="call_1", type="function")
+    call.function = fn
+    return call
+
+
+def _response_with_tool_call(arguments):
+    """Return a fake response whose single choice requests ``read_file`` with *arguments*."""
+    calls = [_tool_call("read_file", arguments)]
+    message = SimpleNamespace(content=None, reasoning=None, tool_calls=calls)
+    # One choice only; usage is deliberately left as None.
+    return SimpleNamespace(
+        choices=[SimpleNamespace(message=message, finish_reason="tool_calls")], usage=None
+    )
+
+
+class _FakeChatCompletions:
+    """Scripted ``create``: the first call requests a tool, every later call answers "done"."""
+
+    def __init__(self):
+        self.calls = 0
+
+    def create(self, **kwargs):
+        """Count the call and return the scripted response; *kwargs* are ignored."""
+        self.calls += 1
+        first_call = self.calls == 1
+        if first_call:
+            # The tool arguments are passed as a dict here, not as a JSON string.
+            return _response_with_tool_call({"path": "README.md"})
+        final = SimpleNamespace(content="done", reasoning=None, tool_calls=[])
+        return SimpleNamespace(
+            choices=[SimpleNamespace(message=final, finish_reason="stop")], usage=None
+        )
+
+
+class _FakeClient:  # stand-in object patched over run_agent.OpenAI by the test in this file
+    def __init__(self):
+        self.chat = SimpleNamespace(completions=_FakeChatCompletions())  # gives chat.completions.create
+
+
+def test_tool_call_validation_accepts_dict_arguments(monkeypatch):
+    from run_agent import AIAgent
+
+    monkeypatch.setattr("run_agent.OpenAI", lambda **kwargs: _FakeClient())
+    monkeypatch.setattr(
+        "run_agent.get_tool_definitions",
+        lambda *args, **kwargs: [{"function": {"name": "read_file"}}],
+    )
+    monkeypatch.setattr(
+        "run_agent.handle_function_call",
+        lambda name, args, task_id=None, **kwargs: json.dumps({"ok": True, "args": args}),
+    )
+
+    agent = AIAgent(
+        model="test-model",
+        api_key="test-key",
+        base_url="http://localhost:8080/v1",
+        platform="cli",
+        max_iterations=3,
+        quiet_mode=True,
+        skip_memory=True,
+    )
+
+    result = agent.run_conversation("read the file")
+
+    assert result["final_response"] == "done"
@@ -0,0 +1,186 @@
+import os
+import json
+import pytest
+from pathlib import Path
+import importlib.util
+
+# evidence-store.py has a hyphen in its file name, so a plain `import` cannot name it; load it by path.
+repo_root = Path(__file__).parent.parent  # assumes this test file sits one directory below the repo root
+script_path = repo_root / "optional-skills" / "security" / "oss-forensics" / "scripts" / "evidence-store.py"
+
+spec = importlib.util.spec_from_file_location("evidence_store", str(script_path))
+evidence_store = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(evidence_store)  # executes the script's module-level code at import time
+EvidenceStore = evidence_store.EvidenceStore
+
+
+def test_evidence_store_init(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+    assert store.filepath == str(store_file)
+    assert len(store.data["evidence"]) == 0
+    assert "metadata" in store.data
+    assert store.data["metadata"]["version"] == "2.0"
+    assert "chain_of_custody" in store.data
+
+
+def test_evidence_store_add(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    eid = store.add(
+        source="test_source",
+        content="test_content",
+        evidence_type="git",
+        actor="test_actor",
+        notes="test_notes",
+    )
+
+    assert eid == "EV-0001"
+    assert len(store.data["evidence"]) == 1
+    assert store.data["evidence"][0]["content"] == "test_content"
+    assert store.data["evidence"][0]["id"] == "EV-0001"
+    assert store.data["evidence"][0]["actor"] == "test_actor"
+    assert store.data["evidence"][0]["notes"] == "test_notes"
+    # Verify SHA-256 was computed
+    assert store.data["evidence"][0]["content_sha256"] is not None
+    assert len(store.data["evidence"][0]["content_sha256"]) == 64
+
+
+def test_evidence_store_add_persists(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+    store.add(source="s1", content="c1", evidence_type="git")
+
+    # Reload from disk
+    store2 = EvidenceStore(str(store_file))
+    assert len(store2.data["evidence"]) == 1
+    assert store2.data["evidence"][0]["id"] == "EV-0001"
+
+
+def test_evidence_store_sequential_ids(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    eid1 = store.add(source="s1", content="c1", evidence_type="git")
+    eid2 = store.add(source="s2", content="c2", evidence_type="gh_api")
+    eid3 = store.add(source="s3", content="c3", evidence_type="ioc")
+
+    assert eid1 == "EV-0001"
+    assert eid2 == "EV-0002"
+    assert eid3 == "EV-0003"
+
+
+def test_evidence_store_list(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="s1", content="c1", evidence_type="git", actor="a1")
+    store.add(source="s2", content="c2", evidence_type="gh_api", actor="a2")
+
+    all_evidence = store.list_evidence()
+    assert len(all_evidence) == 2
+
+    git_evidence = store.list_evidence(filter_type="git")
+    assert len(git_evidence) == 1
+    assert git_evidence[0]["actor"] == "a1"
+
+    actor_evidence = store.list_evidence(filter_actor="a2")
+    assert len(actor_evidence) == 1
+    assert actor_evidence[0]["type"] == "gh_api"
+
+
+def test_evidence_store_verify_integrity(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="s1", content="c1", evidence_type="git")
+    assert len(store.verify_integrity()) == 0
+
+    # Manually corrupt the content to trigger a hash mismatch
+    store.data["evidence"][0]["content"] = "corrupted_content"
+    issues = store.verify_integrity()
+    assert len(issues) == 1
+    assert issues[0]["id"] == "EV-0001"
+
+
+def test_evidence_store_query(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="github_api", content="malicious activity detected", evidence_type="gh_api")
+    store.add(source="manual", content="clean observation", evidence_type="manual")
+
+    results = store.query("malicious")
+    assert len(results) == 1
+    assert results[0]["source"] == "github_api"
+
+    # Query should be case-insensitive
+    results = store.query("MALICIOUS")
+    assert len(results) == 1
+
+
+def test_evidence_store_query_searches_multiple_fields(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="git_fsck", content="dangling commit abc123", evidence_type="git", actor="attacker")
+    store.add(source="manual", content="clean", evidence_type="manual")
+
+    # Search by source
+    assert len(store.query("fsck")) == 1
+    # Search by actor
+    assert len(store.query("attacker")) == 1
+    # Search returns nothing for non-matching
+    assert len(store.query("nonexistent")) == 0
+
+
+def test_evidence_store_chain_of_custody(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="s1", content="c1", evidence_type="git")
+    store.add(source="s2", content="c2", evidence_type="gh_api")
+
+    chain = store.data["chain_of_custody"]
+    assert len(chain) == 2
+    assert chain[0]["evidence_id"] == "EV-0001"
+    assert chain[0]["action"] == "add"
+    assert chain[1]["evidence_id"] == "EV-0002"
+
+
+def test_evidence_store_export_markdown(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="git_log", content="suspicious commit", evidence_type="git", actor="actor1")
+
+    md = store.export_markdown()
+    assert "# Evidence Registry" in md
+    assert "EV-0001" in md
+    assert "Chain of Custody" in md
+    assert "actor1" in md
+
+
+def test_evidence_store_summary(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store = EvidenceStore(str(store_file))
+
+    store.add(source="s1", content="c1", evidence_type="git", actor="a1")
+    store.add(source="s2", content="c2", evidence_type="git", actor="a2")
+    store.add(source="s3", content="c3", evidence_type="gh_api", actor="a1")
+
+    s = store.summary()
+    assert s["total"] == 3
+    assert s["by_type"]["git"] == 2
+    assert s["by_type"]["gh_api"] == 1
+    assert "a1" in s["unique_actors"]
+    assert "a2" in s["unique_actors"]
+
+
+def test_evidence_store_corrupted_file(tmp_path):
+    store_file = tmp_path / "test_evidence.json"
+    store_file.write_text("NOT VALID JSON {{{")
+
+    with pytest.raises(SystemExit):
+        EvidenceStore(str(store_file))
@@ -361,6 +361,24 @@ class TestDeleteAndExport:
    def test_delete_nonexistent(self, db):
        assert db.delete_session("nope") is False

+    def test_resolve_session_id_exact(self, db):
+        db.create_session(session_id="20260315_092437_c9a6ff", source="cli")
+        assert db.resolve_session_id("20260315_092437_c9a6ff") == "20260315_092437_c9a6ff"
+
+    def test_resolve_session_id_unique_prefix(self, db):
+        db.create_session(session_id="20260315_092437_c9a6ff", source="cli")
+        assert db.resolve_session_id("20260315_092437_c9a6") == "20260315_092437_c9a6ff"
+
+    def test_resolve_session_id_ambiguous_prefix_returns_none(self, db):
+        db.create_session(session_id="20260315_092437_c9a6aa", source="cli")
+        db.create_session(session_id="20260315_092437_c9a6bb", source="cli")
+        assert db.resolve_session_id("20260315_092437_c9a6") is None
+
+    def test_resolve_session_id_escapes_like_wildcards(self, db):
+        db.create_session(session_id="20260315_092437_c9a6ff", source="cli")
+        db.create_session(session_id="20260315X092437_c9a6ff", source="cli")
+        assert db.resolve_session_id("20260315_092437") == "20260315_092437_c9a6ff"
+
    def test_export_session(self, db):
        db.create_session(session_id="s1", source="cli", model="test")
        db.append_message("s1", role="user", content="Hello")
@@ -206,6 +206,7 @@ class TestHasKnownPricing:
    def test_unknown_custom_model(self):
        assert _has_known_pricing("FP16_Hermes_4.5") is False
        assert _has_known_pricing("my-custom-model") is False
+        assert _has_known_pricing("glm-5") is False
        assert _has_known_pricing("") is False
        assert _has_known_pricing(None) is False

@@ -29,51 +29,6 @@ from unittest.mock import MagicMock, patch
 from run_agent import AIAgent, IterationBudget
 from tools.interrupt import set_interrupt, is_interrupted

-set_interrupt(False)
-
-# ─── Create parent agent ───
-parent = AIAgent.__new__(AIAgent)
-parent._interrupt_requested = False
-parent._interrupt_message = None
-parent._active_children = []
-parent.quiet_mode = True
-parent.model = "test/model"
-parent.base_url = "http://localhost:1"
-parent.api_key = "test"
-parent.provider = "test"
-parent.api_mode = "chat_completions"
-parent.platform = "cli"
-parent.enabled_toolsets = ["terminal", "file"]
-parent.providers_allowed = None
-parent.providers_ignored = None
-parent.providers_order = None
-parent.provider_sort = None
-parent.max_tokens = None
-parent.reasoning_config = None
-parent.prefill_messages = None
-parent._session_db = None
-parent._delegate_depth = 0
-parent._delegate_spinner = None
-parent.tool_progress_callback = None
-parent.iteration_budget = IterationBudget(max_total=100)
-parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}
-
-# Monkey-patch parent.interrupt to log
-_original_interrupt = AIAgent.interrupt
-def logged_interrupt(self, message=None):
-    log.info(f"🔴 parent.interrupt() called with: {message!r}")
-    log.info(f"   _active_children count: {len(self._active_children)}")
-    _original_interrupt(self, message)
-    log.info(f"   After interrupt: _interrupt_requested={self._interrupt_requested}")
-    for i, c in enumerate(self._active_children):
-        log.info(f"   Child {i}._interrupt_requested={c._interrupt_requested}")
-parent.interrupt = lambda msg=None: logged_interrupt(parent, msg)
-
-# ─── Simulate the exact CLI flow ───
-interrupt_queue = queue.Queue()
-child_running = threading.Event()
-agent_result = [None]
-
 def make_slow_response(delay=2.0):
    """API response that takes a while."""
    def create(**kwargs):
@@ -94,96 +49,154 @@ def make_slow_response(delay=2.0):
    return create


-def agent_thread_func():
-    """Simulates the agent_thread in cli.py's chat() method."""
-    log.info("🟢 agent_thread starting")
+def main() -> int:
+    set_interrupt(False)

-    with patch("run_agent.OpenAI") as MockOpenAI:
-        mock_client = MagicMock()
-        mock_client.chat.completions.create = make_slow_response(delay=3.0)
-        mock_client.close = MagicMock()
-        MockOpenAI.return_value = mock_client
+    # ─── Create parent agent ───
+    parent = AIAgent.__new__(AIAgent)
+    parent._interrupt_requested = False
+    parent._interrupt_message = None
+    parent._active_children = []
+    parent.quiet_mode = True
+    parent.model = "test/model"
+    parent.base_url = "http://localhost:1"
+    parent.api_key = "test"
+    parent.provider = "test"
+    parent.api_mode = "chat_completions"
+    parent.platform = "cli"
+    parent.enabled_toolsets = ["terminal", "file"]
+    parent.providers_allowed = None
+    parent.providers_ignored = None
+    parent.providers_order = None
+    parent.provider_sort = None
+    parent.max_tokens = None
+    parent.reasoning_config = None
+    parent.prefill_messages = None
+    parent._session_db = None
+    parent._delegate_depth = 0
+    parent._delegate_spinner = None
+    parent.tool_progress_callback = None
+    parent.iteration_budget = IterationBudget(max_total=100)
+    parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}

-        from tools.delegate_tool import _run_single_child
+    # Monkey-patch parent.interrupt to log
+    _original_interrupt = AIAgent.interrupt

-        # Signal that child is about to start
-        original_init = AIAgent.__init__
-        def patched_init(self_agent, *a, **kw):
-            log.info("🟡 Child AIAgent.__init__ called")
-            original_init(self_agent, *a, **kw)
-            child_running.set()
-            log.info(f"🟡 Child started, parent._active_children = {len(parent._active_children)}")
+    def logged_interrupt(self, message=None):
+        log.info(f"🔴 parent.interrupt() called with: {message!r}")
+        log.info(f"   _active_children count: {len(self._active_children)}")
+        _original_interrupt(self, message)
+        log.info(f"   After interrupt: _interrupt_requested={self._interrupt_requested}")
+        for i, child in enumerate(self._active_children):
+            log.info(f"   Child {i}._interrupt_requested={child._interrupt_requested}")

-        with patch.object(AIAgent, "__init__", patched_init):
-            result = _run_single_child(
-                task_index=0,
-                goal="Do a slow thing",
-                context=None,
-                toolsets=["terminal"],
-                model="test/model",
-                max_iterations=3,
-                parent_agent=parent,
-                task_count=1,
-                override_provider="test",
-                override_base_url="http://localhost:1",
-                override_api_key="test",
-                override_api_mode="chat_completions",
-            )
-            agent_result[0] = result
-            log.info(f"🟢 agent_thread finished. Result status: {result.get('status')}")
+    parent.interrupt = lambda msg=None: logged_interrupt(parent, msg)

+    # ─── Simulate the exact CLI flow ───
+    interrupt_queue = queue.Queue()
+    child_running = threading.Event()
+    agent_result = [None]

-# ─── Start agent thread (like chat() does) ───
-agent_thread = threading.Thread(target=agent_thread_func, name="agent_thread", daemon=True)
-agent_thread.start()
+    def agent_thread_func():
+        """Simulates the agent_thread in cli.py's chat() method."""
+        log.info("🟢 agent_thread starting")

-# ─── Wait for child to start ───
-if not child_running.wait(timeout=10):
-    print("FAIL: Child never started", file=sys.stderr)
-    sys.exit(1)
+        with patch("run_agent.OpenAI") as MockOpenAI:
+            mock_client = MagicMock()
+            mock_client.chat.completions.create = make_slow_response(delay=3.0)
+            mock_client.close = MagicMock()
+            MockOpenAI.return_value = mock_client

-# Give child time to enter its main loop and start API call
-time.sleep(1.0)
+            from tools.delegate_tool import _run_single_child

-# ─── Simulate user typing a message (like handle_enter does) ───
-log.info("📝 Simulating user typing 'Hey stop that'")
-interrupt_queue.put("Hey stop that")
+            # Signal that child is about to start
+            original_init = AIAgent.__init__

-# ─── Simulate chat() polling loop (like the real chat() method) ───
-log.info("📡 Starting interrupt queue polling (like chat())")
-interrupt_msg = None
-poll_count = 0
-while agent_thread.is_alive():
-    try:
-        interrupt_msg = interrupt_queue.get(timeout=0.1)
-        if interrupt_msg:
-            log.info(f"📨 Got interrupt message from queue: {interrupt_msg!r}")
-            log.info(f"   Calling parent.interrupt()...")
-            parent.interrupt(interrupt_msg)
-            log.info(f"   parent.interrupt() returned. Breaking poll loop.")
-            break
-    except queue.Empty:
-        poll_count += 1
-        if poll_count % 20 == 0:  # Log every 2s
-            log.info(f"   Still polling ({poll_count} iterations)...")
+            def patched_init(self_agent, *a, **kw):
+                log.info("🟡 Child AIAgent.__init__ called")
+                original_init(self_agent, *a, **kw)
+                child_running.set()
+                log.info(
+                    f"🟡 Child started, parent._active_children = {len(parent._active_children)}"
+                )

-# ─── Wait for agent to finish ───
-log.info("⏳ Waiting for agent_thread to join...")
-t0 = time.monotonic()
-agent_thread.join(timeout=10)
-elapsed = time.monotonic() - t0
-log.info(f"✅ agent_thread joined after {elapsed:.2f}s")
+            with patch.object(AIAgent, "__init__", patched_init):
+                result = _run_single_child(
+                    task_index=0,
+                    goal="Do a slow thing",
+                    context=None,
+                    toolsets=["terminal"],
+                    model="test/model",
+                    max_iterations=3,
+                    parent_agent=parent,
+                    task_count=1,
+                    override_provider="test",
+                    override_base_url="http://localhost:1",
+                    override_api_key="test",
+                    override_api_mode="chat_completions",
+                )
+                agent_result[0] = result
+                log.info(f"🟢 agent_thread finished. Result status: {result.get('status')}")

-# ─── Check results ───
-result = agent_result[0]
-if result:
-    log.info(f"Result status: {result['status']}")
-    log.info(f"Result duration: {result['duration_seconds']}s")
-    if result["status"] == "interrupted" and elapsed < 2.0:
-        print("✅ PASS: Interrupt worked correctly!", file=sys.stderr)
-    else:
+    # ─── Start agent thread (like chat() does) ───
+    agent_thread = threading.Thread(target=agent_thread_func, name="agent_thread", daemon=True)
+    agent_thread.start()
+
+    # ─── Wait for child to start ───
+    if not child_running.wait(timeout=10):
+        print("FAIL: Child never started", file=sys.stderr)
+        set_interrupt(False)
+        return 1
+
+    # Give child time to enter its main loop and start API call
+    time.sleep(1.0)
+
+    # ─── Simulate user typing a message (like handle_enter does) ───
+    log.info("📝 Simulating user typing 'Hey stop that'")
+    interrupt_queue.put("Hey stop that")
+
+    # ─── Simulate chat() polling loop (like the real chat() method) ───
+    log.info("📡 Starting interrupt queue polling (like chat())")
+    interrupt_msg = None
+    poll_count = 0
+    while agent_thread.is_alive():
+        try:
+            interrupt_msg = interrupt_queue.get(timeout=0.1)
+            if interrupt_msg:
+                log.info(f"📨 Got interrupt message from queue: {interrupt_msg!r}")
+                log.info("   Calling parent.interrupt()...")
+                parent.interrupt(interrupt_msg)
+                log.info("   parent.interrupt() returned. Breaking poll loop.")
+                break
+        except queue.Empty:
+            poll_count += 1
+            if poll_count % 20 == 0:  # Log every 2s
+                log.info(f"   Still polling ({poll_count} iterations)...")
+
+    # ─── Wait for agent to finish ───
+    log.info("⏳ Waiting for agent_thread to join...")
+    t0 = time.monotonic()
+    agent_thread.join(timeout=10)
+    elapsed = time.monotonic() - t0
+    log.info(f"✅ agent_thread joined after {elapsed:.2f}s")
+
+    # ─── Check results ───
+    result = agent_result[0]
+    if result:
+        log.info(f"Result status: {result['status']}")
+        log.info(f"Result duration: {result['duration_seconds']}s")
+        if result["status"] == "interrupted" and elapsed < 2.0:
+            print("✅ PASS: Interrupt worked correctly!", file=sys.stderr)
+            set_interrupt(False)
+            return 0
        print(f"❌ FAIL: status={result['status']}, elapsed={elapsed:.2f}s", file=sys.stderr)
-else:
-    print("❌ FAIL: No result returned", file=sys.stderr)
+        set_interrupt(False)
+        return 1

-set_interrupt(False)
+    print("❌ FAIL: No result returned", file=sys.stderr)
+    set_interrupt(False)
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -145,8 +145,9 @@ def test_concurrent_requests_do_not_break_each_other_when_one_client_closes(monk
    thread_one.join(timeout=5)
    thread_two.join(timeout=5)

-    assert isinstance(results["first"], APIConnectionError)
-    assert results["second"] == {"ok": "second"}
+    values = list(results.values())
+    assert sum(isinstance(value, APIConnectionError) for value in values) == 1
+    assert values.count({"ok": "second"}) == 1
    assert len(factory.calls) == 2


@@ -543,7 +543,7 @@ class TestAuxiliaryClientProviderPriority:
             patch("agent.auxiliary_client._read_codex_access_token", return_value="codex-tok"), \
             patch("agent.auxiliary_client.OpenAI"):
            client, model = get_text_auxiliary_client()
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
        assert isinstance(client, CodexAuxiliaryClient)


@@ -12,7 +12,7 @@ import uuid
 from logging.handlers import RotatingFileHandler
 from pathlib import Path
 from types import SimpleNamespace
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch

 import pytest

@@ -612,6 +612,25 @@ class TestBuildApiKwargs:
        kwargs = agent._build_api_kwargs(messages)
        assert kwargs["extra_body"]["reasoning"] == {"enabled": False}

+    def test_reasoning_not_sent_for_unsupported_openrouter_model(self, agent):
+        """The built request must carry no `reasoning` entry in extra_body for this model."""
+        agent.model = "minimax/minimax-m2.5"
+        api_kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
+        extra_body = api_kwargs.get("extra_body", {})
+        assert "reasoning" not in extra_body
+
+    def test_reasoning_sent_for_supported_openrouter_model(self, agent):
+        """For this model the request keeps a reasoning entry with "medium" effort."""
+        agent.model = "qwen/qwen3.5-plus-02-15"
+        api_kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
+        reasoning = api_kwargs["extra_body"]["reasoning"]
+        assert reasoning["effort"] == "medium"
+
+    def test_reasoning_sent_for_nous_route(self, agent):
+        """With the Nous inference base URL the request carries "medium" reasoning effort."""
+        agent.base_url = "https://inference-api.nousresearch.com/v1"
+        agent.model = "minimax/minimax-m2.5"
+        api_kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
+        reasoning = api_kwargs["extra_body"]["reasoning"]
+        assert reasoning["effort"] == "medium"
+
    def test_max_tokens_injected(self, agent):
        agent.max_tokens = 4096
        messages = [{"role": "user", "content": "hi"}]
@@ -911,8 +930,10 @@ class TestConcurrentToolExecution:
            mock_hfc.assert_called_once_with(
                "web_search", {"q": "test"}, "task-1",
                enabled_tools=list(agent.valid_tool_names),
+                honcho_manager=None,
+                honcho_session_key=None,
            )
-        assert result == "result"
+            assert result == "result"

    def test_invoke_tool_handles_agent_level_tools(self, agent):
        """_invoke_tool should handle todo tool directly."""
@@ -942,6 +963,19 @@ class TestHandleMaxIterations:
        assert "error" in result.lower()
        assert "API down" in result

+    def test_summary_skips_reasoning_for_unsupported_openrouter_model(self, agent):
+        """The max-iterations summary request must not include `reasoning` for this model."""
+        agent.model = "minimax/minimax-m2.5"
+        agent.client.chat.completions.create.return_value = _mock_response(content="Summary")
+        agent._cached_system_prompt = "You are helpful."
+
+        summary = agent._handle_max_iterations([{"role": "user", "content": "do stuff"}], 60)
+
+        assert summary == "Summary"
+        # Inspect the keyword arguments of the completion call that produced the summary.
+        sent = agent.client.chat.completions.create.call_args.kwargs
+        assert "reasoning" not in sent.get("extra_body", {})
+

 class TestRunConversation:
    """Tests for the main run_conversation method.
@@ -1552,6 +1586,38 @@ class TestSystemPromptStability:
        should_prefetch = not conversation_history
        assert should_prefetch is True

+    def test_run_conversation_can_skip_honcho_sync_for_synthetic_turns(self, agent):
+        """`sync_honcho=False` completes the turn without Honcho sync or prefetch calls."""
+        seen_kwargs = {}
+
+        def _record_and_reply(api_kwargs):
+            # Keep the request payload so the final user message can be inspected.
+            seen_kwargs.update(api_kwargs)
+            return _mock_response(content="done", finish_reason="stop")
+
+        honcho_config = SimpleNamespace(
+            ai_peer="hermes",
+            memory_mode="hybrid",
+            write_frequency="async",
+            recall_mode="hybrid",
+        )
+        agent._honcho = MagicMock()
+        agent._honcho_session_key = "session-1"
+        agent._honcho_config = honcho_config
+        agent._use_prompt_caching = False
+
+        with (
+            patch.object(agent, "_honcho_sync") as sync_mock,
+            patch.object(agent, "_queue_honcho_prefetch") as prefetch_mock,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+            patch.object(agent, "_interruptible_api_call", side_effect=_record_and_reply),
+        ):
+            outcome = agent.run_conversation("synthetic flush turn", sync_honcho=False)
+
+        assert outcome["completed"] is True
+        last_message = seen_kwargs["messages"][-1]
+        assert last_message["content"] == "synthetic flush turn"
+        sync_mock.assert_not_called()
+        prefetch_mock.assert_not_called()
+

 class TestHonchoActivation:
    def test_disabled_config_skips_honcho_init(self):
@@ -1986,6 +2052,69 @@ class TestBuildApiKwargsAnthropicMaxTokens:
                assert call_args[0][3] is None


+class TestAnthropicImageFallback:
+    """Image parts in user messages are turned into text before the Anthropic adapter runs."""
+
+    def test_build_api_kwargs_converts_multimodal_user_image_to_text(self, agent):
+        """A text+image user message reaches the adapter as one string containing the analysis."""
+        agent.api_mode = "anthropic_messages"
+        agent.reasoning_config = None
+
+        api_messages = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Can you see this now?"},
+                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+            ],
+        }]
+
+        vision_reply = json.dumps({"success": True, "analysis": "A cat sitting on a chair."})
+        with (
+            patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=vision_reply)),
+            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
+        ):
+            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
+            agent._build_api_kwargs(api_messages)
+
+        # Merge positional and keyword arguments so the lookup works whichever
+        # way (or mix of ways) the adapter was invoked; keywords win on overlap.
+        positional_names = ["model", "messages", "tools", "max_tokens", "reasoning_config"]
+        call = mock_build.call_args
+        kwargs = {**dict(zip(positional_names, call.args)), **call.kwargs}
+        transformed = kwargs["messages"]
+        assert isinstance(transformed[0]["content"], str)
+        assert "A cat sitting on a chair." in transformed[0]["content"]
+        assert "Can you see this now?" in transformed[0]["content"]
+        assert "vision_analyze with image_url: https://example.com/cat.png" in transformed[0]["content"]
+
+    def test_build_api_kwargs_reuses_cached_image_analysis_for_duplicate_images(self, agent):
+        """Two messages sharing one data URL trigger exactly one vision analysis await."""
+        agent.api_mode = "anthropic_messages"
+        agent.reasoning_config = None
+        data_url = "data:image/png;base64,QUFBQQ=="
+
+        api_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "first"},
+                    {"type": "input_image", "image_url": data_url},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "second"},
+                    {"type": "input_image", "image_url": data_url},
+                ],
+            },
+        ]
+
+        mock_vision = AsyncMock(return_value=json.dumps({"success": True, "analysis": "A small test image."}))
+        with (
+            patch("tools.vision_tools.vision_analyze_tool", new=mock_vision),
+            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
+        ):
+            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
+            agent._build_api_kwargs(api_messages)
+
+        assert mock_vision.await_count == 1
+
+
 class TestFallbackAnthropicProvider:
    """Bug fix: _try_activate_fallback had no case for anthropic provider."""

@@ -1,8 +1,10 @@
 """Tests for tools/checkpoint_manager.py — CheckpointManager."""

+import logging
 import os
 import json
 import shutil
+import subprocess
 import pytest
 from pathlib import Path
 from unittest.mock import patch
@@ -143,6 +145,12 @@ class TestTakeCheckpoint:
        result = mgr.ensure_checkpoint(str(work_dir), "initial")
        assert result is True

+    def test_successful_checkpoint_does_not_log_expected_diff_exit(self, mgr, work_dir, caplog):
+        """A successful checkpoint leaves no captured record mentioning the staged-diff command."""
+        with caplog.at_level(logging.ERROR, logger="tools.checkpoint_manager"):
+            created = mgr.ensure_checkpoint(str(work_dir), "initial")
+        assert created is True
+        logged = [record.getMessage() for record in caplog.records]
+        assert all("diff --cached --quiet" not in message for message in logged)
+
    def test_dedup_same_turn(self, mgr, work_dir):
        r1 = mgr.ensure_checkpoint(str(work_dir), "first")
        r2 = mgr.ensure_checkpoint(str(work_dir), "second")
@@ -375,6 +383,26 @@ class TestErrorResilience:
        result = mgr.ensure_checkpoint(str(work_dir), "test")
        assert result is False

+    def test_run_git_allows_expected_nonzero_without_error_log(self, tmp_path, caplog):
+        """An allow-listed exit code 1 yields ok=False and leaves the captured log empty."""
+        fake_result = subprocess.CompletedProcess(
+            args=["git", "diff", "--cached", "--quiet"],
+            returncode=1,
+            stdout="",
+            stderr="",
+        )
+        run_patch = patch("tools.checkpoint_manager.subprocess.run", return_value=fake_result)
+        with run_patch, caplog.at_level(logging.ERROR, logger="tools.checkpoint_manager"):
+            outcome = _run_git(
+                ["diff", "--cached", "--quiet"],
+                tmp_path / "shadow",
+                str(tmp_path / "work"),
+                allowed_returncodes={1},
+            )
+        ok, stdout, stderr = outcome
+        assert ok is False
+        assert stdout == ""
+        assert stderr == ""
+        assert not caplog.records
+
    def test_checkpoint_failure_does_not_raise(self, mgr, work_dir, monkeypatch):
        """Checkpoint failures should never raise — they're silently logged."""
        def broken_run_git(*args, **kwargs):
@@ -153,6 +153,36 @@ class TestScheduleCronjob:
        assert job["provider"] == "custom"
        assert job["base_url"] == "http://127.0.0.1:4000/v1"

+    def test_thread_id_captured_in_origin(self, monkeypatch):
+        """HERMES_SESSION_THREAD_ID ends up as `origin["thread_id"]` on the stored job."""
+        session_env = {
+            "HERMES_SESSION_PLATFORM": "telegram",
+            "HERMES_SESSION_CHAT_ID": "123456",
+            "HERMES_SESSION_THREAD_ID": "42",
+        }
+        for name, value in session_env.items():
+            monkeypatch.setenv(name, value)
+        import cron.jobs as _jobs
+
+        response = schedule_cronjob(prompt="Thread test", schedule="every 1h", deliver="origin")
+        created = json.loads(response)
+        assert created["success"] is True
+        stored_job = _jobs.get_job(created["job_id"])
+        assert stored_job["origin"]["thread_id"] == "42"
+
+    def test_thread_id_absent_when_not_set(self, monkeypatch):
+        """Without HERMES_SESSION_THREAD_ID the stored origin has no `thread_id` value."""
+        for name, value in (
+            ("HERMES_SESSION_PLATFORM", "telegram"),
+            ("HERMES_SESSION_CHAT_ID", "123456"),
+        ):
+            monkeypatch.setenv(name, value)
+        monkeypatch.delenv("HERMES_SESSION_THREAD_ID", raising=False)
+        import cron.jobs as _jobs
+
+        response = schedule_cronjob(prompt="No thread test", schedule="every 1h", deliver="origin")
+        created = json.loads(response)
+        assert created["success"] is True
+        stored_job = _jobs.get_job(created["job_id"])
+        assert stored_job["origin"].get("thread_id") is None
+

 # =========================================================================
 # list_cronjobs
@@ -1,11 +1,31 @@
 import logging
 import subprocess
+import sys
+import types

 import pytest

 from tools.environments import docker as docker_env


+def _install_fake_minisweagent(monkeypatch, captured_run_args):
+    """Register a stub ``minisweagent.environments.docker`` hierarchy in ``sys.modules``.
+
+    The stub ``DockerEnvironment`` appends the ``run_args`` keyword it is
+    constructed with to ``captured_run_args`` so a test can inspect them.
+    Entries are installed through ``monkeypatch.setitem``.
+    """
+    class MockInnerDocker:
+        container_id = "fake-container"
+        config = type("Config", (), {"executable": "/usr/bin/docker", "forward_env": [], "env": {}})()
+
+        def __init__(self, **kwargs):
+            # Tolerate an explicit ``run_args=None`` as well as a missing key.
+            captured_run_args.extend(kwargs.get("run_args") or [])
+
+    minisweagent_mod = types.ModuleType("minisweagent")
+    environments_mod = types.ModuleType("minisweagent.environments")
+    docker_mod = types.ModuleType("minisweagent.environments.docker")
+    docker_mod.DockerEnvironment = MockInnerDocker
+
+    # Mark the parents as packages and link each child as an attribute so that
+    # attribute-style access (``minisweagent.environments.docker``) also resolves.
+    minisweagent_mod.__path__ = []
+    environments_mod.__path__ = []
+    minisweagent_mod.environments = environments_mod
+    environments_mod.docker = docker_mod
+
+    monkeypatch.setitem(sys.modules, "minisweagent", minisweagent_mod)
+    monkeypatch.setitem(sys.modules, "minisweagent.environments", environments_mod)
+    monkeypatch.setitem(sys.modules, "minisweagent.environments.docker", docker_mod)
+
+
 def _make_dummy_env(**kwargs):
    """Helper to construct DockerEnvironment with minimal required args."""
    return docker_env.DockerEnvironment(
@@ -19,6 +39,8 @@ def _make_dummy_env(**kwargs):
        task_id=kwargs.get("task_id", "test-task"),
        volumes=kwargs.get("volumes", []),
        network=kwargs.get("network", True),
+        host_cwd=kwargs.get("host_cwd"),
+        auto_mount_cwd=kwargs.get("auto_mount_cwd", False),
    )


@@ -86,3 +108,106 @@ def test_ensure_docker_available_uses_resolved_executable(monkeypatch):
        })
    ]

+
+def test_auto_mount_host_cwd_adds_volume(monkeypatch, tmp_path):
+    """With auto_mount_cwd=True the run args contain a ``<host cwd>:/workspace`` bind."""
+    host_project = tmp_path / "my-project"
+    host_project.mkdir()
+
+    def _fake_docker_run(*args, **kwargs):
+        return subprocess.CompletedProcess(args[0], 0, stdout="Docker version", stderr="")
+
+    recorded_args = []
+    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
+    monkeypatch.setattr(docker_env.subprocess, "run", _fake_docker_run)
+    _install_fake_minisweagent(monkeypatch, recorded_args)
+
+    _make_dummy_env(cwd="/workspace", host_cwd=str(host_project), auto_mount_cwd=True)
+
+    expected_bind = f"{host_project}:/workspace"
+    assert expected_bind in " ".join(recorded_args)
+
+
+def test_auto_mount_disabled_by_default(monkeypatch, tmp_path):
+    """With auto_mount_cwd=False no ``<host cwd>:/workspace`` bind shows up in the run args."""
+    host_project = tmp_path / "my-project"
+    host_project.mkdir()
+
+    def _fake_docker_run(*args, **kwargs):
+        return subprocess.CompletedProcess(args[0], 0, stdout="Docker version", stderr="")
+
+    recorded_args = []
+    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
+    monkeypatch.setattr(docker_env.subprocess, "run", _fake_docker_run)
+    _install_fake_minisweagent(monkeypatch, recorded_args)
+
+    _make_dummy_env(cwd="/root", host_cwd=str(host_project), auto_mount_cwd=False)
+
+    unwanted_bind = f"{host_project}:/workspace"
+    assert unwanted_bind not in " ".join(recorded_args)
+
+
+def test_auto_mount_skipped_when_workspace_already_mounted(monkeypatch, tmp_path):
+    """A user-supplied /workspace volume stays the only /workspace bind in the run args."""
+    host_project = tmp_path / "my-project"
+    host_project.mkdir()
+    user_volume_dir = tmp_path / "other"
+    user_volume_dir.mkdir()
+
+    def _fake_docker_run(*args, **kwargs):
+        return subprocess.CompletedProcess(args[0], 0, stdout="Docker version", stderr="")
+
+    recorded_args = []
+    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
+    monkeypatch.setattr(docker_env.subprocess, "run", _fake_docker_run)
+    _install_fake_minisweagent(monkeypatch, recorded_args)
+
+    _make_dummy_env(
+        cwd="/workspace",
+        host_cwd=str(host_project),
+        auto_mount_cwd=True,
+        volumes=[f"{user_volume_dir}:/workspace"],
+    )
+
+    joined = " ".join(recorded_args)
+    assert f"{user_volume_dir}:/workspace" in joined
+    # Exactly one bind may target /workspace: the explicit user volume.
+    assert joined.count(":/workspace") == 1
+
+
+def test_auto_mount_replaces_persistent_workspace_bind(monkeypatch, tmp_path):
+    """In persistent mode the host cwd bind is present and the sandbox workspace bind is not."""
+    host_project = tmp_path / "my-project"
+    host_project.mkdir()
+
+    def _fake_docker_run(*args, **kwargs):
+        return subprocess.CompletedProcess(args[0], 0, stdout="Docker version", stderr="")
+
+    recorded_args = []
+    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
+    monkeypatch.setattr(docker_env.subprocess, "run", _fake_docker_run)
+    _install_fake_minisweagent(monkeypatch, recorded_args)
+
+    _make_dummy_env(
+        cwd="/workspace",
+        persistent_filesystem=True,
+        host_cwd=str(host_project),
+        auto_mount_cwd=True,
+        task_id="test-persistent-auto-mount",
+    )
+
+    joined = " ".join(recorded_args)
+    sandbox_bind = "/sandboxes/docker/test-persistent-auto-mount/workspace:/workspace"
+    assert f"{host_project}:/workspace" in joined
+    assert sandbox_bind not in joined
+
@@ -5,6 +5,7 @@ handling without requiring a running terminal environment.
 """

 import json
+import logging
 from unittest.mock import MagicMock, patch

 from tools.file_tools import (
@@ -87,13 +88,26 @@ class TestWriteFileHandler:
        mock_ops.write_file.assert_called_once_with("/tmp/out.txt", "hello world!\n")

    @patch("tools.file_tools._get_file_ops")
-    def test_exception_returns_error_json(self, mock_get):
+    def test_permission_error_returns_error_json_without_error_log(self, mock_get, caplog):
        mock_get.side_effect = PermissionError("read-only filesystem")

        from tools.file_tools import write_file_tool
-        result = json.loads(write_file_tool("/tmp/out.txt", "data"))
+        with caplog.at_level(logging.DEBUG, logger="tools.file_tools"):
+            result = json.loads(write_file_tool("/tmp/out.txt", "data"))
        assert "error" in result
        assert "read-only" in result["error"]
+        assert any("write_file expected denial" in r.getMessage() for r in caplog.records)
+        assert not any(r.levelno >= logging.ERROR for r in caplog.records)
+
+    @patch("tools.file_tools._get_file_ops")
+    def test_unexpected_exception_still_logs_error(self, mock_get, caplog):
+        """A RuntimeError is returned in the JSON error field and logged as "write_file error"."""
+        mock_get.side_effect = RuntimeError("boom")
+
+        from tools.file_tools import write_file_tool
+        with caplog.at_level(logging.ERROR, logger="tools.file_tools"):
+            payload = json.loads(write_file_tool("/tmp/out.txt", "data"))
+        assert payload["error"] == "boom"
+        messages = [record.getMessage() for record in caplog.records]
+        assert any("write_file error" in message for message in messages)


 class TestPatchHandler:
@@ -0,0 +1,36 @@
+"""Regression tests for per-call Honcho tool session routing."""
+
+import json
+from unittest.mock import MagicMock
+
+from tools import honcho_tools
+
+
+class TestHonchoToolSessionContext:
+    """Per-call Honcho context must take priority over the module-level context."""
+
+    def setup_method(self):
+        # Snapshot the module globals so teardown can put them back.
+        self.orig_manager, self.orig_key = (
+            honcho_tools._session_manager,
+            honcho_tools._session_key,
+        )
+
+    def teardown_method(self):
+        honcho_tools._session_manager, honcho_tools._session_key = (
+            self.orig_manager,
+            self.orig_key,
+        )
+
+    def test_explicit_call_context_wins_over_module_global_state(self):
+        module_level = MagicMock()
+        module_level.get_peer_card.return_value = ["global"]
+        per_call = MagicMock()
+        per_call.get_peer_card.return_value = ["explicit"]
+        honcho_tools.set_session_context(module_level, "global-session")
+
+        raw = honcho_tools._handle_honcho_profile(
+            {},
+            honcho_manager=per_call,
+            honcho_session_key="explicit-session",
+        )
+
+        assert json.loads(raw) == {"result": ["explicit"]}
+        per_call.get_peer_card.assert_called_once_with("explicit-session")
+        module_level.get_peer_card.assert_not_called()
--- a/Show More
+++ b/Show More