fix(dashboard): include cache tokens in totals, track real API call count

The analytics dashboard had three accuracy issues:

1. TOTAL TOKENS excluded cache_read and cache_write tokens — it only counted
   the non-cached input portion. With the 90%+ cache hit rates typical in
   Hermes, this dramatically undercounted actual token usage (e.g. showing
   9.1M when the real total was 169M+).
2. The 'API Calls' card displayed the session count (COUNT(*) from the
   sessions table), not actual LLM API requests. A single session makes
   10-90 API calls through the tool loop, so this was ~30x lower than
   reality.
3. cache_write_tokens was stored in the DB but never exposed through the
   analytics API endpoint or the frontend.

Changes:

- Add api_call_count column to the sessions table (schema v7 migration)
- Persist api_call_count=1 per LLM API call in run_agent.py
- Analytics SQL queries now include cache_write_tokens and api_call_count
  in the daily, by_model, and totals aggregations
- Frontend TOTAL TOKENS card now shows input + cache_read + cache_write +
  output (the full prompt total + output)
- API CALLS card now uses the real api_call_count from the DB
- New Cache Hit Rate card shows the cache efficiency percentage
- Bar chart, tooltips, daily table, and model table all use prompt totals
  (input + cache_read + cache_write) instead of just input
- Labels changed from 'Input' to 'Prompt' to reflect the full prompt total
- TypeScript interfaces and i18n strings updated (en + zh)
2026-04-15 12:31:05 +05:30
42 changed files with 193 additions and 998 deletions
@@ -12,8 +12,6 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional

-from hermes_constants import display_hermes_home
-
 logger = logging.getLogger(__name__)

 _skill_commands: Dict[str, Dict[str, Any]] = {}
@@ -110,7 +108,7 @@ def _inject_skill_config(loaded_skill: dict[str, Any], parts: list[str]) -> None
        if not resolved:
            return

-        lines = ["", f"[Skill config (from {display_hermes_home()}/config.yaml):"]
+        lines = ["", "[Skill config (from ~/.hermes/config.yaml):"]
        for key, value in resolved.items():
            display_val = str(value) if value else "(not set)"
            lines.append(f"  {key} = {display_val}")
@@ -4588,19 +4588,16 @@ class HermesCLI:
                self._close_model_picker()
                return
            provider_data = providers[selected]
-            # Use the curated model list from list_authenticated_providers()
-            # (same lists as `hermes model` and gateway pickers).
-            # Only fall back to the live provider catalog when the curated
-            # list is empty (e.g. user-defined endpoints with no curated list).
-            model_list = provider_data.get("models", [])
+            model_list = []
+            try:
+                from hermes_cli.models import provider_model_ids
+                live = provider_model_ids(provider_data["slug"])
+                if live:
+                    model_list = live
+            except Exception:
+                pass
            if not model_list:
-                try:
-                    from hermes_cli.models import provider_model_ids
-                    live = provider_model_ids(provider_data["slug"])
-                    if live:
-                        model_list = live
-                except Exception:
-                    pass
+                model_list = provider_data.get("models", [])
            state["stage"] = "model"
            state["provider_data"] = provider_data
            state["model_list"] = model_list
@@ -5956,7 +5953,7 @@ class HermesCLI:
        parts = cmd.strip().split(None, 1)
        sub = parts[1].lower().strip() if len(parts) > 1 else "status"

-        _DEFAULT_CDP = "http://127.0.0.1:9222"
+        _DEFAULT_CDP = "http://localhost:9222"
        current = os.environ.get("BROWSER_CDP_URL", "").strip()

        if sub.startswith("connect"):
@@ -288,13 +288,11 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option

    if wrap_response:
        task_name = job.get("name", job["id"])
-        job_id = job.get("id", "")
        delivery_content = (
            f"Cronjob Response: {task_name}\n"
-            f"(job_id: {job_id})\n"
            f"-------------\n\n"
            f"{content}\n\n"
-            f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
+            f"Note: The agent cannot see this message, and therefore cannot respond to it."
        )
    else:
        delivery_content = content
@@ -1624,21 +1624,6 @@ class BasePlatformAdapter(ABC):
            # streaming already delivered the text (already_sent=True) or
            # when the message was queued behind an active agent.  Log at
            # DEBUG to avoid noisy warnings for expected behavior.
-            #
-            # Suppress stale response when the session was interrupted by a
-            # new message that hasn't been consumed yet.  The pending message
-            # is processed by the pending-message handler below (#8221/#2483).
-            if (
-                response
-                and interrupt_event.is_set()
-                and session_key in self._pending_messages
-            ):
-                logger.info(
-                    "[%s] Suppressing stale response for interrupted session %s",
-                    self.name,
-                    session_key,
-                )
-                response = None
            if not response:
                logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id)
            if response:
@@ -1379,68 +1379,6 @@ class DiscordAdapter(BasePlatformAdapter):
            )
            return await super().send_image(chat_id, image_url, caption, reply_to)

-    async def send_animation(
-        self,
-        chat_id: str,
-        animation_url: str,
-        caption: Optional[str] = None,
-        reply_to: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> SendResult:
-        """Send an animated GIF natively as a Discord file attachment."""
-        if not self._client:
-            return SendResult(success=False, error="Not connected")
-
-        if not is_safe_url(animation_url):
-            logger.warning("[%s] Blocked unsafe animation URL during Discord send_animation", self.name)
-            return await super().send_animation(chat_id, animation_url, caption, reply_to, metadata=metadata)
-
-        try:
-            import aiohttp
-
-            channel = self._client.get_channel(int(chat_id))
-            if not channel:
-                channel = await self._client.fetch_channel(int(chat_id))
-            if not channel:
-                return SendResult(success=False, error=f"Channel {chat_id} not found")
-
-            # Download the GIF and send as a Discord file attachment
-            # (Discord renders .gif attachments as auto-playing animations inline)
-            from gateway.platforms.base import resolve_proxy_url, proxy_kwargs_for_aiohttp
-            _proxy = resolve_proxy_url(platform_env_var="DISCORD_PROXY")
-            _sess_kw, _req_kw = proxy_kwargs_for_aiohttp(_proxy)
-            async with aiohttp.ClientSession(**_sess_kw) as session:
-                async with session.get(animation_url, timeout=aiohttp.ClientTimeout(total=30), **_req_kw) as resp:
-                    if resp.status != 200:
-                        raise Exception(f"Failed to download animation: HTTP {resp.status}")
-
-                    animation_data = await resp.read()
-
-                    import io
-                    file = discord.File(io.BytesIO(animation_data), filename="animation.gif")
-
-                    msg = await channel.send(
-                        content=caption if caption else None,
-                        file=file,
-                    )
-                    return SendResult(success=True, message_id=str(msg.id))
-
-        except ImportError:
-            logger.warning(
-                "[%s] aiohttp not installed, falling back to URL. Run: pip install aiohttp",
-                self.name,
-                exc_info=True,
-            )
-            return await super().send_animation(chat_id, animation_url, caption, reply_to, metadata=metadata)
-        except Exception as e:  # pragma: no cover - defensive logging
-            logger.error(
-                "[%s] Failed to send animation attachment, falling back to URL: %s",
-                self.name,
-                e,
-                exc_info=True,
-            )
-            return await super().send_animation(chat_id, animation_url, caption, reply_to, metadata=metadata)
-
    async def send_video(
        self,
        chat_id: str,
@@ -729,14 +729,6 @@ class MatrixAdapter(BasePlatformAdapter):
            except Exception:
                pass

-    async def stop_typing(self, chat_id: str) -> None:
-        """Stop the Matrix typing indicator."""
-        if self._client:
-            try:
-                await self._client.set_typing(RoomID(chat_id), timeout=0)
-            except Exception:
-                pass
-
    async def edit_message(
        self, chat_id: str, message_id: str, content: str
    ) -> SendResult:
@@ -9231,11 +9231,15 @@ class GatewayRunner:
                                pass
                        except Exception as e:
                            logger.debug("Stream consumer wait before queued message failed: %s", e)
+                    _response_previewed = bool(result.get("response_previewed"))
                    _already_streamed = bool(
                        _sc
                        and (
                            getattr(_sc, "final_response_sent", False)
-                            or getattr(_sc, "already_sent", False)
+                            or (
+                                _response_previewed
+                                and getattr(_sc, "already_sent", False)
+                            )
                        )
                    )
                    first_response = result.get("final_response", "")
@@ -9319,9 +9323,13 @@ class GatewayRunner:
        # them even if streaming had sent earlier partial output.
        _sc = stream_consumer_holder[0]
        if _sc and isinstance(response, dict) and not response.get("failed"):
+            _response_previewed = bool(response.get("response_previewed"))
            if (
                getattr(_sc, "final_response_sent", False)
-                or getattr(_sc, "already_sent", False)
+                or (
+                    _response_previewed
+                    and getattr(_sc, "already_sent", False)
+                )
            ):
                response["already_sent"] = True
        
@@ -167,7 +167,6 @@ def _resolve_runtime_from_pool_entry(
        api_mode = "chat_completions"
    elif provider == "copilot":
        api_mode = _copilot_runtime_api_mode(model_cfg, getattr(entry, "runtime_api_key", ""))
-        base_url = base_url or PROVIDER_REGISTRY["copilot"].inference_base_url
    else:
        configured_provider = str(model_cfg.get("provider") or "").strip().lower()
        # Honour model.base_url from config.yaml when the configured provider
@@ -1977,7 +1977,8 @@ async def update_config_raw(body: RawConfigUpdate):
@app.get("/api/analytics/usage")
 async def get_usage_analytics(days: int = 30):
    from hermes_state import SessionDB
-    db = SessionDB()
+    from hermes_constants import get_hermes_home
+    db = SessionDB(db_path=get_hermes_home() / "state.db")
    try:
        cutoff = time.time() - (days * 86400)
        cur = db._conn.execute("""
@@ -1985,10 +1986,12 @@ async def get_usage_analytics(days: int = 30):
                   SUM(input_tokens) as input_tokens,
                   SUM(output_tokens) as output_tokens,
                   SUM(cache_read_tokens) as cache_read_tokens,
+                   SUM(cache_write_tokens) as cache_write_tokens,
                   SUM(reasoning_tokens) as reasoning_tokens,
                   COALESCE(SUM(estimated_cost_usd), 0) as estimated_cost,
                   COALESCE(SUM(actual_cost_usd), 0) as actual_cost,
-                   COUNT(*) as sessions
+                   COUNT(*) as sessions,
+                   SUM(COALESCE(api_call_count, 0)) as api_calls
            FROM sessions WHERE started_at > ?
            GROUP BY day ORDER BY day
        """, (cutoff,))
@@ -1998,10 +2001,13 @@ async def get_usage_analytics(days: int = 30):
            SELECT model,
                   SUM(input_tokens) as input_tokens,
                   SUM(output_tokens) as output_tokens,
+                   SUM(cache_read_tokens) as cache_read_tokens,
+                   SUM(cache_write_tokens) as cache_write_tokens,
                   COALESCE(SUM(estimated_cost_usd), 0) as estimated_cost,
-                   COUNT(*) as sessions
+                   COUNT(*) as sessions,
+                   SUM(COALESCE(api_call_count, 0)) as api_calls
            FROM sessions WHERE started_at > ? AND model IS NOT NULL
-            GROUP BY model ORDER BY SUM(input_tokens) + SUM(output_tokens) DESC
+            GROUP BY model ORDER BY SUM(input_tokens) + SUM(cache_read_tokens) + SUM(cache_write_tokens) + SUM(output_tokens) DESC
        """, (cutoff,))
        by_model = [dict(r) for r in cur2.fetchall()]

@@ -2009,10 +2015,12 @@ async def get_usage_analytics(days: int = 30):
            SELECT SUM(input_tokens) as total_input,
                   SUM(output_tokens) as total_output,
                   SUM(cache_read_tokens) as total_cache_read,
+                   SUM(cache_write_tokens) as total_cache_write,
                   SUM(reasoning_tokens) as total_reasoning,
                   COALESCE(SUM(estimated_cost_usd), 0) as total_estimated_cost,
                   COALESCE(SUM(actual_cost_usd), 0) as total_actual_cost,
-                   COUNT(*) as total_sessions
+                   COUNT(*) as total_sessions,
+                   SUM(COALESCE(api_call_count, 0)) as total_api_calls
            FROM sessions WHERE started_at > ?
        """, (cutoff,))
        totals = dict(cur3.fetchone())
@@ -31,7 +31,7 @@ T = TypeVar("T")

 DEFAULT_DB_PATH = get_hermes_home() / "state.db"

-SCHEMA_VERSION = 6
+SCHEMA_VERSION = 7

 SCHEMA_SQL = """
 CREATE TABLE IF NOT EXISTS schema_version (
@@ -65,6 +65,7 @@ CREATE TABLE IF NOT EXISTS sessions (
    cost_source TEXT,
    pricing_version TEXT,
    title TEXT,
+    api_call_count INTEGER DEFAULT 0,
    FOREIGN KEY (parent_session_id) REFERENCES sessions(id)
 );

@@ -329,6 +330,17 @@ class SessionDB:
                    except sqlite3.OperationalError:
                        pass  # Column already exists
                cursor.execute("UPDATE schema_version SET version = 6")
+            if current_version < 7:
+                # v7: add api_call_count column to sessions — tracks the number
+                # of individual LLM API calls made within a session (as opposed
+                # to the session count itself).
+                try:
+                    cursor.execute(
+                        'ALTER TABLE sessions ADD COLUMN "api_call_count" INTEGER DEFAULT 0'
+                    )
+                except sqlite3.OperationalError:
+                    pass  # Column already exists
+                cursor.execute("UPDATE schema_version SET version = 7")

        # Unique title index — always ensure it exists (safe to run after migrations
        # since the title column is guaranteed to exist at this point)
@@ -426,6 +438,7 @@ class SessionDB:
        billing_provider: Optional[str] = None,
        billing_base_url: Optional[str] = None,
        billing_mode: Optional[str] = None,
+        api_call_count: int = 0,
        absolute: bool = False,
    ) -> None:
        """Update token counters and backfill model if not already set.
@@ -455,7 +468,8 @@ class SessionDB:
                   billing_provider = COALESCE(billing_provider, ?),
                   billing_base_url = COALESCE(billing_base_url, ?),
                   billing_mode = COALESCE(billing_mode, ?),
-                   model = COALESCE(model, ?)
+                   model = COALESCE(model, ?),
+                   api_call_count = ?
                   WHERE id = ?"""
        else:
            sql = """UPDATE sessions SET
@@ -475,7 +489,8 @@ class SessionDB:
                   billing_provider = COALESCE(billing_provider, ?),
                   billing_base_url = COALESCE(billing_base_url, ?),
                   billing_mode = COALESCE(billing_mode, ?),
-                   model = COALESCE(model, ?)
+                   model = COALESCE(model, ?),
+                   api_call_count = COALESCE(api_call_count, 0) + ?
                   WHERE id = ?"""
        params = (
            input_tokens,
@@ -493,6 +508,7 @@ class SessionDB:
            billing_base_url,
            billing_mode,
            model,
+            api_call_count,
            session_id,
        )
        def _do(conn):
@@ -3589,12 +3589,7 @@ class AIAgent:
                                item_id = ri.get("id")
                                if item_id and item_id in seen_item_ids:
                                    continue
-                                # Strip the "id" field — with store=False the
-                                # Responses API cannot look up items by ID and
-                                # returns 404.  The encrypted_content blob is
-                                # self-contained for reasoning chain continuity.
-                                replay_item = {k: v for k, v in ri.items() if k != "id"}
-                                items.append(replay_item)
+                                items.append(ri)
                                if item_id:
                                    seen_item_ids.add(item_id)
                                has_codex_reasoning = True
@@ -3735,10 +3730,8 @@ class AIAgent:
                            continue
                        seen_ids.add(item_id)
                    reasoning_item = {"type": "reasoning", "encrypted_content": encrypted}
-                    # Do NOT include the "id" in the outgoing item — with
-                    # store=False (our default) the API tries to resolve the
-                    # id server-side and returns 404.  The id is still used
-                    # above for local deduplication via seen_ids.
+                    if isinstance(item_id, str) and item_id:
+                        reasoning_item["id"] = item_id
                    summary = item.get("summary")
                    if isinstance(summary, list):
                        reasoning_item["summary"] = summary
@@ -8924,6 +8917,7 @@ class AIAgent:
                                    billing_mode="subscription_included"
                                    if cost_result.status == "included" else None,
                                    model=self.model,
+                                    api_call_count=1,
                                )
                            except Exception:
                                pass  # never block the agent loop
@@ -98,7 +98,7 @@ def find_nearby(lat: float, lon: float, types: list[str], radius: int = 1500, li
        # Get coordinates (nodes have lat/lon directly, ways/relations use center)
        plat = el.get("lat") or (el.get("center", {}) or {}).get("lat")
        plon = el.get("lon") or (el.get("center", {}) or {}).get("lon")
-        if plat is None or plon is None:
+        if not plat or not plon:
            continue

        dist = haversine(lat, lon, plat, plon)
@@ -25,13 +25,6 @@ def refresh_token(token_data: dict) -> dict:
    import urllib.parse
    import urllib.request

-    required_keys = ["client_id", "client_secret", "refresh_token", "token_uri"]
-    missing = [k for k in required_keys if k not in token_data]
-    if missing:
-        print(f"ERROR: google_token.json is missing required fields: {', '.join(missing)}", file=sys.stderr)
-        print("Please re-authenticate by running the Google Workspace setup script.", file=sys.stderr)
-        sys.exit(1)
-
    params = urllib.parse.urlencode({
        "client_id": token_data["client_id"],
        "client_secret": token_data["client_secret"],
@@ -232,7 +232,7 @@ class TestResolveVisionProviderClientModelNormalization:

        assert provider == "zai"
        assert client is not None
-        assert model == "glm-5v-turbo"  # zai has dedicated vision model in _PROVIDER_VISION_MODELS
+        assert model == "glm-5.1"


 class TestVisionPathApiMode:
@@ -233,10 +233,9 @@ class TestDeliverResultWrapping:
        send_mock.assert_called_once()
        sent_content = send_mock.call_args.kwargs.get("content") or send_mock.call_args[0][-1]
        assert "Cronjob Response: daily-report" in sent_content
-        assert "(job_id: test-job)" in sent_content
        assert "-------------" in sent_content
        assert "Here is today's summary." in sent_content
-        assert "To stop or manage this job" in sent_content
+        assert "The agent cannot see this message" in sent_content

    def test_delivery_uses_job_id_when_no_name(self):
        """When a job has no name, the wrapper should fall back to job id."""
@@ -1,66 +0,0 @@
-"""Shared fixtures for gateway tests.
-
-The ``_ensure_telegram_mock`` helper guarantees that a minimal mock of
-the ``telegram`` package is registered in :data:`sys.modules` **before**
-any test file triggers ``from gateway.platforms.telegram import ...``.
-
-Without this, ``pytest-xdist`` workers that happen to collect
-``test_telegram_caption_merge.py`` (bare top-level import, no per-file
-mock) first will cache ``ChatType = None`` from the production
-ImportError fallback, causing 30+ downstream test failures wherever
-``ChatType.GROUP`` / ``ChatType.SUPERGROUP`` is accessed.
-
-Individual test files may still call their own ``_ensure_telegram_mock``
-— it short-circuits when the mock is already present.
-"""
-
-import sys
-from unittest.mock import MagicMock
-
-
-def _ensure_telegram_mock() -> None:
-    """Install a comprehensive telegram mock in sys.modules.
-
-    Idempotent — skips when the real library is already imported.
-    Uses ``sys.modules[name] = mod`` (overwrite) instead of
-    ``setdefault`` so it wins even if a partial/broken import
-    already cached a module with ``ChatType = None``.
-    """
-    if "telegram" in sys.modules and hasattr(sys.modules["telegram"], "__file__"):
-        return  # Real library is installed — nothing to mock
-
-    mod = MagicMock()
-    mod.ext.ContextTypes.DEFAULT_TYPE = type(None)
-    mod.constants.ParseMode.MARKDOWN = "Markdown"
-    mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2"
-    mod.constants.ParseMode.HTML = "HTML"
-    mod.constants.ChatType.PRIVATE = "private"
-    mod.constants.ChatType.GROUP = "group"
-    mod.constants.ChatType.SUPERGROUP = "supergroup"
-    mod.constants.ChatType.CHANNEL = "channel"
-
-    # Real exception classes so ``except (NetworkError, ...)`` clauses
-    # in production code don't blow up with TypeError.
-    mod.error.NetworkError = type("NetworkError", (OSError,), {})
-    mod.error.TimedOut = type("TimedOut", (OSError,), {})
-    mod.error.BadRequest = type("BadRequest", (Exception,), {})
-    mod.error.Forbidden = type("Forbidden", (Exception,), {})
-    mod.error.InvalidToken = type("InvalidToken", (Exception,), {})
-    mod.error.RetryAfter = type("RetryAfter", (Exception,), {"retry_after": 1})
-    mod.error.Conflict = type("Conflict", (Exception,), {})
-
-    # Update.ALL_TYPES used in start_polling()
-    mod.Update.ALL_TYPES = []
-
-    for name in (
-        "telegram",
-        "telegram.ext",
-        "telegram.constants",
-        "telegram.request",
-    ):
-        sys.modules[name] = mod
-    sys.modules["telegram.error"] = mod.error
-
-
-# Run at collection time — before any test file's module-level imports.
-_ensure_telegram_mock()
@@ -1,291 +0,0 @@
-"""Tests for duplicate reply suppression across the gateway stack.
-
-Covers three fix paths:
-  1. base.py: stale response suppressed when interrupt_event is set and a
-     pending message exists (#8221 / #2483)
-  2. run.py return path: already_sent propagated from stream consumer's
-     already_sent flag without requiring response_previewed (#8375)
-  3. run.py queued-message path: first response correctly detected as
-     already-streamed when already_sent is True without response_previewed
-"""
-
-import asyncio
-from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock, patch
-
-import pytest
-
-from gateway.config import Platform, PlatformConfig
-from gateway.platforms.base import (
-    BasePlatformAdapter,
-    MessageEvent,
-    MessageType,
-    ProcessingOutcome,
-    SendResult,
-)
-from gateway.session import SessionSource, build_session_key
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-class StubAdapter(BasePlatformAdapter):
-    """Minimal concrete adapter for testing."""
-
-    def __init__(self):
-        super().__init__(PlatformConfig(enabled=True, token="fake"), Platform.DISCORD)
-        self.sent = []
-
-    async def connect(self):
-        return True
-
-    async def disconnect(self):
-        pass
-
-    async def send(self, chat_id, content, reply_to=None, metadata=None):
-        self.sent.append({"chat_id": chat_id, "content": content})
-        return SendResult(success=True, message_id="msg1")
-
-    async def send_typing(self, chat_id, metadata=None):
-        pass
-
-    async def get_chat_info(self, chat_id):
-        return {"id": chat_id}
-
-
-def _make_event(text="hello", chat_id="c1", user_id="u1"):
-    return MessageEvent(
-        text=text,
-        source=SessionSource(
-            platform=Platform.DISCORD,
-            chat_id=chat_id,
-            chat_type="dm",
-            user_id=user_id,
-        ),
-        message_id="m1",
-    )
-
-
-# ===================================================================
-# Test 1: base.py — stale response suppressed on interrupt (#8221)
-# ===================================================================
-
-class TestBaseInterruptSuppression:
-    @pytest.mark.asyncio
-    async def test_stale_response_suppressed_when_interrupted(self):
-        """When interrupt_event is set AND a pending message exists,
-        base.py should suppress the stale response instead of sending it."""
-        adapter = StubAdapter()
-
-        stale_response = "This is the stale answer to the first question."
-        pending_response = "This is the answer to the second question."
-        call_count = 0
-
-        async def fake_handler(event):
-            nonlocal call_count
-            call_count += 1
-            if call_count == 1:
-                return stale_response
-            return pending_response
-
-        adapter.set_message_handler(fake_handler)
-
-        event_a = _make_event(text="first question")
-        session_key = build_session_key(event_a.source)
-
-        # Simulate: message A is being processed, message B arrives
-        # The interrupt event is set and B is in pending_messages
-        interrupt_event = asyncio.Event()
-        interrupt_event.set()
-        adapter._active_sessions[session_key] = interrupt_event
-
-        event_b = _make_event(text="second question")
-        adapter._pending_messages[session_key] = event_b
-
-        await adapter._process_message_background(event_a, session_key)
-
-        # The stale response should NOT have been sent.
-        stale_sends = [s for s in adapter.sent if s["content"] == stale_response]
-        assert len(stale_sends) == 0, (
-            f"Stale response was sent {len(stale_sends)} time(s) — should be suppressed"
-        )
-        # The pending message's response SHOULD have been sent.
-        pending_sends = [s for s in adapter.sent if s["content"] == pending_response]
-        assert len(pending_sends) == 1, "Pending message response should be sent"
-
-    @pytest.mark.asyncio
-    async def test_response_not_suppressed_without_interrupt(self):
-        """Normal case: no interrupt, response should be sent."""
-        adapter = StubAdapter()
-
-        async def fake_handler(event):
-            return "Normal response"
-
-        adapter.set_message_handler(fake_handler)
-        event = _make_event()
-        session_key = build_session_key(event.source)
-
-        await adapter._process_message_background(event, session_key)
-
-        assert any(s["content"] == "Normal response" for s in adapter.sent)
-
-    @pytest.mark.asyncio
-    async def test_response_not_suppressed_with_interrupt_but_no_pending(self):
-        """Interrupt event set but no pending message (race already resolved) —
-        response should still be sent."""
-        adapter = StubAdapter()
-
-        async def fake_handler(event):
-            return "Valid response"
-
-        adapter.set_message_handler(fake_handler)
-        event = _make_event()
-        session_key = build_session_key(event.source)
-
-        # Set interrupt but no pending message
-        interrupt_event = asyncio.Event()
-        interrupt_event.set()
-        adapter._active_sessions[session_key] = interrupt_event
-
-        await adapter._process_message_background(event, session_key)
-
-        assert any(s["content"] == "Valid response" for s in adapter.sent)
-
-
-# ===================================================================
-# Test 2: run.py — already_sent without response_previewed (#8375)
-# ===================================================================
-
-class TestAlreadySentWithoutResponsePreviewed:
-    """The already_sent flag on the response dict should be set when the
-    stream consumer's already_sent is True, even if response_previewed is
-    False.  This prevents duplicate sends when streaming was interrupted
-    by flood control."""
-
-    def _make_mock_stream_consumer(self, already_sent=False, final_response_sent=False):
-        sc = SimpleNamespace(
-            already_sent=already_sent,
-            final_response_sent=final_response_sent,
-        )
-        return sc
-
-    def test_already_sent_set_without_response_previewed(self):
-        """Stream consumer already_sent=True should propagate to response
-        dict even when response_previewed is False."""
-        sc = self._make_mock_stream_consumer(already_sent=True, final_response_sent=False)
-        response = {"final_response": "text", "response_previewed": False}
-
-        # Reproduce the logic from run.py return path (post-fix)
-        if sc and isinstance(response, dict) and not response.get("failed"):
-            if (
-                getattr(sc, "final_response_sent", False)
-                or getattr(sc, "already_sent", False)
-            ):
-                response["already_sent"] = True
-
-        assert response.get("already_sent") is True
-
-    def test_already_sent_not_set_when_nothing_sent(self):
-        """When stream consumer hasn't sent anything, already_sent should
-        not be set on the response."""
-        sc = self._make_mock_stream_consumer(already_sent=False, final_response_sent=False)
-        response = {"final_response": "text", "response_previewed": False}
-
-        if sc and isinstance(response, dict) and not response.get("failed"):
-            if (
-                getattr(sc, "final_response_sent", False)
-                or getattr(sc, "already_sent", False)
-            ):
-                response["already_sent"] = True
-
-        assert "already_sent" not in response
-
-    def test_already_sent_set_on_final_response_sent(self):
-        """final_response_sent=True should still work as before."""
-        sc = self._make_mock_stream_consumer(already_sent=False, final_response_sent=True)
-        response = {"final_response": "text"}
-
-        if sc and isinstance(response, dict) and not response.get("failed"):
-            if (
-                getattr(sc, "final_response_sent", False)
-                or getattr(sc, "already_sent", False)
-            ):
-                response["already_sent"] = True
-
-        assert response.get("already_sent") is True
-
-    def test_already_sent_not_set_on_failed_response(self):
-        """Failed responses should never be suppressed — user needs to see
-        the error message even if streaming sent earlier partial output."""
-        sc = self._make_mock_stream_consumer(already_sent=True, final_response_sent=False)
-        response = {"final_response": "Error: something broke", "failed": True}
-
-        if sc and isinstance(response, dict) and not response.get("failed"):
-            if (
-                getattr(sc, "final_response_sent", False)
-                or getattr(sc, "already_sent", False)
-            ):
-                response["already_sent"] = True
-
-        assert "already_sent" not in response
-
-
-# ===================================================================
-# Test 3: run.py queued-message path — _already_streamed detection
-# ===================================================================
-
-class TestQueuedMessageAlreadyStreamed:
-    """The queued-message path should detect that the first response was
-    already streamed (already_sent=True) even without response_previewed."""
-
-    def _make_mock_sc(self, already_sent=False, final_response_sent=False):
-        return SimpleNamespace(
-            already_sent=already_sent,
-            final_response_sent=final_response_sent,
-        )
-
-    def test_queued_path_detects_already_streamed(self):
-        """already_sent=True on stream consumer means first response was
-        streamed — skip re-sending before processing queued message."""
-        _sc = self._make_mock_sc(already_sent=True)
-
-        # Reproduce the queued-message logic from run.py (post-fix)
-        _already_streamed = bool(
-            _sc
-            and (
-                getattr(_sc, "final_response_sent", False)
-                or getattr(_sc, "already_sent", False)
-            )
-        )
-
-        assert _already_streamed is True
-
-    def test_queued_path_sends_when_not_streamed(self):
-        """Nothing was streamed — first response should be sent before
-        processing the queued message."""
-        _sc = self._make_mock_sc(already_sent=False)
-
-        _already_streamed = bool(
-            _sc
-            and (
-                getattr(_sc, "final_response_sent", False)
-                or getattr(_sc, "already_sent", False)
-            )
-        )
-
-        assert _already_streamed is False
-
-    def test_queued_path_with_no_stream_consumer(self):
-        """No stream consumer at all (streaming disabled) — not streamed."""
-        _sc = None
-
-        _already_streamed = bool(
-            _sc
-            and (
-                getattr(_sc, "final_response_sent", False)
-                or getattr(_sc, "already_sent", False)
-            )
-        )
-
-        assert _already_streamed is False
@@ -335,29 +335,6 @@ def _make_adapter():
    return adapter


-# ---------------------------------------------------------------------------
-# Typing indicator
-# ---------------------------------------------------------------------------
-
-class TestMatrixTypingIndicator:
-    def setup_method(self):
-        self.adapter = _make_adapter()
-        self.adapter._client = MagicMock()
-        self.adapter._client.set_typing = AsyncMock()
-
-    @pytest.mark.asyncio
-    async def test_stop_typing_clears_matrix_typing_state(self):
-        """stop_typing() should send typing=false instead of waiting for timeout expiry."""
-        from gateway.platforms.matrix import RoomID
-
-        await self.adapter.stop_typing("!room:example.org")
-
-        self.adapter._client.set_typing.assert_awaited_once_with(
-            RoomID("!room:example.org"),
-            timeout=0,
-        )
-
-
 # ---------------------------------------------------------------------------
 # mxc:// URL conversion
 # ---------------------------------------------------------------------------
@@ -1854,3 +1831,4 @@ class TestMatrixPresence:
        assert result is False


+
@@ -613,7 +613,6 @@ class TestDetectVenvDir:
        # Not inside a virtualenv
        monkeypatch.setattr("sys.prefix", "/usr")
        monkeypatch.setattr("sys.base_prefix", "/usr")
-        monkeypatch.delenv("VIRTUAL_ENV", raising=False)
        monkeypatch.setattr(gateway_cli, "PROJECT_ROOT", tmp_path)

        dot_venv = tmp_path / ".venv"
@@ -625,7 +624,6 @@ class TestDetectVenvDir:
    def test_falls_back_to_venv_directory(self, tmp_path, monkeypatch):
        monkeypatch.setattr("sys.prefix", "/usr")
        monkeypatch.setattr("sys.base_prefix", "/usr")
-        monkeypatch.delenv("VIRTUAL_ENV", raising=False)
        monkeypatch.setattr(gateway_cli, "PROJECT_ROOT", tmp_path)

        venv = tmp_path / "venv"
@@ -637,7 +635,6 @@ class TestDetectVenvDir:
    def test_prefers_dot_venv_over_venv(self, tmp_path, monkeypatch):
        monkeypatch.setattr("sys.prefix", "/usr")
        monkeypatch.setattr("sys.base_prefix", "/usr")
-        monkeypatch.delenv("VIRTUAL_ENV", raising=False)
        monkeypatch.setattr(gateway_cli, "PROJECT_ROOT", tmp_path)

        (tmp_path / ".venv").mkdir()
@@ -649,7 +646,6 @@ class TestDetectVenvDir:
    def test_returns_none_when_no_virtualenv(self, tmp_path, monkeypatch):
        monkeypatch.setattr("sys.prefix", "/usr")
        monkeypatch.setattr("sys.base_prefix", "/usr")
-        monkeypatch.delenv("VIRTUAL_ENV", raising=False)
        monkeypatch.setattr(gateway_cli, "PROJECT_ROOT", tmp_path)

        result = gateway_cli._detect_venv_dir()
@@ -694,6 +694,8 @@ class TestNewEndpoints:
        assert "totals" in data
        assert isinstance(data["daily"], list)
        assert "total_sessions" in data["totals"]
+        assert "total_cache_write" in data["totals"]
+        assert "total_api_calls" in data["totals"]

    def test_session_token_endpoint_removed(self):
        """GET /api/auth/session-token no longer exists."""
@@ -9,8 +9,6 @@ def _build_agent(model_cfg, custom_providers=None, model="anthropic/claude-opus-
    if custom_providers is not None:
        cfg["custom_providers"] = custom_providers

-    base_url = model_cfg.get("base_url", "")
-
    with (
        patch("hermes_cli.config.load_config", return_value=cfg),
        patch("agent.model_metadata.get_model_context_length", return_value=128_000),
@@ -23,7 +21,6 @@ def _build_agent(model_cfg, custom_providers=None, model="anthropic/claude-opus-
        agent = AIAgent(
            model=model,
            api_key="test-key-1234567890",
-            base_url=base_url,
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
@@ -805,10 +805,7 @@ class TestCodexReasoningPreflight:
        reasoning_items = [i for i in normalized if i.get("type") == "reasoning"]
        assert len(reasoning_items) == 1
        assert reasoning_items[0]["encrypted_content"] == "abc123encrypted"
-        # Note: "id" is intentionally excluded from normalized output —
-        # with store=False the API returns 404 on server-side id resolution.
-        # The id is only used for local deduplication via seen_ids.
-        assert "id" not in reasoning_items[0]
+        assert reasoning_items[0]["id"] == "r_001"
        assert reasoning_items[0]["summary"] == [{"type": "summary_text", "text": "Thinking about it"}]

    def test_reasoning_item_without_id(self, monkeypatch):
@@ -1249,17 +1249,13 @@ def test_chat_messages_to_responses_input_deduplicates_reasoning_ids(monkeypatch
    ]
    items = agent._chat_messages_to_responses_input(messages)

-    reasoning_items = [it for it in items if it.get("type") == "reasoning"]
-    # Dedup: rs_aaa appears in both turns but should only be emitted once.
-    # 3 unique items total: enc_1 (from rs_aaa), enc_2 (rs_bbb), enc_3 (rs_ccc).
-    assert len(reasoning_items) == 3
-    encrypted = [it["encrypted_content"] for it in reasoning_items]
-    assert encrypted.count("enc_1") == 1
-    assert "enc_2" in encrypted
-    assert "enc_3" in encrypted
-    # IDs must be stripped — with store=False the API 404s on id lookups.
-    for it in reasoning_items:
-        assert "id" not in it
+    reasoning_ids = [it["id"] for it in items if it.get("type") == "reasoning"]
+    # rs_aaa should appear only once (first occurrence kept)
+    assert reasoning_ids.count("rs_aaa") == 1
+    # rs_bbb and rs_ccc should each appear once
+    assert reasoning_ids.count("rs_bbb") == 1
+    assert reasoning_ids.count("rs_ccc") == 1
+    assert len(reasoning_ids) == 3


 def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch):
@@ -1276,11 +1272,7 @@ def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch):
    normalized = agent._preflight_codex_input_items(raw_input)

    reasoning_items = [it for it in normalized if it.get("type") == "reasoning"]
-    # rs_xyz duplicate should be collapsed to one item; rs_zzz kept.
+    reasoning_ids = [it["id"] for it in reasoning_items]
+    assert reasoning_ids.count("rs_xyz") == 1
+    assert reasoning_ids.count("rs_zzz") == 1
    assert len(reasoning_items) == 2
-    encrypted = [it["encrypted_content"] for it in reasoning_items]
-    assert encrypted.count("enc_a") == 1
-    assert "enc_b" in encrypted
-    # IDs must be stripped — with store=False the API 404s on id lookups.
-    for it in reasoning_items:
-        assert "id" not in it
@@ -46,18 +46,9 @@ def api_module(monkeypatch, tmp_path):
    module = importlib.util.module_from_spec(spec)
    assert spec.loader is not None
    spec.loader.exec_module(module)
-    # Ensure the gws CLI code path is taken even when the binary isn't
-    # installed (CI).  Without this, calendar_list() falls through to the
-    # Python SDK path which imports ``googleapiclient`` — not in deps.
-    module._gws_binary = lambda: "/usr/bin/gws"
-    # Bypass authentication check — no real token file in CI.
-    module._ensure_authenticated = lambda: None
    return module


-_gws_installed = importlib.util.find_spec("shutil") and __import__("shutil").which("gws")
-
-
 def _write_token(path: Path, *, token="ya29.test", expiry=None, **extra):
    data = {
        "token": token,
@@ -133,14 +124,13 @@ def test_bridge_main_injects_token_env(bridge_module, tmp_path):
    assert captured["cmd"] == ["gws", "gmail", "+triage"]


-@pytest.mark.skipif(not _gws_installed, reason="gws CLI not installed")
 def test_api_calendar_list_uses_agenda_by_default(api_module):
    """calendar list without dates uses +agenda helper."""
    captured = {}

    def capture_run(cmd, **kwargs):
        captured["cmd"] = cmd
-        return MagicMock(returncode=0, stdout="{}", stderr="")
+        return MagicMock(returncode=0)

    args = api_module.argparse.Namespace(
        start="", end="", max=25, calendar="primary", func=api_module.calendar_list,
@@ -156,7 +146,6 @@ def test_api_calendar_list_uses_agenda_by_default(api_module):
    assert "--days" in gws_args


-@pytest.mark.skipif(not _gws_installed, reason="gws CLI not installed")
 def test_api_calendar_list_respects_date_range(api_module):
    """calendar list with --start/--end uses raw events list API."""
    captured = {}
@@ -62,6 +62,27 @@ class TestSessionLifecycle:
        assert session["input_tokens"] == 300
        assert session["output_tokens"] == 150

+    def test_update_token_counts_tracks_api_call_count(self, db):
+        """api_call_count increments with each update_token_counts call."""
+        db.create_session(session_id="s1", source="cli")
+        db.update_token_counts("s1", input_tokens=100, output_tokens=50, api_call_count=1)
+        db.update_token_counts("s1", input_tokens=100, output_tokens=50, api_call_count=1)
+        db.update_token_counts("s1", input_tokens=100, output_tokens=50, api_call_count=1)
+
+        session = db.get_session("s1")
+        assert session["api_call_count"] == 3
+
+    def test_update_token_counts_api_call_count_absolute(self, db):
+        """absolute mode sets api_call_count directly."""
+        db.create_session(session_id="s1", source="cli")
+        db.update_token_counts("s1", input_tokens=100, output_tokens=50, api_call_count=1)
+        db.update_token_counts("s1", input_tokens=300, output_tokens=150,
+                               api_call_count=5, absolute=True)
+
+        session = db.get_session("s1")
+        assert session["api_call_count"] == 5
+        assert session["input_tokens"] == 300
+
    def test_update_token_counts_backfills_model_when_null(self, db):
        db.create_session(session_id="s1", source="telegram")
        db.update_token_counts("s1", input_tokens=10, output_tokens=5, model="openai/gpt-5.4")
@@ -935,7 +956,7 @@ class TestSchemaInit:
    def test_schema_version(self, db):
        cursor = db._conn.execute("SELECT version FROM schema_version")
        version = cursor.fetchone()[0]
-        assert version == 6
+        assert version == 7

    def test_title_column_exists(self, db):
        """Verify the title column was created in the sessions table."""
@@ -996,13 +1017,19 @@ class TestSchemaInit:

        # Verify migration
        cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
-        assert cursor.fetchone()[0] == 6
+        assert cursor.fetchone()[0] == 7

        # Verify title column exists and is NULL for existing sessions
        session = migrated_db.get_session("existing")
        assert session is not None
        assert session["title"] is None

+        # Verify api_call_count column was added with default 0
+        cursor = migrated_db._conn.execute(
+            "SELECT api_call_count FROM sessions WHERE id = 'existing'"
+        )
+        assert cursor.fetchone()[0] == 0
+
        # Verify we can set title on migrated session
        assert migrated_db.set_session_title("existing", "Migrated Title") is True
        session = migrated_db.get_session("existing")
@@ -123,7 +123,7 @@ class TestSendMatrix:
        session.put.assert_called_once()
        call_kwargs = session.put.call_args
        url = call_kwargs[0][0]
-        assert url.startswith("https://matrix.example.com/_matrix/client/v3/rooms/%21room%3Aexample.com/send/m.room.message/")
+        assert url.startswith("https://matrix.example.com/_matrix/client/v3/rooms/!room:example.com/send/m.room.message/")
        assert call_kwargs[1]["headers"]["Authorization"] == "Bearer syt_tok"
        payload = call_kwargs[1]["json"]
        assert payload["msgtype"] == "m.text"
@@ -752,38 +752,6 @@ class TestParseTargetRefDiscord:
        assert is_explicit is True


-class TestParseTargetRefMatrix:
-    """_parse_target_ref correctly handles Matrix room IDs and user MXIDs."""
-
-    def test_matrix_room_id_is_explicit(self):
-        """Matrix room IDs (!) are recognized as explicit targets."""
-        chat_id, thread_id, is_explicit = _parse_target_ref("matrix", "!HLOQwxYGgFPMPJUSNR:matrix.org")
-        assert chat_id == "!HLOQwxYGgFPMPJUSNR:matrix.org"
-        assert thread_id is None
-        assert is_explicit is True
-
-    def test_matrix_user_mxid_is_explicit(self):
-        """Matrix user MXIDs (@) are recognized as explicit targets."""
-        chat_id, thread_id, is_explicit = _parse_target_ref("matrix", "@hermes:matrix.org")
-        assert chat_id == "@hermes:matrix.org"
-        assert thread_id is None
-        assert is_explicit is True
-
-    def test_matrix_alias_is_not_explicit(self):
-        """Matrix room aliases (#) are NOT explicit — they need resolution."""
-        chat_id, thread_id, is_explicit = _parse_target_ref("matrix", "#general:matrix.org")
-        assert chat_id is None
-        assert is_explicit is False
-
-    def test_matrix_prefix_only_matches_matrix_platform(self):
-        """! and @ prefixes are only treated as explicit for the matrix platform."""
-        chat_id, _, is_explicit = _parse_target_ref("telegram", "!something")
-        assert is_explicit is False
-
-        chat_id, _, is_explicit = _parse_target_ref("discord", "@someone")
-        assert is_explicit is False
-
-
 class TestSendDiscordThreadId:
    """_send_discord uses thread_id when provided."""

@@ -886,225 +854,3 @@ class TestSendToPlatformDiscordThread:
        send_mock.assert_awaited_once()
        _, call_kwargs = send_mock.await_args
        assert call_kwargs["thread_id"] is None
-
-
-# ---------------------------------------------------------------------------
-# Discord media attachment support
-# ---------------------------------------------------------------------------
-
-
-class TestSendDiscordMedia:
-    """_send_discord uploads media files via multipart/form-data."""
-
-    @staticmethod
-    def _build_mock(response_status, response_data=None, response_text="error body"):
-        """Build a properly-structured aiohttp mock chain."""
-        mock_resp = MagicMock()
-        mock_resp.status = response_status
-        mock_resp.json = AsyncMock(return_value=response_data or {"id": "msg123"})
-        mock_resp.text = AsyncMock(return_value=response_text)
-        mock_resp.__aenter__ = AsyncMock(return_value=mock_resp)
-        mock_resp.__aexit__ = AsyncMock(return_value=None)
-
-        mock_session = MagicMock()
-        mock_session.__aenter__ = AsyncMock(return_value=mock_session)
-        mock_session.__aexit__ = AsyncMock(return_value=None)
-        mock_session.post = MagicMock(return_value=mock_resp)
-
-        return mock_session, mock_resp
-
-    def test_text_and_media_sends_both(self, tmp_path):
-        """Text message is sent first, then each media file as multipart."""
-        img = tmp_path / "photo.png"
-        img.write_bytes(b"\x89PNG fake image data")
-
-        mock_session, _ = self._build_mock(200, {"id": "msg999"})
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "111", "hello", media_files=[(str(img), False)])
-            )
-
-        assert result["success"] is True
-        assert result["message_id"] == "msg999"
-        # Two POSTs: one text JSON, one multipart upload
-        assert mock_session.post.call_count == 2
-
-    def test_media_only_skips_text_post(self, tmp_path):
-        """When message is empty and media is present, text POST is skipped."""
-        img = tmp_path / "photo.png"
-        img.write_bytes(b"\x89PNG fake image data")
-
-        mock_session, _ = self._build_mock(200, {"id": "media_only"})
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "222", "  ", media_files=[(str(img), False)])
-            )
-
-        assert result["success"] is True
-        # Only one POST: the media upload (text was whitespace-only)
-        assert mock_session.post.call_count == 1
-
-    def test_missing_media_file_collected_as_warning(self):
-        """Non-existent media paths produce warnings but don't fail."""
-        mock_session, _ = self._build_mock(200, {"id": "txt_ok"})
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "333", "hello", media_files=[("/nonexistent/file.png", False)])
-            )
-
-        assert result["success"] is True
-        assert "warnings" in result
-        assert any("not found" in w for w in result["warnings"])
-        # Only the text POST was made, media was skipped
-        assert mock_session.post.call_count == 1
-
-    def test_media_upload_failure_collected_as_warning(self, tmp_path):
-        """Failed media upload becomes a warning, text still succeeds."""
-        img = tmp_path / "photo.png"
-        img.write_bytes(b"\x89PNG fake image data")
-
-        # First call (text) succeeds, second call (media) returns 413
-        text_resp = MagicMock()
-        text_resp.status = 200
-        text_resp.json = AsyncMock(return_value={"id": "txt_ok"})
-        text_resp.__aenter__ = AsyncMock(return_value=text_resp)
-        text_resp.__aexit__ = AsyncMock(return_value=None)
-
-        media_resp = MagicMock()
-        media_resp.status = 413
-        media_resp.text = AsyncMock(return_value="Request Entity Too Large")
-        media_resp.__aenter__ = AsyncMock(return_value=media_resp)
-        media_resp.__aexit__ = AsyncMock(return_value=None)
-
-        mock_session = MagicMock()
-        mock_session.__aenter__ = AsyncMock(return_value=mock_session)
-        mock_session.__aexit__ = AsyncMock(return_value=None)
-        mock_session.post = MagicMock(side_effect=[text_resp, media_resp])
-
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "444", "hello", media_files=[(str(img), False)])
-            )
-
-        assert result["success"] is True
-        assert result["message_id"] == "txt_ok"
-        assert "warnings" in result
-        assert any("413" in w for w in result["warnings"])
-
-    def test_no_text_no_media_returns_error(self):
-        """Empty text with no media returns error dict."""
-        mock_session, _ = self._build_mock(200)
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "555", "", media_files=[])
-            )
-
-        # Text is empty but media_files is empty, so text POST fires
-        # (the "skip text if media present" condition isn't met)
-        assert result["success"] is True
-
-    def test_multiple_media_files_uploaded_separately(self, tmp_path):
-        """Each media file gets its own multipart POST."""
-        img1 = tmp_path / "a.png"
-        img1.write_bytes(b"img1")
-        img2 = tmp_path / "b.jpg"
-        img2.write_bytes(b"img2")
-
-        mock_session, _ = self._build_mock(200, {"id": "last"})
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            result = asyncio.run(
-                _send_discord("tok", "666", "hi", media_files=[
-                    (str(img1), False), (str(img2), False)
-                ])
-            )
-
-        assert result["success"] is True
-        # 1 text POST + 2 media POSTs = 3
-        assert mock_session.post.call_count == 3
-
-
-class TestSendToPlatformDiscordMedia:
-    """_send_to_platform routes Discord media correctly."""
-
-    def test_media_files_passed_on_last_chunk_only(self):
-        """Discord media_files are only passed on the final chunk."""
-        call_log = []
-
-        async def mock_send_discord(token, chat_id, message, thread_id=None, media_files=None):
-            call_log.append({"message": message, "media_files": media_files or []})
-            return {"success": True, "platform": "discord", "chat_id": chat_id, "message_id": "1"}
-
-        # A message long enough to get chunked (Discord limit is 2000)
-        long_msg = "A" * 1900 + " " + "B" * 1900
-
-        with patch("tools.send_message_tool._send_discord", side_effect=mock_send_discord):
-            result = asyncio.run(
-                _send_to_platform(
-                    Platform.DISCORD,
-                    SimpleNamespace(enabled=True, token="tok", extra={}),
-                    "999",
-                    long_msg,
-                    media_files=[("/fake/img.png", False)],
-                )
-            )
-
-        assert result["success"] is True
-        assert len(call_log) == 2  # Message was chunked
-        assert call_log[0]["media_files"] == []  # First chunk: no media
-        assert call_log[1]["media_files"] == [("/fake/img.png", False)]  # Last chunk: media attached
-
-    def test_single_chunk_gets_media(self):
-        """Short message (single chunk) gets media_files directly."""
-        send_mock = AsyncMock(return_value={"success": True, "message_id": "1"})
-
-        with patch("tools.send_message_tool._send_discord", send_mock):
-            result = asyncio.run(
-                _send_to_platform(
-                    Platform.DISCORD,
-                    SimpleNamespace(enabled=True, token="tok", extra={}),
-                    "888",
-                    "short message",
-                    media_files=[("/fake/img.png", False)],
-                )
-            )
-
-        assert result["success"] is True
-        send_mock.assert_awaited_once()
-        call_kwargs = send_mock.await_args.kwargs
-        assert call_kwargs["media_files"] == [("/fake/img.png", False)]
-
-
-class TestSendMatrixUrlEncoding:
-    """_send_matrix URL-encodes Matrix room IDs in the API path."""
-
-    def test_room_id_is_percent_encoded_in_url(self):
-        """Matrix room IDs with ! and : are percent-encoded in the PUT URL."""
-        import aiohttp
-
-        mock_resp = MagicMock()
-        mock_resp.status = 200
-        mock_resp.json = AsyncMock(return_value={"event_id": "$evt123"})
-        mock_resp.__aenter__ = AsyncMock(return_value=mock_resp)
-        mock_resp.__aexit__ = AsyncMock(return_value=None)
-
-        mock_session = MagicMock()
-        mock_session.put = MagicMock(return_value=mock_resp)
-        mock_session.__aenter__ = AsyncMock(return_value=mock_session)
-        mock_session.__aexit__ = AsyncMock(return_value=None)
-
-        with patch("aiohttp.ClientSession", return_value=mock_session):
-            from tools.send_message_tool import _send_matrix
-            result = asyncio.get_event_loop().run_until_complete(
-                _send_matrix(
-                    "test_token",
-                    {"homeserver": "https://matrix.example.org"},
-                    "!HLOQwxYGgFPMPJUSNR:matrix.org",
-                    "hello",
-                )
-            )
-
-        assert result["success"] is True
-        # Verify the URL was called with percent-encoded room ID
-        put_url = mock_session.put.call_args[0][0]
-        assert "%21HLOQwxYGgFPMPJUSNR%3Amatrix.org" in put_url
-        assert "!HLOQwxYGgFPMPJUSNR:matrix.org" not in put_url
@@ -13,8 +13,6 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional

-from hermes_constants import display_hermes_home
-
 logger = logging.getLogger(__name__)

 # Import from cron module (will be available when properly installed)
@@ -393,8 +391,6 @@ Use action='create' to schedule a new job from a prompt or one or more skills.
 Use action='list' to inspect jobs.
 Use action='update', 'pause', 'resume', 'remove', or 'run' to manage an existing job.

-To stop a job the user no longer wants: first action='list' to find the job_id, then action='remove' with that job_id. Never guess job IDs — always list first.
-
 Jobs run in a fresh session with no current-chat context, so prompts must be self-contained.
 If skills are provided on create, the future cron run loads those skills in order, then follows the prompt as the task instruction.
 On update, passing skills=[] clears attached skills.
@@ -457,7 +453,7 @@ Important safety rule: cron-run sessions should not recursively schedule more cr
            },
            "script": {
                "type": "string",
-                "description": f"Optional path to a Python script that runs before each cron job execution. Its stdout is injected into the prompt as context. Use for data collection and change detection. Relative paths resolve under {display_hermes_home()}/scripts/. On update, pass empty string to clear."
+                "description": "Optional path to a Python script that runs before each cron job execution. Its stdout is injected into the prompt as context. Use for data collection and change detection. Relative paths resolve under ~/.hermes/scripts/. On update, pass empty string to clear."
            },
        },
        "required": ["action"]
@@ -68,7 +68,7 @@ SEND_MESSAGE_SCHEMA = {
            },
            "target": {
                "type": "string",
-                "description": "Delivery target. Format: 'platform' (uses home channel), 'platform:#channel-name', 'platform:chat_id', or 'platform:chat_id:thread_id' for Telegram topics and Discord threads. Examples: 'telegram', 'telegram:-1001234567890:17585', 'discord:999888777:555444333', 'discord:#bot-home', 'slack:#engineering', 'signal:+155****4567', 'matrix:!roomid:server.org', 'matrix:@user:server.org'"
+                "description": "Delivery target. Format: 'platform' (uses home channel), 'platform:#channel-name', 'platform:chat_id', or 'platform:chat_id:thread_id' for Telegram topics and Discord threads. Examples: 'telegram', 'telegram:-1001234567890:17585', 'discord:999888777:555444333', 'discord:#bot-home', 'slack:#engineering', 'signal:+155****4567'"
            },
            "message": {
                "type": "string",
@@ -248,9 +248,6 @@ def _parse_target_ref(platform_name: str, target_ref: str):
            return match.group(1), None, True
    if target_ref.lstrip("-").isdigit():
        return target_ref, None, True
-    # Matrix room IDs (start with !) and user IDs (start with @) are explicit
-    if platform_name == "matrix" and (target_ref.startswith("!") or target_ref.startswith("@")):
-        return target_ref, None, True
    return None, None, False


@@ -387,28 +384,11 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
    if platform == Platform.WEIXIN:
        return await _send_weixin(pconfig, chat_id, message, media_files=media_files)

-    # --- Discord: special handling for media attachments ---
-    if platform == Platform.DISCORD:
-        last_result = None
-        for i, chunk in enumerate(chunks):
-            is_last = (i == len(chunks) - 1)
-            result = await _send_discord(
-                pconfig.token,
-                chat_id,
-                chunk,
-                media_files=media_files if is_last else [],
-                thread_id=thread_id,
-            )
-            if isinstance(result, dict) and result.get("error"):
-                return result
-            last_result = result
-        return last_result
-
-    # --- Non-Telegram/Discord platforms ---
+    # --- Non-Telegram platforms ---
    if media_files and not message.strip():
        return {
            "error": (
-                f"send_message MEDIA delivery is currently only supported for telegram, discord, and weixin; "
+                f"send_message MEDIA delivery is currently only supported for telegram; "
                f"target {platform.value} had only media attachments"
            )
        }
@@ -416,12 +396,14 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
    if media_files:
        warning = (
            f"MEDIA attachments were omitted for {platform.value}; "
-            "native send_message media delivery is currently only supported for telegram, discord, and weixin"
+            "native send_message media delivery is currently only supported for telegram"
        )

    last_result = None
    for chunk in chunks:
-        if platform == Platform.SLACK:
+        if platform == Platform.DISCORD:
+            result = await _send_discord(pconfig.token, chat_id, chunk, thread_id=thread_id)
+        elif platform == Platform.SLACK:
            result = await _send_slack(pconfig.token, chat_id, chunk)
        elif platform == Platform.WHATSAPP:
            result = await _send_whatsapp(pconfig.extra, chat_id, chunk)
@@ -586,16 +568,13 @@ async def _send_telegram(token, chat_id, message, media_files=None, thread_id=No
        return _error(f"Telegram send failed: {e}")


-async def _send_discord(token, chat_id, message, thread_id=None, media_files=None):
+async def _send_discord(token, chat_id, message, thread_id=None):
    """Send a single message via Discord REST API (no websocket client needed).

    Chunking is handled by _send_to_platform() before this is called.

    When thread_id is provided, the message is sent directly to that thread
    via the /channels/{thread_id}/messages endpoint.
-
-    Media files are uploaded one-by-one via multipart/form-data after the
-    text message is sent (same pattern as Telegram).
    """
    try:
        import aiohttp
@@ -610,56 +589,14 @@ async def _send_discord(token, chat_id, message, thread_id=None, media_files=Non
            url = f"https://discord.com/api/v10/channels/{thread_id}/messages"
        else:
            url = f"https://discord.com/api/v10/channels/{chat_id}/messages"
-        auth_headers = {"Authorization": f"Bot {token}"}
-        media_files = media_files or []
-        last_data = None
-        warnings = []
-
+        headers = {"Authorization": f"Bot {token}", "Content-Type": "application/json"}
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30), **_sess_kw) as session:
-            # Send text message (skip if empty and media is present)
-            if message.strip() or not media_files:
-                headers = {**auth_headers, "Content-Type": "application/json"}
-                async with session.post(url, headers=headers, json={"content": message}, **_req_kw) as resp:
-                    if resp.status not in (200, 201):
-                        body = await resp.text()
-                        return _error(f"Discord API error ({resp.status}): {body}")
-                    last_data = await resp.json()
-
-            # Send each media file as a separate multipart upload
-            for media_path, _is_voice in media_files:
-                if not os.path.exists(media_path):
-                    warning = f"Media file not found, skipping: {media_path}"
-                    logger.warning(warning)
-                    warnings.append(warning)
-                    continue
-                try:
-                    form = aiohttp.FormData()
-                    filename = os.path.basename(media_path)
-                    with open(media_path, "rb") as f:
-                        form.add_field("files[0]", f, filename=filename)
-                        async with session.post(url, headers=auth_headers, data=form, **_req_kw) as resp:
-                            if resp.status not in (200, 201):
-                                body = await resp.text()
-                                warning = _sanitize_error_text(f"Failed to send media {media_path}: Discord API error ({resp.status}): {body}")
-                                logger.error(warning)
-                                warnings.append(warning)
-                                continue
-                            last_data = await resp.json()
-                except Exception as e:
-                    warning = _sanitize_error_text(f"Failed to send media {media_path}: {e}")
-                    logger.error(warning)
-                    warnings.append(warning)
-
-        if last_data is None:
-            error = "No deliverable text or media remained after processing"
-            if warnings:
-                return {"error": error, "warnings": warnings}
-            return {"error": error}
-
-        result = {"success": True, "platform": "discord", "chat_id": chat_id, "message_id": last_data.get("id")}
-        if warnings:
-            result["warnings"] = warnings
-        return result
+            async with session.post(url, headers=headers, json={"content": message}, **_req_kw) as resp:
+                if resp.status not in (200, 201):
+                    body = await resp.text()
+                    return _error(f"Discord API error ({resp.status}): {body}")
+                data = await resp.json()
+        return {"success": True, "platform": "discord", "chat_id": chat_id, "message_id": data.get("id")}
    except Exception as e:
        return _error(f"Discord send failed: {e}")

@@ -879,9 +816,7 @@ async def _send_matrix(token, extra, chat_id, message):
        if not homeserver or not token:
            return {"error": "Matrix not configured (MATRIX_HOMESERVER, MATRIX_ACCESS_TOKEN required)"}
        txn_id = f"hermes_{int(time.time() * 1000)}_{os.urandom(4).hex()}"
-        from urllib.parse import quote
-        encoded_room = quote(chat_id, safe="")
-        url = f"{homeserver}/_matrix/client/v3/rooms/{encoded_room}/send/m.room.message/{txn_id}"
+        url = f"{homeserver}/_matrix/client/v3/rooms/{chat_id}/send/m.room.message/{txn_id}"
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

        # Build message payload with optional HTML formatted_body.
@@ -39,7 +39,7 @@ import re
 import shutil
 import tempfile
 from pathlib import Path
-from hermes_constants import get_hermes_home, display_hermes_home
+from hermes_constants import get_hermes_home
 from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)
@@ -655,7 +655,7 @@ SKILL_MANAGE_SCHEMA = {
    "description": (
        "Manage skills (create, update, delete). Skills are your procedural "
        "memory — reusable approaches for recurring task types. "
-        f"New skills go to {display_hermes_home()}/skills/; existing skills can be modified wherever they live.\n\n"
+        "New skills go to ~/.hermes/skills/; existing skills can be modified wherever they live.\n\n"
        "Actions: create (full SKILL.md + optional category), "
        "patch (old_string/new_string — preferred for fixes), "
        "edit (full SKILL.md rewrite — major overhauls only), "
@@ -69,7 +69,7 @@ Usage:
 import json
 import logging

-from hermes_constants import get_hermes_home, display_hermes_home
+from hermes_constants import get_hermes_home
 import os
 import re
 from enum import Enum
@@ -408,7 +408,7 @@ def _gateway_setup_hint() -> str:

        return GATEWAY_SECRET_CAPTURE_UNSUPPORTED_MESSAGE
    except Exception:
-        return f"Secure secret entry is not available. Load this skill in the local CLI to be prompted, or add the key to {display_hermes_home()}/.env manually."
+        return "Secure secret entry is not available. Load this skill in the local CLI to be prompted, or add the key to ~/.hermes/.env manually."


 def _build_setup_note(
@@ -666,7 +666,7 @@ def skills_list(category: str = None, task_id: str = None) -> str:
                    "success": True,
                    "skills": [],
                    "categories": [],
-                    "message": f"No skills found. Skills directory created at {display_hermes_home()}/skills/",
+                    "message": "No skills found. Skills directory created at ~/.hermes/skills/",
                },
                ensure_ascii=False,
            )
@@ -40,8 +40,6 @@ from pathlib import Path
 from typing import Callable, Dict, Any, Optional
 from urllib.parse import urljoin

-from hermes_constants import display_hermes_home
-
 logger = logging.getLogger(__name__)
 from tools.managed_tool_gateway import resolve_managed_tool_gateway
 from tools.tool_backend_helpers import managed_nous_tools_enabled, resolve_openai_audio_api_key
@@ -1052,7 +1050,7 @@ TTS_SCHEMA = {
            },
            "output_path": {
                "type": "string",
-                "description": f"Optional custom file path to save the audio. Defaults to {display_hermes_home()}/audio_cache/<timestamp>.mp3"
+                "description": "Optional custom file path to save the audio. Defaults to ~/.hermes/audio_cache/<timestamp>.mp3"
            }
        },
        "required": ["text"]
@@ -112,11 +112,14 @@ export const en: Translations = {
    totalTokens: "Total Tokens",
    totalSessions: "Total Sessions",
    apiCalls: "API Calls",
+    cacheHitRate: "Cache Hit Rate",
    dailyTokenUsage: "Daily Token Usage",
    dailyBreakdown: "Daily Breakdown",
    perModelBreakdown: "Per-Model Breakdown",
+    prompt: "Prompt",
    input: "Input",
    output: "Output",
+    cached: "cached",
    total: "Total",
    noUsageData: "No usage data for this period",
    startSession: "Start a session to see analytics here",
@@ -125,7 +128,6 @@ export const en: Translations = {
    tokens: "Tokens",
    perDayAvg: "/day avg",
    acrossModels: "across {count} models",
-    inOut: "{input} in / {output} out",
  },

  logs: {
@@ -117,11 +117,14 @@ export interface Translations {
    totalTokens: string;
    totalSessions: string;
    apiCalls: string;
+    cacheHitRate: string;
    dailyTokenUsage: string;
    dailyBreakdown: string;
    perModelBreakdown: string;
+    prompt: string;
    input: string;
    output: string;
+    cached: string;
    total: string;
    noUsageData: string;
    startSession: string;
@@ -130,7 +133,6 @@ export interface Translations {
    tokens: string;
    perDayAvg: string;
    acrossModels: string;
-    inOut: string;
  };

  // ── Logs page ──
@@ -112,11 +112,14 @@ export const zh: Translations = {
    totalTokens: "总 Token 数",
    totalSessions: "总会话数",
    apiCalls: "API 调用",
+    cacheHitRate: "缓存命中率",
    dailyTokenUsage: "每日 Token 用量",
    dailyBreakdown: "每日明细",
    perModelBreakdown: "模型用量明细",
+    prompt: "提示",
    input: "输入",
    output: "输出",
+    cached: "已缓存",
    total: "总计",
    noUsageData: "该时间段暂无使用数据",
    startSession: "开始会话后将在此显示分析数据",
@@ -125,7 +128,6 @@ export const zh: Translations = {
    tokens: "Token",
    perDayAvg: "/天 平均",
    acrossModels: "共 {count} 个模型",
-    inOut: "输入 {input} / 输出 {output}",
  },

  logs: {
@@ -269,18 +269,23 @@ export interface AnalyticsDailyEntry {
  input_tokens: number;
  output_tokens: number;
  cache_read_tokens: number;
+  cache_write_tokens: number;
  reasoning_tokens: number;
  estimated_cost: number;
  actual_cost: number;
  sessions: number;
+  api_calls: number;
 }

 export interface AnalyticsModelEntry {
  model: string;
  input_tokens: number;
  output_tokens: number;
+  cache_read_tokens: number;
+  cache_write_tokens: number;
  estimated_cost: number;
  sessions: number;
+  api_calls: number;
 }

 export interface AnalyticsResponse {
@@ -290,10 +295,12 @@ export interface AnalyticsResponse {
    total_input: number;
    total_output: number;
    total_cache_read: number;
+    total_cache_write: number;
    total_reasoning: number;
    total_estimated_cost: number;
    total_actual_cost: number;
    total_sessions: number;
+    total_api_calls: number;
  };
 }

@@ -4,6 +4,7 @@ import {
  Cpu,
  Hash,
  TrendingUp,
+  Zap,
 } from "lucide-react";
 import { api } from "@/lib/api";
 import type { AnalyticsResponse, AnalyticsDailyEntry, AnalyticsModelEntry } from "@/lib/api";
@@ -19,6 +20,11 @@ const PERIODS = [

 const CHART_HEIGHT_PX = 160;

+/** Sum an entry's prompt-side tokens (input + cache_read + cache_write); absent cache fields count as 0. */
+function getPromptTokens(d: { input_tokens: number; cache_read_tokens?: number; cache_write_tokens?: number }): number {
+  return d.input_tokens + (d.cache_read_tokens ?? 0) + (d.cache_write_tokens ?? 0);
+}
+
 function formatTokens(n: number): string {
  if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`;
  if (n >= 1_000) return `${(n / 1_000).toFixed(1)}K`;
@@ -63,7 +69,7 @@ function TokenBarChart({ daily }: { daily: AnalyticsDailyEntry[] }) {
  const { t } = useI18n();
  if (daily.length === 0) return null;

-  const maxTokens = Math.max(...daily.map((d) => d.input_tokens + d.output_tokens), 1);
+  const maxTokens = Math.max(...daily.map((d) => getPromptTokens(d) + d.output_tokens), 1);

  return (
    <Card>
@@ -75,7 +81,7 @@ function TokenBarChart({ daily }: { daily: AnalyticsDailyEntry[] }) {
          <div className="flex items-center gap-4 text-xs text-muted-foreground">
          <div className="flex items-center gap-1.5">
            <div className="h-2.5 w-2.5 bg-[#ffe6cb]" />
-            {t.analytics.input}
+            {t.analytics.prompt}
          </div>
          <div className="flex items-center gap-1.5">
            <div className="h-2.5 w-2.5 bg-emerald-500" />
@@ -86,8 +92,9 @@ function TokenBarChart({ daily }: { daily: AnalyticsDailyEntry[] }) {
      <CardContent>
        <div className="flex items-end gap-[2px]" style={{ height: CHART_HEIGHT_PX }}>
          {daily.map((d) => {
-            const total = d.input_tokens + d.output_tokens;
-            const inputH = Math.round((d.input_tokens / maxTokens) * CHART_HEIGHT_PX);
+            const promptTokens = getPromptTokens(d);
+            const total = promptTokens + d.output_tokens;
+            const inputH = Math.round((promptTokens / maxTokens) * CHART_HEIGHT_PX);
            const outputH = Math.round((d.output_tokens / maxTokens) * CHART_HEIGHT_PX);
            return (
              <div
@@ -99,7 +106,7 @@ function TokenBarChart({ daily }: { daily: AnalyticsDailyEntry[] }) {
                <div className="absolute bottom-full left-1/2 -translate-x-1/2 mb-2 hidden group-hover:block z-10 pointer-events-none">
                  <div className="bg-card border border-border px-2.5 py-1.5 text-[10px] text-foreground shadow-lg whitespace-nowrap">
                    <div className="font-medium">{formatDate(d.day)}</div>
-                    <div>{t.analytics.input}: {formatTokens(d.input_tokens)}</div>
+                    <div>{t.analytics.prompt}: {formatTokens(promptTokens)}</div>
                    <div>{t.analytics.output}: {formatTokens(d.output_tokens)}</div>
                    <div>{t.analytics.total}: {formatTokens(total)}</div>
                  </div>
@@ -152,18 +159,19 @@ function DailyTable({ daily }: { daily: AnalyticsDailyEntry[] }) {
              <tr className="border-b border-border text-muted-foreground text-xs">
                <th className="text-left py-2 pr-4 font-medium">{t.analytics.date}</th>
                <th className="text-right py-2 px-4 font-medium">{t.sessions.title}</th>
-                <th className="text-right py-2 px-4 font-medium">{t.analytics.input}</th>
+                <th className="text-right py-2 px-4 font-medium">{t.analytics.prompt}</th>
                <th className="text-right py-2 pl-4 font-medium">{t.analytics.output}</th>
              </tr>
            </thead>
            <tbody>
              {sorted.map((d) => {
+                const promptTokens = getPromptTokens(d);
                return (
                  <tr key={d.day} className="border-b border-border/50 hover:bg-secondary/20 transition-colors">
                    <td className="py-2 pr-4 font-medium">{formatDate(d.day)}</td>
                    <td className="text-right py-2 px-4 text-muted-foreground">{d.sessions}</td>
                    <td className="text-right py-2 px-4">
-                      <span className="text-[#ffe6cb]">{formatTokens(d.input_tokens)}</span>
+                      <span className="text-[#ffe6cb]">{formatTokens(promptTokens)}</span>
                    </td>
                    <td className="text-right py-2 pl-4">
                      <span className="text-emerald-400">{formatTokens(d.output_tokens)}</span>
@@ -184,7 +192,7 @@ function ModelTable({ models }: { models: AnalyticsModelEntry[] }) {
  if (models.length === 0) return null;

  const sorted = [...models].sort(
-    (a, b) => b.input_tokens + b.output_tokens - (a.input_tokens + a.output_tokens),
+    (a, b) => (getPromptTokens(b) + b.output_tokens) - (getPromptTokens(a) + a.output_tokens),
  );

  return (
@@ -213,7 +221,7 @@ function ModelTable({ models }: { models: AnalyticsModelEntry[] }) {
                  </td>
                  <td className="text-right py-2 px-4 text-muted-foreground">{m.sessions}</td>
                  <td className="text-right py-2 pl-4">
-                    <span className="text-[#ffe6cb]">{formatTokens(m.input_tokens)}</span>
+                    <span className="text-[#ffe6cb]">{formatTokens(getPromptTokens(m))}</span>
                    {" / "}
                    <span className="text-emerald-400">{formatTokens(m.output_tokens)}</span>
                  </td>
@@ -283,12 +291,17 @@ export default function AnalyticsPage() {
      {data && (
        <>
          {/* Summary cards */}
-          <div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-3">
+          <div className="grid gap-4 sm:grid-cols-2 lg:grid-cols-4">
            <SummaryCard
              icon={Hash}
              label={t.analytics.totalTokens}
-              value={formatTokens(data.totals.total_input + data.totals.total_output)}
-              sub={t.analytics.inOut.replace("{input}", formatTokens(data.totals.total_input)).replace("{output}", formatTokens(data.totals.total_output))}
+              value={formatTokens(
+                (data.totals.total_input ?? 0) +
+                (data.totals.total_cache_read ?? 0) +
+                (data.totals.total_cache_write ?? 0) +
+                (data.totals.total_output ?? 0)
+              )}
+              sub={`${formatTokens((data.totals.total_input ?? 0) + (data.totals.total_cache_read ?? 0) + (data.totals.total_cache_write ?? 0))} ${t.analytics.prompt} / ${formatTokens(data.totals.total_output ?? 0)} ${t.analytics.output.toLowerCase()}`}
            />
            <SummaryCard
              icon={BarChart3}
@@ -297,11 +310,25 @@ export default function AnalyticsPage() {
              sub={`~${(data.totals.total_sessions / days).toFixed(1)}${t.analytics.perDayAvg}`}
            />
            <SummaryCard
-              icon={TrendingUp}
+              icon={Zap}
              label={t.analytics.apiCalls}
-              value={String(data.daily.reduce((sum, d) => sum + d.sessions, 0))}
+              value={String(data.totals.total_api_calls ?? data.daily.reduce((sum, d) => sum + d.sessions, 0))}
              sub={t.analytics.acrossModels.replace("{count}", String(data.by_model.length))}
            />
+            {(() => {
+              const promptSent = (data.totals.total_input ?? 0) + (data.totals.total_cache_read ?? 0);
+              const rate = promptSent > 0
+                ? `${((data.totals.total_cache_read ?? 0) / promptSent * 100).toFixed(0)}%`
+                : "—";
+              return (
+                <SummaryCard
+                  icon={TrendingUp}
+                  label={t.analytics.cacheHitRate}
+                  value={rate}
+                  sub={`${formatTokens(data.totals.total_cache_read ?? 0)} ${t.analytics.cached}`}
+                />
+              );
+            })()}
          </div>

          {/* Bar chart */}
@@ -49,17 +49,6 @@ The OpenAI Codex provider authenticates via device code (open a URL, enter a cod
 Even when using Nous Portal, Codex, or a custom endpoint, some tools (vision, web summarization, MoA) use a separate "auxiliary" model — by default Gemini Flash via OpenRouter. An `OPENROUTER_API_KEY` enables these tools automatically. You can also configure which model and provider these tools use — see [Auxiliary Models](/docs/user-guide/configuration#auxiliary-models).
 :::

-### Two Commands for Model Management
-
-Hermes has **two** model commands that serve different purposes:
-
-| Command | Where to run | What it does |
-|---------|-------------|--------------|
-| **`hermes model`** | Your terminal (outside any session) | Full setup wizard — add providers, run OAuth, enter API keys, configure endpoints |
-| **`/model`** | Inside a Hermes chat session | Quick switch between **already-configured** providers and models |
-
-If you're trying to switch to a provider you haven't set up yet (e.g. you only have OpenRouter configured and want to use Anthropic), you need `hermes model`, not `/model`. Exit your session first (`Ctrl+C` or `/quit`), run `hermes model`, complete the provider setup, then start a new session.
-
 ### Anthropic (Native)

 Use Claude models directly through the Anthropic API — no OpenRouter proxy needed. Supports three auth methods:
@@ -263,15 +252,7 @@ Both approaches persist to `config.yaml`, which is the source of truth for model

 ### Switching Models with `/model`

-:::warning hermes model vs /model
-**`hermes model`** (run from your terminal, outside any chat session) is the **full provider setup wizard**. Use it to add new providers, run OAuth flows, enter API keys, and configure custom endpoints.
-
-**`/model`** (typed inside an active Hermes chat session) can only **switch between providers and models you've already set up**. It cannot add new providers, run OAuth, or prompt for API keys. If you've only configured one provider (e.g. OpenRouter), `/model` will only show models for that provider.
-
-**To add a new provider:** Exit your session (`Ctrl+C` or `/quit`), run `hermes model`, set up the new provider, then start a new session.
-:::
-
-Once you have at least one custom endpoint configured, you can switch models mid-session:
+Once a custom endpoint is configured, you can switch models mid-session:

 ```
 /model custom:qwen-2.5          # Switch to a model on your custom endpoint
@@ -109,31 +109,22 @@ hermes chat --worktree -q "Review this repo and open a PR"

 ## `hermes model`

-Interactive provider + model selector. **This is the command for adding new providers, setting up API keys, and running OAuth flows.** Run it from your terminal — not from inside an active Hermes chat session.
+Interactive provider + model selector.

 ```bash
 hermes model
 ```

 Use this when you want to:
- **add a new provider** (OpenRouter, Anthropic, Copilot, DeepSeek, custom, etc.)
- log into OAuth-backed providers (Anthropic, Copilot, Codex, Nous Portal)
- enter or update API keys
+- switch default providers
+- log in to OAuth-backed providers during model selection
 - pick from provider-specific model lists
 - configure a custom/self-hosted endpoint
 - save the new default into config

-:::warning hermes model vs /model — know the difference
-**`hermes model`** (run from your terminal, outside any Hermes session) is the **full provider setup wizard**. It can add new providers, run OAuth flows, prompt for API keys, and configure endpoints.
-
-**`/model`** (typed inside an active Hermes chat session) can only **switch between providers and models you've already set up**. It cannot add new providers, run OAuth, or prompt for API keys.
-
-**If you need to add a new provider:** Exit your Hermes session first (`Ctrl+C` or `/quit`), then run `hermes model` from your terminal prompt.
-:::
-
 ### `/model` slash command (mid-session)

-Switch between already-configured models without leaving a session:
+Switch models without leaving a session:

 ```
 /model                              # Show current model and available options
@@ -145,16 +136,6 @@ Switch between already-configured models without leaving a session:
 /model openrouter:anthropic/claude-sonnet-4  # Switch back to cloud
 ```

-By default, `/model` changes apply **to the current session only**. Add `--global` to persist the change to `config.yaml`:
-
-```
-/model claude-sonnet-4 --global     # Switch and save as new default
-```
-
-:::info What if I only see OpenRouter models?
-If you've only configured OpenRouter, `/model` will only show OpenRouter models. To add another provider (Anthropic, DeepSeek, Copilot, etc.), exit your session and run `hermes model` from the terminal.
-:::
-
 Provider and base URL changes are persisted to `config.yaml` automatically. When switching away from a custom endpoint, the stale base URL is cleared to prevent it leaking into other providers.

 ## `hermes gateway`
@@ -187,32 +187,6 @@ curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scri

 ### Provider & Model Issues

-#### `/model` only shows one provider / can't switch providers
-
-**Cause:** `/model` (inside a chat session) can only switch between providers you've **already configured**. If you've only set up OpenRouter, that's all `/model` will show.
-
-**Solution:** Exit your session and use `hermes model` from your terminal to add new providers:
-
-```bash
-# Exit the Hermes chat session first (Ctrl+C or /quit)
-
-# Run the full provider setup wizard
-hermes model
-
-# This lets you: add providers, run OAuth, enter API keys, configure endpoints
-```
-
-After adding a new provider via `hermes model`, start a new chat session — `/model` will now show all your configured providers.
-
-:::tip Quick reference
-| Want to... | Use |
-|-----------|-----|
-| Add a new provider | `hermes model` (from terminal) |
-| Enter/change API keys | `hermes model` (from terminal) |
-| Switch model mid-session | `/model <name>` (inside session) |
-| Switch to different configured provider | `/model provider:model` (inside session) |
-:::
-
 #### API key not working

 **Cause:** Key is missing, expired, incorrectly set, or for the wrong provider.
@@ -46,7 +46,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in
 | Command | Description |
 |---------|-------------|
 | `/config` | Show current configuration |
-| `/model [model-name]` | Show or change the current model. Supports: `/model claude-sonnet-4`, `/model provider:model` (switch providers), `/model custom:model` (custom endpoint), `/model custom:name:model` (named custom provider), `/model custom` (auto-detect from endpoint). Use `--global` to persist the change to config.yaml. **Note:** `/model` can only switch between already-configured providers. To add a new provider, exit the session and run `hermes model` from your terminal. |
+| `/model [model-name]` | Show or change the current model. Supports: `/model claude-sonnet-4`, `/model provider:model` (switch providers), `/model custom:model` (custom endpoint), `/model custom:name:model` (named custom provider), `/model custom` (auto-detect from endpoint). Use `--global` to persist the change to config.yaml. |
 | `/provider` | Show available providers and current provider |
 | `/personality` | Set a predefined personality |
 | `/verbose` | Cycle tool progress display: off → new → all → verbose. Can be [enabled for messaging](#notes) via config. |
@@ -124,7 +124,7 @@ The messaging gateway supports the following built-in commands inside Telegram,
 | `/reset` | Reset conversation history. |
 | `/status` | Show session info. |
 | `/stop` | Kill all running background processes and interrupt the running agent. |
-| `/model [provider:model]` | Show or change the model. Supports provider switches (`/model zai:glm-5`), custom endpoints (`/model custom:model`), named custom providers (`/model custom:local:qwen`), and auto-detect (`/model custom`). Use `--global` to persist the change to config.yaml. **Note:** `/model` can only switch between already-configured providers. To add a new provider or set up API keys, use `hermes model` from your terminal (outside the chat session). |
+| `/model [provider:model]` | Show or change the model. Supports provider switches (`/model zai:glm-5`), custom endpoints (`/model custom:model`), named custom providers (`/model custom:local:qwen`), and auto-detect (`/model custom`). Use `--global` to persist the change to config.yaml. |
 | `/provider` | Show provider availability and auth status. |
 | `/personality [name]` | Set a personality overlay for the session. |
 | `/fast [normal\|fast\|status]` | Toggle fast mode — OpenAI Priority Processing / Anthropic Fast Mode. |
@@ -119,7 +119,6 @@ const sidebars: SidebarsConfig = {
        'user-guide/messaging/wecom-callback',
        'user-guide/messaging/weixin',
        'user-guide/messaging/bluebubbles',
-        'user-guide/messaging/qqbot',
        'user-guide/messaging/open-webui',
        'user-guide/messaging/webhooks',
      ],
@@ -154,7 +153,6 @@ const sidebars: SidebarsConfig = {
        'guides/use-voice-mode-with-hermes',
        'guides/build-a-hermes-plugin',
        'guides/automate-with-cron',
-        'guides/automation-templates',
        'guides/cron-troubleshooting',
        'guides/work-with-skills',
        'guides/delegation-patterns',