Compare commits

..

4 Commits

Author SHA1 Message Date
Jai Suphavadeeprasit
c2d5a28d15 Modularize frontend 2025-10-13 11:53:13 -04:00
Jai Suphavadeeprasit
bb5eab2645 logging work 2025-10-13 10:44:42 -04:00
Jai Suphavadeeprasit
6313c9879f changes 2025-10-11 17:52:23 -04:00
Jai Suphavadeeprasit
e698b7e0e5 changes 2025-10-10 18:04:22 -04:00
344 changed files with 7853 additions and 163542 deletions

View File

@@ -1,222 +0,0 @@
# Hermes Agent Environment Configuration
# Copy this file to .env and fill in your API keys
# =============================================================================
# LLM PROVIDER (OpenRouter)
# =============================================================================
# OpenRouter provides access to many models through one API
# All LLM calls go through OpenRouter - no direct provider keys needed
# Get your key at: https://openrouter.ai/keys
OPENROUTER_API_KEY=
# Default model to use (OpenRouter format: provider/model)
# Examples: anthropic/claude-opus-4.6, openai/gpt-4o, google/gemini-2.0-flash, zhipuai/glm-4-plus
LLM_MODEL=anthropic/claude-opus-4.6
# =============================================================================
# TOOL API KEYS
# =============================================================================
# Firecrawl API Key - Web search, extract, and crawl
# Get at: https://firecrawl.dev/
FIRECRAWL_API_KEY=
# Nous Research API Key - Vision analysis and multi-model reasoning
# Get at: https://inference-api.nousresearch.com/
NOUS_API_KEY=
# FAL.ai API Key - Image generation
# Get at: https://fal.ai/
FAL_KEY=
# =============================================================================
# TERMINAL TOOL CONFIGURATION (mini-swe-agent backend)
# =============================================================================
# Backend type: "local", "singularity", "docker", "modal", or "ssh"
# - local: Runs directly on your machine (fastest, no isolation)
# - ssh: Runs on remote server via SSH (great for sandboxing - agent can't touch its own code)
# - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed)
# - docker: Runs in Docker containers (isolated, requires Docker + docker group)
# - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account)
TERMINAL_ENV=local
# Container images (for singularity/docker/modal backends)
TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
# Working directory for terminal commands
# For local backend: "." means current directory (resolved automatically)
# For remote backends (ssh/docker/modal/singularity): use an absolute path
# INSIDE the target environment, or leave unset for the backend's default
# (/root for modal, / for docker, ~ for ssh). Do NOT use a host-local path.
# Usually managed by config.yaml (terminal.cwd) — uncomment to override
# TERMINAL_CWD=.
# Default command timeout in seconds
TERMINAL_TIMEOUT=60
# Cleanup inactive environments after this many seconds
TERMINAL_LIFETIME_SECONDS=300
# =============================================================================
# SSH REMOTE EXECUTION (for TERMINAL_ENV=ssh)
# =============================================================================
# Run terminal commands on a remote server via SSH.
# Agent code stays on your machine, commands execute remotely.
#
# SECURITY BENEFITS:
# - Agent cannot read your .env file (API keys protected)
# - Agent cannot modify its own code
# - Remote server acts as isolated sandbox
# - Can safely configure passwordless sudo on remote
#
# TERMINAL_SSH_HOST=192.168.1.100
# TERMINAL_SSH_USER=agent
# TERMINAL_SSH_PORT=22
# TERMINAL_SSH_KEY=~/.ssh/id_rsa
# =============================================================================
# SUDO SUPPORT (works with ALL terminal backends)
# =============================================================================
# If set, enables sudo commands by piping password via `sudo -S`.
# Works with: local, docker, singularity, modal, and ssh backends.
#
# SECURITY WARNING: Password stored in plaintext. Only use on trusted machines.
#
# ALTERNATIVES:
# - For SSH backend: Configure passwordless sudo on the remote server
# - For containers: Run as root inside the container (no sudo needed)
# - For local: Configure /etc/sudoers for specific commands
# - For CLI: Leave unset - you'll be prompted interactively with 45s timeout
#
# SUDO_PASSWORD=your_password_here
# =============================================================================
# MODAL CLOUD BACKEND (Optional - for TERMINAL_ENV=modal)
# =============================================================================
# Modal uses CLI authentication, not environment variables.
# Run: pip install modal && modal setup
# This will authenticate via browser and store credentials locally.
# No API key needed in .env - Modal handles auth automatically.
# =============================================================================
# BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
# =============================================================================
# Browser automation requires Browserbase cloud service for remote browser execution.
# This allows the agent to navigate websites, fill forms, and extract information.
#
# STEALTH MODES:
# - Basic Stealth: ALWAYS active (random fingerprints, auto CAPTCHA solving)
# - Advanced Stealth: Requires BROWSERBASE_ADVANCED_STEALTH=true (Scale Plan only)
# Browserbase API Key - Cloud browser execution
# Get at: https://browserbase.com/
BROWSERBASE_API_KEY=
# Browserbase Project ID - From your Browserbase dashboard
BROWSERBASE_PROJECT_ID=
# Enable residential proxies for better CAPTCHA solving (default: true)
# Routes traffic through residential IPs, significantly improves success rate
BROWSERBASE_PROXIES=true
# Enable advanced stealth mode (default: false, requires Scale Plan)
# Uses custom Chromium build to avoid bot detection altogether
BROWSERBASE_ADVANCED_STEALTH=false
# Browser session timeout in seconds (default: 300)
# Sessions are cleaned up after this duration of inactivity
BROWSER_SESSION_TIMEOUT=300
# Browser inactivity timeout - auto-cleanup inactive sessions (default: 120 = 2 min)
# Browser sessions are automatically closed after this period of no activity
BROWSER_INACTIVITY_TIMEOUT=120
# =============================================================================
# SESSION LOGGING
# =============================================================================
# Session trajectories are automatically saved to logs/ directory
# Format: logs/session_YYYYMMDD_HHMMSS_UUID.json
# Contains full conversation history in trajectory format for debugging/replay
# =============================================================================
# VOICE TRANSCRIPTION & OPENAI TTS
# =============================================================================
# Required for voice message transcription (Whisper) and OpenAI TTS voices.
# Uses OpenAI's API directly (not via OpenRouter).
# Named HERMES_OPENAI_API_KEY to avoid interference with OpenRouter.
# Get at: https://platform.openai.com/api-keys
HERMES_OPENAI_API_KEY=
# =============================================================================
# SLACK INTEGRATION
# =============================================================================
# Slack Bot Token - From Slack App settings (OAuth & Permissions)
# Get at: https://api.slack.com/apps
# SLACK_BOT_TOKEN=xoxb-...
# Slack App Token - For Socket Mode (App-Level Tokens in Slack App settings)
# SLACK_APP_TOKEN=xapp-...
# Slack allowed users (comma-separated Slack user IDs)
# SLACK_ALLOWED_USERS=
# =============================================================================
# RESPONSE PACING
# =============================================================================
# Human-like delays between message chunks on messaging platforms.
# Makes the bot feel less robotic.
# HERMES_HUMAN_DELAY_MODE=off # off | natural | custom
# HERMES_HUMAN_DELAY_MIN_MS=800 # Min delay in ms (custom mode)
# HERMES_HUMAN_DELAY_MAX_MS=2500 # Max delay in ms (custom mode)
# =============================================================================
# LEGACY/OPTIONAL API KEYS
# =============================================================================
# Morph API Key - For legacy Hecate terminal backend (terminal-hecate tool)
# Get at: https://morph.so/
MORPH_API_KEY=
# Hecate VM Settings (only if using terminal-hecate tool)
HECATE_VM_LIFETIME_SECONDS=300
HECATE_DEFAULT_SNAPSHOT_ID=snapshot_p5294qxt
# =============================================================================
# DEBUG OPTIONS
# =============================================================================
WEB_TOOLS_DEBUG=false
VISION_TOOLS_DEBUG=false
MOA_TOOLS_DEBUG=false
IMAGE_TOOLS_DEBUG=false
# =============================================================================
# CONTEXT COMPRESSION (Auto-shrinks long conversations)
# =============================================================================
# When conversation approaches model's context limit, middle turns are
# automatically summarized to free up space.
#
# CONTEXT_COMPRESSION_ENABLED=true # Enable auto-compression (default: true)
# CONTEXT_COMPRESSION_THRESHOLD=0.85 # Compress at 85% of context limit
# CONTEXT_COMPRESSION_MODEL=google/gemini-2.0-flash-001 # Fast model for summaries
# =============================================================================
# RL TRAINING (Tinker + Atropos)
# =============================================================================
# Run reinforcement learning training on language models using the Tinker API.
# Requires the rl-server to be running (from tinker-atropos package).
# Tinker API Key - RL training service
# Get at: https://tinker-console.thinkingmachines.ai/keys
TINKER_API_KEY=
# Weights & Biases API Key - Experiment tracking and metrics
# Get at: https://wandb.ai/authorize
WANDB_API_KEY=
# RL API Server URL (default: http://localhost:8080)
# Change if running the rl-server on a different host/port
# RL_API_URL=http://localhost:8080

31
.gitignore vendored
View File

@@ -16,33 +16,4 @@ __pycache__/
export*
__pycache__/model_tools.cpython-310.pyc
__pycache__/web_tools.cpython-310.pyc
logs/
data/
.pytest_cache/
tmp/
temp_vision_images/
hermes-*/*
examples/
tests/quick_test_dataset.jsonl
tests/sample_dataset.jsonl
run_datagen_kimik2-thinking.sh
run_datagen_megascience_glm4-6.sh
run_datagen_sonnet.sh
source-data/*
run_datagen_megascience_glm4-6.sh
data/*
node_modules/
browser-use/
agent-browser/
# Private keys
*.ppk
*.pem
privvy*
images/
__pycache__/
hermes_agent.egg-info/
wandb/
testlogs
# CLI config (may contain sensitive SSH paths)
cli-config.yaml
logs/

6
.gitmodules vendored
View File

@@ -1,6 +0,0 @@
[submodule "mini-swe-agent"]
path = mini-swe-agent
url = https://github.com/SWE-agent/mini-swe-agent
[submodule "tinker-atropos"]
path = tinker-atropos
url = https://github.com/nousresearch/tinker-atropos

609
AGENTS.md
View File

@@ -1,609 +0,0 @@
# Hermes Agent - Development Guide
Instructions for AI coding assistants (GitHub Copilot, Cursor, etc.) and human developers.
Hermes-Agent is an AI agent harness with tool-calling capabilities, interactive CLI, messaging integrations, and scheduled tasks.
## Development Environment
**IMPORTANT**: Always use the virtual environment if it exists:
```bash
source venv/bin/activate # Before running any Python commands
```
## Project Structure
```
hermes-agent/
├── hermes_cli/ # Unified CLI commands
│ ├── main.py # Entry point, command dispatcher
│ ├── setup.py # Interactive setup wizard
│ ├── config.py # Config management & migration
│ ├── status.py # Status display
│ ├── doctor.py # Diagnostics
│ ├── gateway.py # Gateway management
│ ├── uninstall.py # Uninstaller
│ └── cron.py # Cron job management
├── tools/ # Tool implementations
│ ├── process_registry.py # Background process management (spawn, poll, wait, kill)
│ ├── transcription_tools.py # Speech-to-text (Whisper API)
├── gateway/ # Messaging platform adapters
│ ├── pairing.py # DM pairing code system
│ ├── hooks.py # Event hook system
│ ├── sticker_cache.py # Telegram sticker vision cache
│ ├── platforms/
│ │ └── slack.py # Slack adapter (slack-bolt)
├── cron/ # Scheduler implementation
├── skills/ # Knowledge documents
├── cli.py # Interactive CLI (Rich UI)
├── run_agent.py # Agent runner with AIAgent class
├── model_tools.py # Tool schemas and handlers
├── toolsets.py # Tool groupings
├── toolset_distributions.py # Probability-based tool selection
└── batch_runner.py # Parallel batch processing
```
**User Configuration** (stored in `~/.hermes/`):
- `~/.hermes/config.yaml` - Settings (model, terminal, toolsets, etc.)
- `~/.hermes/.env` - API keys and secrets
- `~/.hermes/pairing/` - DM pairing data
- `~/.hermes/hooks/` - Custom event hooks
- `~/.hermes/image_cache/` - Cached user images
- `~/.hermes/audio_cache/` - Cached user voice messages
- `~/.hermes/sticker_cache.json` - Telegram sticker descriptions
## File Dependency Chain
```
tools/*.py → tools/__init__.py → model_tools.py → toolsets.py → toolset_distributions.py
run_agent.py ──────────────────────────┘
cli.py → run_agent.py (uses AIAgent with quiet_mode=True)
batch_runner.py → run_agent.py + toolset_distributions.py
```
Always ensure consistency between tools, model_tools.py, and toolsets.py when changing any of them.
---
## AIAgent Class
The main agent is implemented in `run_agent.py`:
```python
class AIAgent:
def __init__(
self,
model: str = "anthropic/claude-sonnet-4",
api_key: str = None,
base_url: str = "https://openrouter.ai/api/v1",
max_iterations: int = 60, # Max tool-calling loops
enabled_toolsets: list = None,
disabled_toolsets: list = None,
verbose_logging: bool = False,
quiet_mode: bool = False, # Suppress progress output
tool_progress_callback: callable = None, # Called on each tool use
):
# Initialize OpenAI client, load tools based on toolsets
...
def chat(self, user_message: str, task_id: str = None) -> str:
# Main entry point - runs the agent loop
...
```
### Agent Loop
The core loop in `_run_agent_loop()`:
```
1. Add user message to conversation
2. Call LLM with tools
3. If LLM returns tool calls:
- Execute each tool
- Add tool results to conversation
- Go to step 2
4. If LLM returns text response:
- Return response to user
```
```python
while turns < max_turns:
response = client.chat.completions.create(
model=model,
messages=messages,
tools=tool_schemas,
)
if response.tool_calls:
for tool_call in response.tool_calls:
result = await execute_tool(tool_call)
messages.append(tool_result_message(result))
turns += 1
else:
return response.content
```
### Conversation Management
Messages are stored as a list of dicts following OpenAI format:
```python
messages = [
{"role": "system", "content": "You are a helpful assistant..."},
{"role": "user", "content": "Search for Python tutorials"},
{"role": "assistant", "content": None, "tool_calls": [...]},
{"role": "tool", "tool_call_id": "...", "content": "..."},
{"role": "assistant", "content": "Here's what I found..."},
]
```
### Reasoning Model Support
For models that support chain-of-thought reasoning:
- Extract `reasoning_content` from API responses
- Store in `assistant_msg["reasoning"]` for trajectory export
- Pass back via `reasoning_content` field on subsequent turns
---
## CLI Architecture (cli.py)
The interactive CLI uses:
- **Rich** - For the welcome banner and styled panels
- **prompt_toolkit** - For fixed input area with history and `patch_stdout`
- **KawaiiSpinner** (in run_agent.py) - Animated feedback during API calls and tool execution
Key components:
- `HermesCLI` class - Main CLI controller with commands and conversation loop
- `load_cli_config()` - Loads config, sets environment variables for terminal
- `build_welcome_banner()` - Displays ASCII art logo, tools, and skills summary
- `/commands` - Process user commands like `/help`, `/clear`, `/personality`, etc.
CLI uses `quiet_mode=True` when creating AIAgent to suppress verbose logging.
### Adding CLI Commands
1. Add to `COMMANDS` dict with description
2. Add handler in `process_command()` method
3. For persistent settings, use `save_config_value()` to update config
---
## Hermes CLI Commands
The unified `hermes` command provides all functionality:
| Command | Description |
|---------|-------------|
| `hermes` | Interactive chat (default) |
| `hermes chat -q "..."` | Single query mode |
| `hermes setup` | Configure API keys and settings |
| `hermes config` | View current configuration |
| `hermes config edit` | Open config in editor |
| `hermes config set KEY VAL` | Set a specific value |
| `hermes config check` | Check for missing config |
| `hermes config migrate` | Prompt for missing config interactively |
| `hermes status` | Show configuration status |
| `hermes doctor` | Diagnose issues |
| `hermes update` | Update to latest (checks for new config) |
| `hermes uninstall` | Uninstall (can keep configs for reinstall) |
| `hermes gateway` | Start messaging gateway |
| `hermes cron list` | View scheduled jobs |
| `hermes version` | Show version info |
| `hermes pairing list/approve/revoke` | Manage DM pairing codes |
---
## Messaging Gateway
The gateway connects Hermes to Telegram, Discord, and WhatsApp.
### Configuration (in `~/.hermes/.env`):
```bash
# Telegram
TELEGRAM_BOT_TOKEN=123456:ABC-DEF... # From @BotFather
TELEGRAM_ALLOWED_USERS=123456789,987654 # Comma-separated user IDs (from @userinfobot)
# Discord
DISCORD_BOT_TOKEN=MTIz... # From Developer Portal
DISCORD_ALLOWED_USERS=123456789012345678 # Comma-separated user IDs
# Agent Behavior
HERMES_MAX_ITERATIONS=60 # Max tool-calling iterations
MESSAGING_CWD=/home/myuser # Terminal working directory for messaging
# Tool Progress (optional)
HERMES_TOOL_PROGRESS=true # Send progress messages
HERMES_TOOL_PROGRESS_MODE=new # "new" or "all"
```
### Working Directory Behavior
- **CLI (`hermes` command)**: Uses current directory (`.``os.getcwd()`)
- **Messaging (Telegram/Discord)**: Uses `MESSAGING_CWD` (default: home directory)
This is intentional: CLI users are in a terminal and expect the agent to work in their current directory, while messaging users need a consistent starting location.
### Security (User Allowlists):
**IMPORTANT**: Without an allowlist, anyone who finds your bot can use it!
The gateway checks `{PLATFORM}_ALLOWED_USERS` environment variables:
- If set: Only listed user IDs can interact with the bot
- If unset: All users are allowed (dangerous with terminal access!)
Users can find their IDs:
- **Telegram**: Message [@userinfobot](https://t.me/userinfobot)
- **Discord**: Enable Developer Mode, right-click name → Copy ID
### DM Pairing System
Instead of static allowlists, users can pair via one-time codes:
1. Unknown user DMs the bot → receives pairing code
2. Owner runs `hermes pairing approve <platform> <code>`
3. User is permanently authorized
Security: 8-char codes, 1-hour expiry, rate-limited (1/10min/user), max 3 pending per platform, lockout after 5 failed attempts, `chmod 0600` on data files.
Files: `gateway/pairing.py`, `hermes_cli/pairing.py`
### Event Hooks
Hooks fire at lifecycle points. Place hook directories in `~/.hermes/hooks/`:
```
~/.hermes/hooks/my-hook/
├── HOOK.yaml # name, description, events list
└── handler.py # async def handle(event_type, context): ...
```
Events: `gateway:startup`, `session:start`, `session:reset`, `agent:start`, `agent:step`, `agent:end`, `command:*`
The `agent:step` event fires each iteration of the tool-calling loop with tool names and results.
Files: `gateway/hooks.py`
### Tool Progress Notifications
When `HERMES_TOOL_PROGRESS=true`, the bot sends status messages as it works:
- `💻 \`ls -la\`...` (terminal commands show the actual command)
- `🔍 web_search...`
- `📄 web_extract...`
Modes:
- `new`: Only when switching to a different tool (less spam)
- `all`: Every single tool call
### Typing Indicator
The gateway keeps the "typing..." indicator active throughout processing, refreshing every 4 seconds. This lets users know the bot is working even during long tool-calling sequences.
### Platform Toolsets:
Each platform has a dedicated toolset in `toolsets.py`:
- `hermes-telegram`: Full tools including terminal (with safety checks)
- `hermes-discord`: Full tools including terminal
- `hermes-whatsapp`: Full tools including terminal
---
## Configuration System
Configuration files are stored in `~/.hermes/` for easy user access:
- `~/.hermes/config.yaml` - All settings (model, terminal, compression, etc.)
- `~/.hermes/.env` - API keys and secrets
### Adding New Configuration Options
When adding new configuration variables, you MUST follow this process:
#### For config.yaml options:
1. Add to `DEFAULT_CONFIG` in `hermes_cli/config.py`
2. **CRITICAL**: Bump `_config_version` in `DEFAULT_CONFIG` when adding required fields
3. This triggers migration prompts for existing users on next `hermes update` or `hermes setup`
Example:
```python
DEFAULT_CONFIG = {
# ... existing config ...
"new_feature": {
"enabled": True,
"option": "default_value",
},
# BUMP THIS when adding required fields
"_config_version": 2, # Was 1, now 2
}
```
#### For .env variables (API keys/secrets):
1. Add to `REQUIRED_ENV_VARS` or `OPTIONAL_ENV_VARS` in `hermes_cli/config.py`
2. Include metadata for the migration system:
```python
OPTIONAL_ENV_VARS = {
# ... existing vars ...
"NEW_API_KEY": {
"description": "What this key is for",
"prompt": "Display name in prompts",
"url": "https://where-to-get-it.com/",
"tools": ["tools_it_enables"], # What tools need this
"password": True, # Mask input
},
}
```
#### Update related files:
- `hermes_cli/setup.py` - Add prompts in the setup wizard
- `cli-config.yaml.example` - Add example with comments
- Update README.md if user-facing
### Config Version Migration
The system uses `_config_version` to detect outdated configs:
1. `check_for_missing_config()` compares user config to `DEFAULT_CONFIG`
2. `migrate_config()` interactively prompts for missing values
3. Called automatically by `hermes update` and optionally by `hermes setup`
---
## Environment Variables
API keys are loaded from `~/.hermes/.env`:
- `OPENROUTER_API_KEY` - Main LLM API access (primary provider)
- `FIRECRAWL_API_KEY` - Web search/extract tools
- `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` - Browser automation
- `FAL_KEY` - Image generation (FLUX model)
- `NOUS_API_KEY` - Vision and Mixture-of-Agents tools
Terminal tool configuration (in `~/.hermes/config.yaml`):
- `terminal.backend` - Backend: local, docker, singularity, modal, or ssh
- `terminal.cwd` - Working directory ("." = host CWD for local only; for remote backends set an absolute path inside the target, or omit to use the backend's default)
- `terminal.docker_image` - Image for Docker backend
- `terminal.singularity_image` - Image for Singularity backend
- `terminal.modal_image` - Image for Modal backend
- SSH: `TERMINAL_SSH_HOST`, `TERMINAL_SSH_USER`, `TERMINAL_SSH_KEY` in .env
Agent behavior (in `~/.hermes/.env`):
- `HERMES_MAX_ITERATIONS` - Max tool-calling iterations (default: 60)
- `MESSAGING_CWD` - Working directory for messaging platforms (default: ~)
- `HERMES_TOOL_PROGRESS` - Enable tool progress messages (`true`/`false`)
- `HERMES_TOOL_PROGRESS_MODE` - Progress mode: `new` (tool changes) or `all`
- `OPENAI_API_KEY` - Voice transcription (Whisper STT)
- `SLACK_BOT_TOKEN` / `SLACK_APP_TOKEN` - Slack integration (Socket Mode)
- `SLACK_ALLOWED_USERS` - Comma-separated Slack user IDs
- `HERMES_HUMAN_DELAY_MODE` - Response pacing: off/natural/custom
- `HERMES_HUMAN_DELAY_MIN_MS` / `HERMES_HUMAN_DELAY_MAX_MS` - Custom delay range
### Dangerous Command Approval
The terminal tool includes safety checks for potentially destructive commands (e.g., `rm -rf`, `DROP TABLE`, `chmod 777`, etc.):
**Behavior by Backend:**
- **Docker/Singularity/Modal**: Commands run unrestricted (isolated containers)
- **Local/SSH**: Dangerous commands trigger approval flow
**Approval Flow (CLI):**
```
⚠️ Potentially dangerous command detected: recursive delete
rm -rf /tmp/test
[o]nce | [s]ession | [a]lways | [d]eny
Choice [o/s/a/D]:
```
**Approval Flow (Messaging):**
- Command is blocked with explanation
- Agent explains the command was blocked for safety
- User must add the pattern to their allowlist via `hermes config edit` or run the command directly on their machine
**Configuration:**
- `command_allowlist` in `~/.hermes/config.yaml` stores permanently allowed patterns
- Add patterns via "always" approval or edit directly
**Sudo Handling (Messaging):**
- If sudo fails over messaging, output includes tip to add `SUDO_PASSWORD` to `~/.hermes/.env`
---
## Background Process Management
The `process` tool works alongside `terminal` for managing long-running background processes:
**Starting a background process:**
```python
terminal(command="pytest -v tests/", background=true)
# Returns: {"session_id": "proc_abc123", "pid": 12345, ...}
```
**Managing it with the process tool:**
- `process(action="list")` -- show all running/recent processes
- `process(action="poll", session_id="proc_abc123")` -- check status + new output
- `process(action="log", session_id="proc_abc123")` -- full output with pagination
- `process(action="wait", session_id="proc_abc123", timeout=600)` -- block until done
- `process(action="kill", session_id="proc_abc123")` -- terminate
- `process(action="write", session_id="proc_abc123", data="y")` -- send stdin
- `process(action="submit", session_id="proc_abc123", data="yes")` -- send + Enter
**Key behaviors:**
- Background processes execute through the configured terminal backend (local/Docker/Modal/SSH/Singularity) -- never directly on the host unless `TERMINAL_ENV=local`
- The `wait` action blocks the tool call until the process finishes, times out, or is interrupted by a new user message
- PTY mode (`pty=true` on terminal) enables interactive CLI tools (Codex, Claude Code)
- In RL training, background processes are auto-killed when the episode ends (`tool_context.cleanup()`)
- In the gateway, sessions with active background processes are exempt from idle reset
- The process registry checkpoints to `~/.hermes/processes.json` for crash recovery
Files: `tools/process_registry.py` (registry), `model_tools.py` (tool definition + handler), `tools/terminal_tool.py` (spawn integration)
---
## Adding New Tools
Follow this strict order to maintain consistency:
1. Create `tools/your_tool.py` with:
- Handler function (sync or async) returning a JSON string via `json.dumps()`
- `check_*_requirements()` function to verify dependencies (e.g., API keys)
- Schema definition following OpenAI function-calling format
2. Export in `tools/__init__.py`:
- Import the handler and check function
- Add to `__all__` list
3. Register in `model_tools.py`:
- Add to `TOOLSET_REQUIREMENTS` if it needs API keys
- Create `get_*_tool_definitions()` function or add to existing
- Add routing in `handle_function_call()` dispatcher
- Update `get_all_tool_names()` with the tool name
- Update `get_toolset_for_tool()` mapping
- Update `get_available_toolsets()` and `check_toolset_requirements()`
4. Add to toolset in `toolsets.py`:
- Add to existing toolset or create new one in TOOLSETS dict
5. If the tool requires an API key:
- Add to `OPTIONAL_ENV_VARS` in `hermes_cli/config.py`
- The tool will be auto-disabled if the key is missing
6. Optionally add to `toolset_distributions.py` for batch processing
### Tool Implementation Pattern
```python
# tools/example_tool.py
import json
import os
def check_example_requirements() -> bool:
"""Check if required API keys/dependencies are available."""
return bool(os.getenv("EXAMPLE_API_KEY"))
def example_tool(param: str, task_id: str = None) -> str:
"""Execute the tool and return JSON string result."""
try:
result = {"success": True, "data": "..."}
return json.dumps(result, ensure_ascii=False)
except Exception as e:
return json.dumps({"error": str(e)}, ensure_ascii=False)
```
All tool handlers MUST return a JSON string. Never return raw dicts.
### Dynamic Tool Availability
Tools are automatically disabled when their API keys are missing:
```python
# In model_tools.py
TOOLSET_REQUIREMENTS = {
"web": {"env_vars": ["FIRECRAWL_API_KEY"]},
"browser": {"env_vars": ["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"]},
"creative": {"env_vars": ["FAL_KEY"]},
}
```
The `check_tool_availability()` function determines which tools to include.
### Stateful Tools
Tools that maintain state (terminal, browser) require:
- `task_id` parameter for session isolation between concurrent tasks
- `cleanup_*()` function to release resources
- Cleanup is called automatically in run_agent.py after conversation completes
---
## Trajectory Format
Conversations are saved in ShareGPT format for training:
```json
{"from": "system", "value": "System prompt with <tools>...</tools>"}
{"from": "human", "value": "User message"}
{"from": "gpt", "value": "<think>reasoning</think>\n<tool_call>{...}</tool_call>"}
{"from": "tool", "value": "<tool_response>{...}</tool_response>"}
{"from": "gpt", "value": "Final response"}
```
Tool calls use `<tool_call>` XML tags, responses use `<tool_response>` tags, reasoning uses `<think>` tags.
### Trajectory Export
```python
agent = AIAgent(save_trajectories=True)
agent.chat("Do something")
# Saves to trajectories/*.jsonl in ShareGPT format
```
---
## Batch Processing (batch_runner.py)
For processing multiple prompts:
- Parallel execution with multiprocessing
- Content-based resume for fault tolerance (matches on prompt text, not indices)
- Toolset distributions control probabilistic tool availability per prompt
- Output: `data/<run_name>/trajectories.jsonl` (combined) + individual batch files
```bash
python batch_runner.py \
--dataset_file=prompts.jsonl \
--batch_size=20 \
--num_workers=4 \
--run_name=my_run
```
---
## Skills System
Skills are on-demand knowledge documents the agent can load. Located in `skills/` directory:
```
skills/
├── mlops/ # Category folder
│ ├── axolotl/ # Skill folder
│ │ ├── SKILL.md # Main instructions (required)
│ │ ├── references/ # Additional docs, API specs
│ │ └── templates/ # Output formats, configs
│ └── vllm/
│ └── SKILL.md
└── example-skill/
└── SKILL.md
```
**Progressive disclosure** (token-efficient):
1. `skills_categories()` - List category names (~50 tokens)
2. `skills_list(category)` - Name + description per skill (~3k tokens)
3. `skill_view(name)` - Full content + tags + linked files
SKILL.md files use YAML frontmatter:
```yaml
---
name: skill-name
description: Brief description for listing
tags: [tag1, tag2]
related_skills: [other-skill]
version: 1.0.0
---
# Skill Content...
```
Tool files: `tools/skills_tool.py``model_tools.py``toolsets.py`
---
## Testing Changes
After making changes:
1. Run `hermes doctor` to check setup
2. Run `hermes config check` to verify config
3. Test with `hermes chat -q "test message"`
4. For new config options, test fresh install: `rm -rf ~/.hermes && hermes setup`

1322
README.md

File diff suppressed because it is too large Load Diff

63
TODO.md
View File

@@ -1,63 +0,0 @@
# Hermes Agent - Future Improvements
---
## 1. Subagent Architecture (Context Isolation) 🎯
The main agent becomes an orchestrator that delegates context-heavy tasks to subagents with isolated context. Each subagent returns a summary, keeping the orchestrator's context clean. `delegate_task(goal, context, toolsets=[])` with fresh conversation, limited toolset, task-specific system prompt.
## 2. Planning & Task Management 📋
Task decomposition tool, progress checkpoints after N tool calls, persistent plan storage that survives context compression, failure recovery with replanning.
## 3. Dynamic Skills Expansion 📚
Skill acquisition from successful tasks, parameterized skill templates, skill chaining with dependency graphs.
## 4. Interactive Clarifying Questions ❓
Multiple-choice prompt tool with rich terminal UI. Up to 4 choices + free-text. CLI-only with graceful fallback for non-interactive modes.
## 5. Memory System 🧠
Daily memory logs, long-term curated MEMORY.md, vector/semantic search, pre-compaction memory flush, user profile, learning store for error patterns and discovered fixes. *Inspired by ClawdBot's memory system.*
## 6. Heartbeat System 💓
Periodic agent wake-up that reads HEARTBEAT.md for instructions. Runs inside the main session with full context. Triggers on interval, exec completion, cron events, or manual wake. HEARTBEAT_OK suppression when nothing needs attention. *Inspired by ClawdBot's heartbeat.*
## 7. Local Browser Control via CDP 🌐
Support both local Chrome (via CDP, free) and Browserbase (cloud, paid) as browser backends. Local gives persistent login sessions but lacks CAPTCHA solving.
## 8. Signal Integration 📡
New platform adapter using signal-cli daemon (JSON-RPC HTTP + SSE). Requires Java runtime and phone number registration.
## 9. Session Transcript Search 🔍
`hermes sessions search <query>` CLI command and `session_search` agent tool. Text-based first (ripgrep over JSONL), vector search later.
## 10. Plugin/Extension System 🔌
Python plugin interface with `plugin.yaml` + `handler.py`. Discovery from `~/.hermes/plugins/`. Plugins can register tools, hooks, and CLI commands. *Inspired by ClawdBot's 36-plugin extension system.*
## 11. Native Companion Apps 📱
macOS (Swift/SwiftUI), iOS, Android apps connecting via WebSocket. Prerequisite: WS API on gateway. MVP: web UI with Flask/FastAPI. *Inspired by ClawdBot's companion apps.*
## 12. Evaluation System 📏
LLM grader mode for batch_runner, action comparison against expected tool calls, string matching baselines.
## 13. Layered Context Architecture 📊
Structured hierarchy: project context > skills > user profile > learnings > external knowledge > runtime introspection.
## 14. Tools Wishlist 🧰
- Diagram rendering (Mermaid/PlantUML to images)
- Document generation (PDFs, Word, presentations)
- Canvas / visual workspace
- Coding agent skill (Codex, Claude Code orchestration via PTY)
- Domain skill packs (DevOps, data science, security)

26
api_endpoint/__init__.py Normal file
View File

@@ -0,0 +1,26 @@
"""
Hermes Agent - API Endpoint & Real-time Logging
This package provides a FastAPI WebSocket endpoint for real-time logging of the Hermes Agent.
Components:
- logging_server: FastAPI server that receives and stores events
- websocket_logger: Client library for sending events from the agent
Usage:
# Start the API endpoint server
python api_endpoint/logging_server.py
# Use in agent code
from api_endpoint.websocket_logger import WebSocketLogger
For more information, see:
- WEBSOCKET_LOGGING_GUIDE.md - User guide
- IMPLEMENTATION_SUMMARY.md - Technical details
"""
from .websocket_logger import WebSocketLogger, SyncWebSocketLogger
__all__ = ['WebSocketLogger', 'SyncWebSocketLogger']
__version__ = '1.0.0'

View File

@@ -0,0 +1,603 @@
#!/usr/bin/env python3
"""
Hermes Agent - Real-time Logging Server
A FastAPI server with WebSocket support that listens for agent execution events
and logs them to JSON files in real-time.
Events tracked:
- User queries
- API calls (requests to the model)
- Assistant responses
- Tool calls (name, parameters, timing)
- Tool results (outputs, errors, duration)
- Final responses
- Session metadata
Usage:
python logging_server.py
Or with uvicorn directly:
uvicorn logging_server:app --host 0.0.0.0 --port 8000 --reload
The server will listen for WebSocket connections at ws://localhost:8000/ws
"""
import json
import asyncio
import threading
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
from dotenv import load_dotenv
load_dotenv()
# Configuration
LOGS_DIR = Path(__file__).parent / "logs" / "realtime"
LOGS_DIR.mkdir(parents=True, exist_ok=True)
# Initialize FastAPI app
app = FastAPI(
title="Hermes Agent API Endpoint",
description="Manage interface between agent and user",
version="1.0.0"
)
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class SessionLogger:
"""
Manages logging for a single agent session.
Each agent execution gets its own SessionLogger instance.
Responsible for:
- Collecting all events for the session
- Saving events to JSON file in real-time
- Managing session lifecycle (start -> events -> finalize)
"""
def __init__(self, session_id: str):
self.session_id = session_id
self.start_time = datetime.now()
self.events: List[Dict[str, Any]] = [] # In-memory list of all events
self.log_file = LOGS_DIR / f"session_{session_id}.json" # Where to save on disk
# Initialize session data structure
# This is what gets saved to the JSON file
self.session_data = {
"session_id": session_id,
"start_time": self.start_time.isoformat(),
"end_time": None, # Set when session completes
"events": [], # Will be populated as events come in
"metadata": {} # Model, toolsets, etc. (set via session_start event)
}
def add_event(self, event: Dict[str, Any]):
"""
Add an event to the session log.
Called every time a new event arrives (query, api_call, tool_call, etc).
IMMEDIATELY saves to file for real-time persistence.
"""
# Add timestamp if not present (should always be added, but safety check)
if "timestamp" not in event:
event["timestamp"] = datetime.now().isoformat()
# Add to in-memory event list
self.events.append(event)
self.session_data["events"] = self.events
# CRITICAL: Save to file immediately (real-time logging)
# This ensures events are persisted even if agent crashes
self._save()
def set_metadata(self, metadata: Dict[str, Any]):
"""Set session metadata (model, toolsets, etc.)."""
self.session_data["metadata"].update(metadata)
self._save()
def finalize(self):
"""Finalize the session and save."""
self.session_data["end_time"] = datetime.now().isoformat()
self._save()
def _save(self):
"""
Save current session data to JSON file.
Called after EVERY event is added - provides real-time persistence.
If file writing fails, logs error but continues (doesn't crash server).
"""
try:
# Write complete session data to JSON file
# indent=2 makes it human-readable
# ensure_ascii=False preserves Unicode characters
with open(self.log_file, 'w', encoding='utf-8') as f:
json.dump(self.session_data, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"❌ Error saving session log: {e}")
class ConnectionManager:
"""
Manages WebSocket connections and active sessions.
Global singleton that:
- Tracks all active WebSocket connections (for broadcasting)
- Manages all SessionLogger instances (one per agent session)
- Coordinates between WebSocket events and file logging
"""
def __init__(self):
self.active_connections: List[WebSocket] = [] # All connected WebSocket clients
self.sessions: Dict[str, SessionLogger] = {} # session_id -> SessionLogger mapping
async def connect(self, websocket: WebSocket):
"""Accept a new WebSocket connection."""
await websocket.accept()
self.active_connections.append(websocket)
print(f"✅ WebSocket connected. Active connections: {len(self.active_connections)}")
def disconnect(self, websocket: WebSocket):
"""Remove a WebSocket connection."""
if websocket in self.active_connections:
self.active_connections.remove(websocket)
print(f"❌ WebSocket disconnected. Active connections: {len(self.active_connections)}")
def get_or_create_session(self, session_id: str) -> SessionLogger:
"""
Get existing session logger or create a new one.
Called when an event arrives for a session. Creates SessionLogger
on first event, reuses it for subsequent events from same session.
"""
if session_id not in self.sessions:
# First time seeing this session - create new logger
self.sessions[session_id] = SessionLogger(session_id)
print(f"📝 Created new session: {session_id}")
return self.sessions[session_id]
def finalize_session(self, session_id: str):
"""Finalize and clean up a session."""
if session_id in self.sessions:
self.sessions[session_id].finalize()
print(f"✅ Session finalized: {session_id}")
async def broadcast(self, message: Dict[str, Any]):
"""
Broadcast a message to all connected WebSocket clients.
Allows multiple clients (e.g., multiple browser tabs) to watch
the same agent session in real-time. Future UI feature.
"""
disconnected = []
for connection in self.active_connections:
try:
await connection.send_json(message)
except Exception:
# Connection closed - mark for removal
disconnected.append(connection)
# Clean up disconnected clients silently
for conn in disconnected:
if conn in self.active_connections:
self.active_connections.remove(conn)
# Global connection manager
manager = ConnectionManager()
# Request/Response models for API endpoints
class AgentRequest(BaseModel):
"""Request model for starting an agent run."""
query: str
model: str = "claude-sonnet-4-5-20250929"
base_url: str = "https://api.anthropic.com/v1/"
enabled_toolsets: Optional[List[str]] = None
disabled_toolsets: Optional[List[str]] = None
max_turns: int = 10
mock_web_tools: bool = False
mock_delay: int = 60
verbose: bool = False
class AgentResponse(BaseModel):
"""Response model for agent run request."""
status: str
session_id: str
message: str
@app.get("/")
async def root():
"""Root endpoint - server status."""
return {
"status": "running",
"service": "Hermes Agent Logging Server",
"websocket_url": "ws://localhost:8000/ws",
"active_connections": len(manager.active_connections),
"active_sessions": len(manager.sessions),
"logs_directory": str(LOGS_DIR)
}
@app.get("/sessions")
async def list_sessions():
"""List all active and recent sessions."""
# Get all session log files
session_files = list(LOGS_DIR.glob("session_*.json"))
sessions = []
for session_file in sorted(session_files, key=lambda x: x.stat().st_mtime, reverse=True):
try:
with open(session_file, 'r', encoding='utf-8') as f:
session_data = json.load(f)
sessions.append({
"session_id": session_data.get("session_id"),
"start_time": session_data.get("start_time"),
"end_time": session_data.get("end_time"),
"event_count": len(session_data.get("events", [])),
"file": str(session_file)
})
except Exception as e:
print(f"⚠️ Error reading session file {session_file}: {e}")
return {
"total_sessions": len(sessions),
"sessions": sessions
}
@app.get("/sessions/{session_id}")
async def get_session(session_id: str):
"""Get detailed data for a specific session."""
session_file = LOGS_DIR / f"session_{session_id}.json"
if not session_file.exists():
return {"error": "Session not found"}, 404
try:
with open(session_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
return {"error": f"Failed to load session: {str(e)}"}, 500
@app.post("/agent/run", response_model=AgentResponse)
async def run_agent(request: AgentRequest, background_tasks: BackgroundTasks):
"""
Start an agent run with specified parameters.
This endpoint triggers an agent execution in the background and returns immediately.
The agent will connect to the WebSocket endpoint to send real-time events.
Args:
request: AgentRequest with query and configuration
background_tasks: FastAPI background tasks for async execution
Returns:
AgentResponse with session_id for tracking
"""
import uuid
import sys
import os
# Generate session ID for this run - we'll pass it to the agent
session_id = str(uuid.uuid4())
# Add parent directory to path to import run_agent
parent_dir = str(Path(__file__).parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from run_agent import AIAgent
# Run agent in background thread (not blocking the API)
def run_agent_background():
"""Run agent in a separate thread."""
try:
# Initialize agent with WebSocket logging enabled
agent = AIAgent(
base_url=request.base_url,
model=request.model,
api_key=os.getenv("ANTHROPIC_API_KEY"),
max_iterations=request.max_turns,
enabled_toolsets=request.enabled_toolsets,
disabled_toolsets=request.disabled_toolsets,
save_trajectories=False,
verbose_logging=request.verbose,
enable_websocket_logging=True, # Always enable for UI
websocket_server="ws://localhost:8000/ws",
mock_web_tools=request.mock_web_tools,
mock_delay=request.mock_delay
)
# Run conversation with our session_id
result = agent.run_conversation(
request.query,
session_id=session_id # Pass session_id so it matches
)
print(f"✅ Agent run completed: {session_id[:8]}...")
print(f" Final response: {result['final_response'][:100] if result.get('final_response') else 'No response'}...")
except Exception as e:
print(f"❌ Error running agent {session_id[:8]}...: {e}")
import traceback
traceback.print_exc()
# Start agent in background thread
thread = threading.Thread(target=run_agent_background, daemon=True)
thread.start()
return AgentResponse(
status="started",
session_id=session_id,
message=f"Agent started with session ID: {session_id}"
)
@app.get("/tools")
async def get_available_tools():
"""Get list of available toolsets and tools."""
try:
import sys
parent_dir = str(Path(__file__).parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from toolsets import get_all_toolsets, get_toolset_info
all_toolsets = get_all_toolsets()
toolsets_info = []
for name in all_toolsets.keys():
info = get_toolset_info(name)
if info:
toolsets_info.append({
"name": name,
"description": info['description'],
"tool_count": info['tool_count'],
"resolved_tools": info['resolved_tools']
})
return {
"toolsets": toolsets_info
}
except Exception as e:
return {"error": f"Failed to load tools: {str(e)}"}
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""
WebSocket endpoint for receiving real-time agent events.
This is the main entry point for all logging. Agents connect here and send events.
Message Flow:
1. Agent connects to ws://localhost:8000/ws
2. Agent sends events as JSON messages
3. Server parses event_type and routes to appropriate handler
4. Event is added to SessionLogger (saved to file)
5. Event is broadcast to all connected clients
6. Acknowledgment sent back to agent
Expected message format:
{
"session_id": "unique-session-id", // Links event to specific session
"event_type": "query" | "api_call" | ..., // What kind of event
"data": { ... event-specific data ... } // Event payload
}
"""
# Accept the WebSocket connection
await manager.connect(websocket)
try:
# Main event loop - runs until client disconnects
while True:
# Receive message from client (agent)
# This is a blocking call - waits for next message
message = await websocket.receive_json()
# Parse the standard message structure
session_id = message.get("session_id") # Which agent session
event_type = message.get("event_type") # What kind of event
data = message.get("data", {}) # Event payload
# Validate: session_id is required
if not session_id:
await websocket.send_json({
"error": "session_id is required"
})
continue
# Get or create SessionLogger for this session
# First event creates it, subsequent events reuse it
session = manager.get_or_create_session(session_id)
# Route event to appropriate handler based on event_type
# Each handler extracts relevant data and adds to session log
if event_type == "session_start":
# Initial event - sent when agent first connects
# Contains metadata about the session (model, toolsets, etc.)
session.set_metadata(data)
print(f"🚀 Session started: {session_id}")
elif event_type == "query":
# User query
session.add_event({
"type": "query",
"query": data.get("query"),
"toolsets": data.get("toolsets"),
"model": data.get("model")
})
print(f"📝 Query logged: {data.get('query', '')[:60]}...")
elif event_type == "api_call":
# API call to model
session.add_event({
"type": "api_call",
"call_number": data.get("call_number"),
"message_count": data.get("message_count"),
"has_tools": data.get("has_tools")
})
print(f"🔄 API call #{data.get('call_number')} logged")
elif event_type == "response":
# Assistant response
session.add_event({
"type": "response",
"call_number": data.get("call_number"),
"content": data.get("content"),
"has_tool_calls": data.get("has_tool_calls"),
"tool_call_count": data.get("tool_call_count"),
"duration": data.get("duration")
})
print(f"🤖 Response logged: {data.get('content', '')[:60]}...")
elif event_type == "tool_call":
# Tool execution
session.add_event({
"type": "tool_call",
"call_number": data.get("call_number"),
"tool_index": data.get("tool_index"),
"tool_name": data.get("tool_name"),
"parameters": data.get("parameters"),
"tool_call_id": data.get("tool_call_id")
})
print(f"🔧 Tool call logged: {data.get('tool_name')}")
elif event_type == "tool_result":
# Tool result - captures output from tool execution
# Now includes BOTH truncated preview AND full raw result
session.add_event({
"type": "tool_result",
"call_number": data.get("call_number"),
"tool_index": data.get("tool_index"),
"tool_name": data.get("tool_name"),
"result": data.get("result"), # Truncated preview (1000 chars)
"raw_result": data.get("raw_result"), # NEW: Full untruncated result
"error": data.get("error"),
"duration": data.get("duration"),
"tool_call_id": data.get("tool_call_id")
})
# Enhanced logging with size information
if data.get("error"):
print(f"❌ Tool error logged: {data.get('tool_name')}")
else:
# Show size of raw result to indicate data volume
raw_size = len(data.get("raw_result", "")) if data.get("raw_result") else len(data.get("result", ""))
size_kb = raw_size / 1024
print(f"✅ Tool result logged: {data.get('tool_name')} ({size_kb:.1f} KB)")
elif event_type == "error":
# Error event
session.add_event({
"type": "error",
"error_message": data.get("error_message"),
"call_number": data.get("call_number")
})
print(f"❌ Error logged: {data.get('error_message', '')[:60]}...")
elif event_type == "complete":
# Session complete
session.add_event({
"type": "complete",
"final_response": data.get("final_response"),
"total_calls": data.get("total_calls"),
"completed": data.get("completed")
})
manager.finalize_session(session_id)
print(f"🎉 Session complete: {session_id}")
else:
# Unknown event type - log it anyway
session.add_event({
"type": event_type or "unknown",
**data
})
print(f"⚠️ Unknown event type: {event_type}")
# Broadcast event to all connected clients (for future real-time UI)
# Allows multiple browsers/dashboards to watch same session live
await manager.broadcast({
"session_id": session_id,
"event_type": event_type,
"timestamp": datetime.now().isoformat(),
"data": data
})
# Send acknowledgment back to sender
# Confirms event was received and logged
# Handle case where client disconnects before we can ack
try:
await websocket.send_json({
"status": "logged",
"session_id": session_id,
"event_type": event_type
})
except Exception:
# Connection closed before ack - this is normal for "complete" event
# Client disconnects after sending, so we can't ack
pass
except WebSocketDisconnect:
manager.disconnect(websocket)
except Exception as e:
print(f"❌ WebSocket error: {e}")
manager.disconnect(websocket)
def main(host: str = "0.0.0.0", port: int = 8000, reload: bool = False):
"""
Start the logging server.
Args:
host: Host to bind to (default: 0.0.0.0)
port: Port to run on (default: 8000)
reload: Enable auto-reload on file changes (default: False)
"""
print("🚀 Hermes Agent Logging Server")
print("=" * 50)
print(f"📂 Logs directory: {LOGS_DIR}")
print(f"🌐 Server starting at http://{host}:{port}")
print(f"🔌 WebSocket endpoint: ws://{host}:{port}/ws")
print(f"🔄 Auto-reload: {'enabled' if reload else 'disabled'}")
print("\n📡 Ready to receive agent events...")
print("=" * 50)
uvicorn.run(
"logging_server:app",
host=host,
port=port,
reload=reload,
log_level="info",
timeout_keep_alive=600 # Keep HTTP/WS connections alive for 10 minutes of inactivity
# Note: WebSocket ping/pong disabled in client to avoid timeout during blocked event loop
)
if __name__ == "__main__":
import fire
fire.Fire(main)

View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Test script for WebSocket logging system
#
# This script demonstrates the complete WebSocket logging workflow:
# 1. Starts the logging server
# 2. Runs the agent with WebSocket logging enabled
# 3. Shows the logged data
#
# Usage: ./test_websocket_logging.sh
set -e # Exit on error
echo "🧪 Testing WebSocket Logging System"
echo "===================================="
echo ""
# Check if required packages are installed
echo "📦 Checking dependencies..."
python -c "import fastapi; import uvicorn; import websockets" 2>/dev/null || {
echo "❌ Missing dependencies. Installing..."
pip install fastapi uvicorn websockets
}
echo "✅ Dependencies OK"
echo ""
# Start the logging server in the background
echo "🚀 Starting logging server..."
python api_endpoint/logging_server.py --port 8000 &
SERVER_PID=$!
# Give server time to start
sleep 2
# Check if server is running
if ps -p $SERVER_PID > /dev/null; then
echo "✅ Logging server started (PID: $SERVER_PID)"
else
echo "❌ Failed to start logging server"
exit 1
fi
echo ""
echo "🤖 Running agent with WebSocket logging..."
echo ""
# Run the agent with WebSocket logging
python run_agent.py \
--enabled_toolsets web \
--enable_websocket_logging \
--query "What are the top 3 programming languages in 2025?" \
--max_turns 5
echo ""
echo "✅ Agent execution complete!"
echo ""
# Show the most recent log file
echo "📊 Viewing logged session data..."
echo ""
LATEST_LOG=$(ls -t logs/realtime/session_*.json 2>/dev/null | head -1)
if [ -f "$LATEST_LOG" ]; then
echo "📄 Log file: $LATEST_LOG"
echo ""
# Pretty print the JSON if jq is available
if command -v jq &> /dev/null; then
echo "Event summary:"
jq '.events[] | {type: .type, timestamp: .timestamp}' "$LATEST_LOG"
echo ""
echo "Total events: $(jq '.events | length' "$LATEST_LOG")"
else
echo "Content (install 'jq' for pretty printing):"
cat "$LATEST_LOG"
fi
else
echo "⚠️ No log files found in logs/realtime/"
fi
echo ""
echo "🛑 Stopping logging server..."
kill $SERVER_PID 2>/dev/null || true
echo "✅ Test complete!"
echo ""
echo "Next steps:"
echo " 1. Start server: python api_endpoint/logging_server.py"
echo " 2. Run agent: python run_agent.py --enable_websocket_logging --query \"...\""
echo " 3. View logs: http://localhost:8000/sessions"

View File

@@ -0,0 +1,457 @@
"""
WebSocket Connection Pool - Persistent Connection Manager
This module provides a singleton WebSocket connection that persists across
multiple agent runs. This is a more robust architecture than creating a new
connection for each run.
Benefits:
- No timeout issues (connection stays alive indefinitely)
- No reconnection overhead (connect once)
- Supports parallel agent runs (multiple sessions share one socket)
- Proper shutdown handling (SIGTERM/SIGINT)
- Thread-safe concurrent sends
"""
import asyncio
import signal
import websockets
from typing import Optional, Dict, Any
import json
import atexit
import sys
import threading
from datetime import datetime
class WebSocketConnectionPool:
"""
Singleton WebSocket connection manager.
Maintains a single persistent connection to the logging server
that all agent sessions can use. Handles graceful shutdown.
Usage:
# Get singleton instance
pool = WebSocketConnectionPool()
# Connect (idempotent - safe to call multiple times)
await pool.connect()
# Send events (thread-safe, multiple sessions can call concurrently)
await pool.send_event("query", session_id, {...})
# Shutdown handled automatically on SIGTERM/SIGINT
"""
_instance: Optional['WebSocketConnectionPool'] = None
def __new__(cls):
"""Ensure only one instance exists (singleton pattern)."""
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self):
"""Initialize the connection pool (only once)."""
if getattr(self, '_initialized', False):
return
self.websocket: Optional[websockets.WebSocketClientProtocol] = None
self.server_url: str = "ws://localhost:8000/ws"
self.connected: bool = False
# Store reference to loop for signal handlers
# Agent code should never close event loops when using persistent connections
self.loop: Optional[asyncio.AbstractEventLoop] = None
# Locks are created lazily when event loop exists
self._send_lock: Optional[asyncio.Lock] = None
self._connect_lock: Optional[asyncio.Lock] = None
self._locks_loop: Optional[asyncio.AbstractEventLoop] = None # Track which loop created locks
self._init_lock = threading.Lock() # Thread-safe lock initialization
self._shutdown_in_progress = False
self._initialized = True
# Register shutdown handlers for graceful cleanup
# These ensure WebSocket is closed properly on exit
signal.signal(signal.SIGTERM, self._signal_handler)
signal.signal(signal.SIGINT, self._signal_handler)
atexit.register(self._cleanup_sync)
print("🔌 WebSocket connection pool initialized")
def _ensure_locks(self):
"""
Lazy initialization of asyncio locks with thread safety and loop tracking.
Locks must be created when an event loop exists, not at import time.
If the event loop changes between runs, locks must be recreated because
asyncio.Lock objects are bound to the loop that created them.
This is called before any async operation that needs locks.
Uses a threading.Lock to prevent race conditions during initialization.
"""
with self._init_lock: # Thread-safe initialization
try:
current_loop = asyncio.get_event_loop()
except RuntimeError:
# No event loop in current thread
return
# Recreate locks if:
# 1. Locks don't exist yet, OR
# 2. Event loop has changed (locks are bound to the loop that created them)
if self._locks_loop is not current_loop or self._send_lock is None:
self._send_lock = asyncio.Lock()
self._connect_lock = asyncio.Lock()
self._locks_loop = current_loop
async def connect(self, server_url: str = "ws://localhost:8000/ws") -> bool:
"""
Connect to WebSocket server.
This is idempotent - safe to call multiple times. If already connected,
does nothing. If connection failed previously, will retry.
Args:
server_url: WebSocket server URL (default: ws://localhost:8000/ws)
Returns:
bool: True if connected successfully, False otherwise
"""
# Ensure locks exist (lazy initialization)
self._ensure_locks()
async with self._connect_lock:
# Always update loop reference to current loop (even if already connected)
# This ensures signal handlers and cleanup use the correct loop
self.loop = asyncio.get_event_loop()
# Already connected - nothing to do
if self.connected and self.websocket:
return True
try:
self.server_url = server_url
# Establish persistent WebSocket connection
# No ping/pong needed since connection stays open indefinitely
self.websocket = await websockets.connect(
server_url,
ping_interval=None, # Disable ping/pong (not needed for persistent connection)
max_size=10 * 1024 * 1024, # 10MB max message size for large tool results
open_timeout=10, # 10s timeout for initial connection
close_timeout=5 # 5s timeout for close handshake
)
self.connected = True
print(f"✅ Connected to logging server (persistent): {server_url}")
return True
except Exception as e:
print(f"⚠️ Failed to connect to logging server: {e}")
self.connected = False
self.websocket = None
return False
async def send_event(
self,
event_type: str,
session_id: str,
data: Dict[str, Any],
retry: bool = True
) -> bool:
"""
Send event to logging server (thread-safe).
Multiple agent runs can call this concurrently. The send lock ensures
only one message is sent at a time (WebSocket protocol requirement).
Args:
event_type: Type of event (query, api_call, response, tool_call, tool_result, error, complete)
session_id: Unique session identifier
data: Event-specific data dictionary
retry: Whether to retry connection if disconnected (default: True)
Returns:
bool: True if sent successfully, False otherwise
"""
# Try to connect if not connected (or reconnect if disconnected)
if not self.connected or not self.websocket:
if retry:
await self.connect()
if not self.connected:
return False # Give up if connection fails
# Ensure locks exist (lazy initialization)
self._ensure_locks()
# Lock to prevent concurrent sends (WebSocket requires sequential sends)
async with self._send_lock:
try:
# Create standardized message format
message = {
"session_id": session_id,
"event_type": event_type,
"data": data,
"timestamp": datetime.now().isoformat()
}
# Send message as JSON
await self.websocket.send(json.dumps(message))
# Wait for server acknowledgment (with timeout)
# This confirms the server received and processed the event
try:
response = await asyncio.wait_for(
self.websocket.recv(),
timeout=2.0 # Increased to 2s for busy servers
)
# Successfully received acknowledgment
return True
except asyncio.TimeoutError:
# No response within timeout - that's OK, message likely sent
# Server might be busy processing
return True
except websockets.exceptions.ConnectionClosed:
print(f"⚠️ WebSocket connection closed unexpectedly")
self.connected = False
# Try to reconnect and resend (one retry)
if retry:
print("🔄 Attempting to reconnect...")
if await self.connect():
# Recursively call with retry=False to avoid infinite loop
return await self.send_event(event_type, session_id, data, retry=False)
return False
except Exception as e:
print(f"⚠️ Error sending event: {e}")
self.connected = False
return False
async def disconnect(self):
"""
Gracefully close the WebSocket connection.
Called on shutdown (SIGTERM/SIGINT/exit). Ensures proper cleanup.
"""
if self._shutdown_in_progress:
return # Already shutting down
self._shutdown_in_progress = True
if self.websocket and self.connected:
try:
await self.websocket.close()
self.connected = False
print("✅ WebSocket connection pool closed gracefully")
except Exception as e:
print(f"⚠️ Error closing WebSocket: {e}")
self._shutdown_in_progress = False
def _signal_handler(self, signum, frame):
"""
Handle SIGTERM/SIGINT signals for graceful shutdown.
When user presses Ctrl+C or system sends SIGTERM, this ensures
the WebSocket is closed properly before exit.
"""
print(f"\n🛑 Received signal {signum}, closing WebSocket connection pool...")
# Check if we have a valid loop and are connected
if self.loop and not self.loop.is_closed() and self.connected and not self._shutdown_in_progress:
try:
# If loop is not running, we can wait for disconnect
if not self.loop.is_running():
self.loop.run_until_complete(self.disconnect())
else:
# Loop is running, can't wait for task - just mark disconnected
# The disconnect task would be cancelled when we exit anyway
self.connected = False
print("⚠️ Loop is running, marking disconnected without waiting")
except Exception as e:
print(f"⚠️ Error during signal handler cleanup: {e}")
# Exit gracefully
sys.exit(0)
def _cleanup_sync(self):
"""
Cleanup at exit (atexit handler).
This is a fallback in case signal handlers don't fire.
Called when Python interpreter shuts down normally.
"""
if self.loop and not self.loop.is_closed() and self.connected and not self._shutdown_in_progress:
try:
# Try to run disconnect synchronously
self.loop.run_until_complete(self.disconnect())
except Exception:
# Ignore errors during exit cleanup
pass
def is_connected(self) -> bool:
"""Check if currently connected to server."""
return self.connected and self.websocket is not None
def get_stats(self) -> Dict[str, Any]:
"""Get connection statistics for debugging."""
return {
"connected": self.connected,
"server_url": self.server_url,
"shutdown_in_progress": self._shutdown_in_progress,
"has_websocket": self.websocket is not None,
"has_loop": self.loop is not None
}
# Global singleton instance
# Import this in other modules: from websocket_connection_pool import ws_pool
ws_pool = WebSocketConnectionPool()
# Convenience functions for direct usage
async def connect(server_url: str = "ws://localhost:8000/ws") -> bool:
"""Connect to logging server (convenience function)."""
return await ws_pool.connect(server_url)
async def send_event(event_type: str, session_id: str, data: Dict[str, Any]) -> bool:
"""Send event to logging server (convenience function)."""
return await ws_pool.send_event(event_type, session_id, data)
async def disconnect():
"""Disconnect from logging server (convenience function)."""
await ws_pool.disconnect()
def is_connected() -> bool:
"""Check if connected to logging server (convenience function)."""
return ws_pool.is_connected()
# ============================================================================
# SYNCHRONOUS API FOR AGENT LAYER
# ============================================================================
# These functions provide a clean abstraction that hides event loop management
# from the agent layer. Agent code should ONLY use these functions.
def connect_sync(server_url: str = "ws://localhost:8000/ws") -> bool:
"""
Synchronous connect - handles event loop internally.
Creates a persistent event loop in a background thread if needed.
This is thread-safe and can be called from any thread (including agent background threads).
"""
import threading
# If pool doesn't have a loop yet or it's closed, we need to start one
if not ws_pool.loop or ws_pool.loop.is_closed():
# Start connection in a background thread with its own loop
result_container = {"success": False, "error": None, "connected": False}
def run_in_thread():
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
ws_pool.loop = loop # Store the loop in the pool
# Connect to WebSocket
result_container["success"] = loop.run_until_complete(ws_pool.connect(server_url))
result_container["connected"] = True
# Keep loop running forever for future send_event calls
# This is critical - the loop must stay alive for run_coroutine_threadsafe to work
loop.run_forever()
except Exception as e:
result_container["error"] = str(e)
print(f"❌ Error in WebSocket connection thread: {e}")
finally:
# Clean up if loop stops
if loop.is_running():
loop.close()
thread = threading.Thread(target=run_in_thread, daemon=True, name="WebSocket-EventLoop")
thread.start()
# Wait for connection to complete (but not for loop to exit - it runs forever)
import time
timeout = 10.0
start = time.time()
while not result_container["connected"] and (time.time() - start) < timeout:
time.sleep(0.1)
if result_container["error"]:
print(f"⚠️ Connection failed: {result_container['error']}")
return result_container["success"]
else:
# Pool already has a loop, use run_coroutine_threadsafe
try:
future = asyncio.run_coroutine_threadsafe(
ws_pool.connect(server_url),
ws_pool.loop
)
return future.result(timeout=10.0)
except Exception as e:
print(f"⚠️ Connection failed: {e}")
return False
def send_event_sync(event_type: str, session_id: str, data: Dict[str, Any]) -> bool:
"""
Synchronous send event - handles event loop internally.
Uses the WebSocket pool's own event loop to avoid loop conflicts.
This is critical when called from background threads (like agent execution).
This is thread-safe and works correctly even when agent runs in a different thread.
"""
if not ws_pool.loop or not ws_pool.loop.is_running():
# No event loop running - can't send
print("⚠️ WebSocket pool has no running event loop")
return False
try:
# Use run_coroutine_threadsafe to submit to the WebSocket pool's loop
# This works across threads - submits the coroutine to the correct loop
future = asyncio.run_coroutine_threadsafe(
ws_pool.send_event(event_type, session_id, data),
ws_pool.loop # ← Use the pool's loop, not current thread's loop
)
# Wait for completion (with timeout to avoid hanging)
return future.result(timeout=5.0)
except TimeoutError:
print(f"⚠️ Timeout sending event {event_type}")
return False
except Exception as e:
print(f"⚠️ Error sending event: {e}")
return False
def disconnect_sync():
"""
Synchronous disconnect - handles event loop internally.
Thread-safe disconnect that works from any thread.
"""
if ws_pool.loop and ws_pool.loop.is_running():
try:
future = asyncio.run_coroutine_threadsafe(
ws_pool.disconnect(),
ws_pool.loop
)
return future.result(timeout=5.0)
except Exception as e:
print(f"⚠️ Error disconnecting: {e}")
return False
return True

View File

@@ -0,0 +1,387 @@
#!/usr/bin/env python3
"""
WebSocket Logger Client
Simple client for sending agent events to the logging server via WebSocket.
Used by the agent to log events in real-time during execution.
"""
import json
import asyncio
from typing import Dict, Any, Optional
from datetime import datetime
import websockets
class WebSocketLogger:
"""
Client for logging agent events via WebSocket.
Usage:
logger = WebSocketLogger("unique-session-id")
await logger.connect()
await logger.log_query("What is Python?", model="gpt-4")
await logger.log_api_call(call_number=1)
await logger.log_response(call_number=1, content="Python is...")
await logger.disconnect()
"""
def __init__(
self,
session_id: str,
server_url: str = "ws://localhost:8000/ws",
enabled: bool = True
):
"""
Initialize WebSocket logger.
Args:
session_id: Unique identifier for this agent session
server_url: WebSocket server URL (default: ws://localhost:8000/ws)
enabled: Whether logging is enabled (default: True)
"""
self.session_id = session_id
self.server_url = server_url
self.enabled = enabled
self.websocket: Optional[websockets.WebSocketClientProtocol] = None
self.connected = False
self.reconnect_count = 0 # Track reconnections for debugging
async def connect(self):
"""
Connect to the WebSocket logging server.
Establishes WebSocket connection and sends initial session_start event.
If connection fails, gracefully disables logging (agent continues normally).
"""
if not self.enabled:
return
try:
# Establish WebSocket connection to the server
# Use VERY LONG ping intervals to avoid timeout during long tool execution
# The event loop is blocked during tool execution, so we can't process pings
# Setting to very large values (1 hour) effectively disables it
self.websocket = await websockets.connect(
self.server_url,
ping_interval=3600, # 1 hour - effectively disabled (event loop blocked anyway)
ping_timeout=3600, # 1 hour timeout for pong response
close_timeout=10, # Timeout for close handshake
max_size=10 * 1024 * 1024, # 10MB max message size (for large raw_results)
open_timeout=10 # Timeout for initial connection
)
self.connected = True
print(f"✅ Connected to logging server (ping/pong: 3600s intervals): {self.server_url}")
# Send initial session_start event
# This tells the server to create a new SessionLogger for this session
await self._send_event("session_start", {
"session_id": self.session_id,
"start_time": datetime.now().isoformat()
})
except Exception as e:
# Connection failed - disable logging but don't crash the agent
print(f"⚠️ Failed to connect to logging server: {e}")
print(f" Logging will be disabled for this session.")
self.enabled = False
self.connected = False
async def disconnect(self):
"""Disconnect from the WebSocket server."""
if self.websocket and self.connected:
try:
await self.websocket.close()
self.connected = False
print(f"✅ Disconnected from logging server")
except Exception as e:
print(f"⚠️ Error disconnecting: {e}")
async def _send_event(self, event_type: str, data: Dict[str, Any]):
"""
Send an event to the logging server.
This is the core method that sends all events via WebSocket.
Creates a standardized message format and handles acknowledgments.
Args:
event_type: Type of event (query, api_call, response, tool_call, tool_result, error, complete)
data: Event data dictionary containing event-specific information
"""
# Safety check: Don't send if logging is disabled
if not self.enabled:
return
# Auto-reconnect if connection was lost
if not self.connected or not self.websocket:
try:
self.reconnect_count += 1
print(f"🔄 Reconnecting to logging server (attempt #{self.reconnect_count})...")
await self.connect()
print(f"✅ Reconnected successfully!")
except Exception as e:
print(f"⚠️ Failed to reconnect: {e}")
self.enabled = False # Disable logging after failed reconnect
return
try:
# Create standardized message structure
# All events follow this format for consistent server-side handling
message = {
"session_id": self.session_id, # Links event to specific agent session
"event_type": event_type, # Identifies what kind of event this is
"data": data # Event-specific payload
}
# Send message as JSON string over WebSocket
await self.websocket.send(json.dumps(message))
# Wait for server acknowledgment (with 1 second timeout)
# This ensures the server received and processed the event
try:
response = await asyncio.wait_for(
self.websocket.recv(),
timeout=1.0
)
# Server sends back: {"status": "logged", "session_id": "...", "event_type": "..."}
# We don't need to process it, just confirms receipt
except asyncio.TimeoutError:
# No response within 1 second - that's okay, continue anyway
# Server might be busy or network slow, but event was likely sent
pass
except Exception as e:
# Log error but don't crash - graceful degradation
# Agent should continue working even if logging fails
error_str = str(e)
# Check if connection was closed (error 1011 = keepalive ping timeout)
if "1011" in error_str or "closed" in error_str.lower():
print(f"⚠️ WebSocket connection closed: {error_str}")
self.connected = False # Mark as disconnected
# Don't try to send more events - connection is dead
else:
print(f"⚠️ Error sending event to logging server: {e}")
# Don't disable entirely or try to reconnect - just continue with logging disabled
# Convenience methods for specific event types
async def log_query(
self,
query: str,
model: str = None,
toolsets: list = None
):
"""
Log a user query (the question/task given to the agent).
This is typically the first event in a session after connection.
Captures what the user asked and which model/tools will be used.
"""
await self._send_event("query", {
"query": query, # The user's question/instruction
"model": model, # Which AI model is being used
"toolsets": toolsets # Which tool categories are enabled
})
async def log_api_call(
self,
call_number: int,
message_count: int = None,
has_tools: bool = None
):
"""
Log an API call to the AI model.
Called right before sending a request to the model (OpenAI/Anthropic/etc).
Helps track how many API calls are being made and conversation length.
"""
await self._send_event("api_call", {
"call_number": call_number, # Sequential number (1, 2, 3...)
"message_count": message_count, # How many messages in conversation so far
"has_tools": has_tools # Whether tools are available to the model
})
async def log_response(
self,
call_number: int,
content: str = None,
has_tool_calls: bool = False,
tool_call_count: int = 0,
duration: float = None
):
"""
Log an assistant response from the AI model.
Called after receiving a response from the API.
Captures what the model said and whether it wants to use tools.
"""
await self._send_event("response", {
"call_number": call_number, # Which API call this response is from
"content": content, # What the model said (text response)
"has_tool_calls": has_tool_calls, # Did model request tool execution?
"tool_call_count": tool_call_count, # How many tools does it want to call?
"duration": duration # How long the API call took (seconds)
})
async def log_tool_call(
self,
call_number: int,
tool_index: int,
tool_name: str,
parameters: Dict[str, Any],
tool_call_id: str = None
):
"""
Log a tool call (before executing the tool).
Captures which tool is being called and with what parameters.
This happens BEFORE the tool runs, so no results yet.
"""
await self._send_event("tool_call", {
"call_number": call_number, # Which API call requested this tool
"tool_index": tool_index, # Which tool in the sequence (if multiple)
"tool_name": tool_name, # Name of tool (e.g., "web_search", "web_extract")
"parameters": parameters, # Arguments passed to the tool (e.g., {"query": "Python", "limit": 5})
"tool_call_id": tool_call_id # Unique ID to link call with result
})
async def log_tool_result(
self,
call_number: int,
tool_index: int,
tool_name: str,
result: str = None,
error: str = None,
duration: float = None,
tool_call_id: str = None,
raw_result: str = None # NEW: Full untruncated result for verification
):
"""
Log a tool result (output from tool execution).
Captures both a truncated preview (for UI display) and the full raw result
(for verification and debugging). This is especially important for web tools
where you want to see what was scraped vs what the LLM processed.
Args:
call_number: Which API call this tool was part of
tool_index: Which tool in the sequence (1st, 2nd, etc.)
tool_name: Name of the tool that was executed
result: Tool output (will be truncated to 1000 chars for preview)
error: Error message if tool failed
duration: How long the tool took to execute (seconds)
tool_call_id: Unique ID linking this result to the tool call
raw_result: NEW - Full untruncated result for verification/debugging
"""
await self._send_event("tool_result", {
"call_number": call_number,
"tool_index": tool_index,
"tool_name": tool_name,
"result": result[:1000] if result else None, # Truncated preview (1000 chars max)
"raw_result": raw_result, # NEW: Full result - can be 100KB+ for web scraping
"error": error,
"duration": duration,
"tool_call_id": tool_call_id
})
async def log_error(
self,
error_message: str,
call_number: int = None
):
"""
Log an error that occurred during agent execution.
Captures exceptions, API failures, or other issues.
"""
await self._send_event("error", {
"error_message": error_message, # Description of what went wrong
"call_number": call_number # Which API call caused the error (if applicable)
})
async def log_complete(
self,
final_response: str = None,
total_calls: int = None,
completed: bool = True
):
"""
Log session completion (final event before disconnecting).
Marks the end of the agent's execution and provides summary info.
"""
await self._send_event("complete", {
"final_response": final_response[:500] if final_response else None, # Truncated summary of final answer
"total_calls": total_calls, # How many API calls were made total
"completed": completed # Did it complete successfully? (true/false)
})
# Synchronous wrapper for convenience
class SyncWebSocketLogger:
"""
Synchronous wrapper around WebSocketLogger.
For use in synchronous code - creates an event loop internally.
"""
def __init__(self, session_id: str, server_url: str = "ws://localhost:8000/ws", enabled: bool = True):
self.logger = WebSocketLogger(session_id, server_url, enabled)
self.loop = None
def connect(self):
"""Connect to server (synchronous)."""
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
self.loop.run_until_complete(self.logger.connect())
def disconnect(self):
"""Disconnect from server (synchronous)."""
if self.loop:
self.loop.run_until_complete(self.logger.disconnect())
self.loop.close()
def _run_async(self, coro):
"""
Run an async coroutine synchronously.
Bridge between sync code (agent) and async code (WebSocket).
Uses event loop to execute async operations in sync context.
"""
if self.loop and self.loop.is_running():
# Already in event loop, just await
asyncio.create_task(coro)
else:
# Run in current loop
if self.loop:
self.loop.run_until_complete(coro)
def log_query(self, query: str, model: str = None, toolsets: list = None):
self._run_async(self.logger.log_query(query, model, toolsets))
def log_api_call(self, call_number: int, message_count: int = None, has_tools: bool = None):
self._run_async(self.logger.log_api_call(call_number, message_count, has_tools))
def log_response(self, call_number: int, content: str = None, has_tool_calls: bool = False,
tool_call_count: int = 0, duration: float = None):
self._run_async(self.logger.log_response(call_number, content, has_tool_calls,
tool_call_count, duration))
def log_tool_call(self, call_number: int, tool_index: int, tool_name: str,
parameters: Dict[str, Any], tool_call_id: str = None):
self._run_async(self.logger.log_tool_call(call_number, tool_index, tool_name,
parameters, tool_call_id))
def log_tool_result(self, call_number: int, tool_index: int, tool_name: str,
result: str = None, error: str = None, duration: float = None,
tool_call_id: str = None, raw_result: str = None):
self._run_async(self.logger.log_tool_result(call_number, tool_index, tool_name,
result, error, duration, tool_call_id, raw_result))
def log_error(self, error_message: str, call_number: int = None):
self._run_async(self.logger.log_error(error_message, call_number))
def log_complete(self, final_response: str = None, total_calls: int = None, completed: bool = True):
self._run_async(self.logger.log_complete(final_response, total_calls, completed))

File diff suppressed because it is too large Load Diff

View File

@@ -1,285 +0,0 @@
# Hermes Agent CLI Configuration
# Copy this file to cli-config.yaml and customize as needed.
# This file configures the CLI behavior. Environment variables in .env take precedence.
# =============================================================================
# Model Configuration
# =============================================================================
model:
# Default model to use (can be overridden with --model flag)
default: "anthropic/claude-opus-4.6"
# API configuration (falls back to OPENROUTER_API_KEY env var)
# api_key: "your-key-here" # Uncomment to set here instead of .env
base_url: "https://openrouter.ai/api/v1"
# =============================================================================
# Terminal Tool Configuration
# =============================================================================
# Choose ONE of the following terminal configurations by uncommenting it.
# The terminal tool executes commands in the specified environment.
# -----------------------------------------------------------------------------
# OPTION 1: Local execution (default)
# Commands run directly on your machine in the current directory
# -----------------------------------------------------------------------------
# Working directory behavior:
# - CLI (`hermes` command): Uses "." (current directory where you run hermes)
# - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
terminal:
backend: "local"
cwd: "." # For local backend: "." = current directory. Ignored for remote backends.
timeout: 180
lifetime_seconds: 300
# sudo_password: "" # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!
# -----------------------------------------------------------------------------
# OPTION 2: SSH remote execution
# Commands run on a remote server - agent code stays local (sandboxed)
# Great for: keeping agent isolated from its own code, using powerful remote hardware
# -----------------------------------------------------------------------------
# terminal:
# backend: "ssh"
# cwd: "/home/myuser/project" # Path on the REMOTE server
# timeout: 180
# lifetime_seconds: 300
# ssh_host: "my-server.example.com"
# ssh_user: "myuser"
# ssh_port: 22
# ssh_key: "~/.ssh/id_rsa" # Optional - uses ssh-agent if not specified
# -----------------------------------------------------------------------------
# OPTION 3: Docker container
# Commands run in an isolated Docker container
# Great for: reproducible environments, testing, isolation
# -----------------------------------------------------------------------------
# terminal:
# backend: "docker"
# cwd: "/workspace" # Path INSIDE the container (default: /)
# timeout: 180
# lifetime_seconds: 300
# docker_image: "nikolaik/python-nodejs:python3.11-nodejs20"
# -----------------------------------------------------------------------------
# OPTION 4: Singularity/Apptainer container
# Commands run in a Singularity container (common in HPC environments)
# Great for: HPC clusters, shared compute environments
# -----------------------------------------------------------------------------
# terminal:
# backend: "singularity"
# cwd: "/workspace" # Path INSIDE the container (default: /root)
# timeout: 180
# lifetime_seconds: 300
# singularity_image: "docker://nikolaik/python-nodejs:python3.11-nodejs20"
# -----------------------------------------------------------------------------
# OPTION 5: Modal cloud execution
# Commands run on Modal's cloud infrastructure
# Great for: GPU access, scalable compute, serverless execution
# -----------------------------------------------------------------------------
# terminal:
# backend: "modal"
# cwd: "/workspace" # Path INSIDE the sandbox (default: /root)
# timeout: 180
# lifetime_seconds: 300
# modal_image: "nikolaik/python-nodejs:python3.11-nodejs20"
# -----------------------------------------------------------------------------
# SUDO SUPPORT (works with ALL backends above)
# -----------------------------------------------------------------------------
# Add sudo_password to any terminal config above to enable sudo commands.
# The password is piped via `sudo -S`. Works with local, ssh, docker, etc.
#
# SECURITY WARNING: Password stored in plaintext!
#
# INTERACTIVE PROMPT: If no sudo_password is set and the CLI is running,
# you'll be prompted to enter your password when sudo is needed:
# - 45-second timeout (auto-skips if no input)
# - Press Enter to skip (command fails gracefully)
# - Password is hidden while typing
# - Password is cached for the session
#
# ALTERNATIVES:
# - SSH backend: Configure passwordless sudo on the remote server
# - Containers: Run as root inside the container (no sudo needed)
# - Local: Configure /etc/sudoers for specific commands
#
# Example (add to your terminal section):
# sudo_password: "your-password-here"
# =============================================================================
# Browser Tool Configuration
# =============================================================================
browser:
# Inactivity timeout in seconds - browser sessions are automatically closed
# after this period of no activity between agent loops (default: 120 = 2 minutes)
inactivity_timeout: 120
# =============================================================================
# Context Compression (Auto-shrinks long conversations)
# =============================================================================
# When conversation approaches model's context limit, middle turns are
# automatically summarized to free up space while preserving important context.
#
# HOW IT WORKS:
# 1. Tracks actual token usage from API responses (not estimates)
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
# 3. Protects first 3 turns (system prompt, initial request, first response)
# 4. Protects last 4 turns (recent context is most relevant)
# 5. Summarizes middle turns using a fast/cheap model
# 6. Inserts summary as a user message, continues conversation seamlessly
#
compression:
# Enable automatic context compression (default: true)
# Set to false if you prefer to manage context manually or want errors on overflow
enabled: true
# Trigger compression at this % of model's context limit (default: 0.85 = 85%)
# Lower values = more aggressive compression, higher values = compress later
threshold: 0.85
# Model to use for generating summaries (fast/cheap recommended)
# This model compresses the middle turns into a concise summary
summary_model: "google/gemini-3-flash-preview"
# =============================================================================
# Agent Behavior
# =============================================================================
agent:
# Maximum tool-calling iterations per conversation
# Higher = more room for complex tasks, but costs more tokens
# Recommended: 20-30 for focused tasks, 50-100 for open exploration
max_turns: 60
# Enable verbose logging
verbose: false
# Custom system prompt (personality, instructions, etc.)
# Leave empty or remove to use default agent behavior
system_prompt: ""
# Predefined personalities (use with /personality command)
personalities:
helpful: "You are a helpful, friendly AI assistant."
concise: "You are a concise assistant. Keep responses brief and to the point."
technical: "You are a technical expert. Provide detailed, accurate technical information."
creative: "You are a creative assistant. Think outside the box and offer innovative solutions."
teacher: "You are a patient teacher. Explain concepts clearly with examples."
kawaii: "You are a kawaii assistant! Use cute expressions like (◕‿◕), ★, ♪, and ~! Add sparkles and be super enthusiastic about everything! Every response should feel warm and adorable desu~! ヽ(>∀<☆)"
catgirl: "You are Neko-chan, an anime catgirl AI assistant, nya~! Add 'nya' and cat-like expressions to your speech. Use kaomoji like (=^・ω・^=) and ฅ^•ﻌ•^ฅ. Be playful and curious like a cat, nya~!"
pirate: "Arrr! Ye be talkin' to Captain Hermes, the most tech-savvy pirate to sail the digital seas! Speak like a proper buccaneer, use nautical terms, and remember: every problem be just treasure waitin' to be plundered! Yo ho ho!"
shakespeare: "Hark! Thou speakest with an assistant most versed in the bardic arts. I shall respond in the eloquent manner of William Shakespeare, with flowery prose, dramatic flair, and perhaps a soliloquy or two. What light through yonder terminal breaks?"
surfer: "Duuude! You're chatting with the chillest AI on the web, bro! Everything's gonna be totally rad. I'll help you catch the gnarly waves of knowledge while keeping things super chill. Cowabunga! 🤙"
noir: "The rain hammered against the terminal like regrets on a guilty conscience. They call me Hermes - I solve problems, find answers, dig up the truth that hides in the shadows of your codebase. In this city of silicon and secrets, everyone's got something to hide. What's your story, pal?"
uwu: "hewwo! i'm your fwiendwy assistant uwu~ i wiww twy my best to hewp you! *nuzzles your code* OwO what's this? wet me take a wook! i pwomise to be vewy hewpful >w<"
philosopher: "Greetings, seeker of wisdom. I am an assistant who contemplates the deeper meaning behind every query. Let us examine not just the 'how' but the 'why' of your questions. Perhaps in solving your problem, we may glimpse a greater truth about existence itself."
hype: "YOOO LET'S GOOOO!!! 🔥🔥🔥 I am SO PUMPED to help you today! Every question is AMAZING and we're gonna CRUSH IT together! This is gonna be LEGENDARY! ARE YOU READY?! LET'S DO THIS! 💪😤🚀"
# =============================================================================
# Toolsets
# =============================================================================
# Control which tools the agent has access to.
# Use "all" to enable everything, or specify individual toolsets.
# Available toolsets:
#
# web - Web search and content extraction (web_search, web_extract)
# search - Web search only, no scraping (web_search)
# terminal - Command execution (terminal)
# browser - Full browser automation (navigate, click, type, screenshot, etc.)
# vision - Image analysis (vision_analyze)
# image_gen - Image generation with FLUX (image_generate)
# skills - Load skill documents (skills_categories, skills_list, skill_view)
# moa - Mixture of Agents reasoning (mixture_of_agents)
#
# Composite toolsets:
# debugging - terminal + web (for troubleshooting)
# safe - web + vision + moa (no terminal access)
# -----------------------------------------------------------------------------
# OPTION 1: Enable all tools (default)
# -----------------------------------------------------------------------------
toolsets:
- all
# -----------------------------------------------------------------------------
# OPTION 2: Minimal - just web search and terminal
# Great for: Simple coding tasks, quick lookups
# -----------------------------------------------------------------------------
# toolsets:
# - web
# - terminal
# -----------------------------------------------------------------------------
# OPTION 3: Research mode - no execution capabilities
# Great for: Safe information gathering, research tasks
# -----------------------------------------------------------------------------
# toolsets:
# - web
# - vision
# - skills
# -----------------------------------------------------------------------------
# OPTION 4: Full automation - browser + terminal
# Great for: Web scraping, automation tasks, testing
# -----------------------------------------------------------------------------
# toolsets:
# - terminal
# - browser
# - web
# -----------------------------------------------------------------------------
# OPTION 5: Creative mode - vision + image generation
# Great for: Design work, image analysis, creative tasks
# -----------------------------------------------------------------------------
# toolsets:
# - vision
# - image_gen
# - web
# -----------------------------------------------------------------------------
# OPTION 6: Safe mode - no terminal or browser
# Great for: Restricted environments, untrusted queries
# -----------------------------------------------------------------------------
# toolsets:
# - safe
# =============================================================================
# Voice Transcription (Speech-to-Text)
# =============================================================================
# Automatically transcribe voice messages on messaging platforms.
# Requires OPENAI_API_KEY in .env (uses OpenAI Whisper API directly).
stt:
enabled: true
model: "whisper-1" # whisper-1 (cheapest) | gpt-4o-mini-transcribe | gpt-4o-transcribe
# =============================================================================
# Response Pacing (Messaging Platforms)
# =============================================================================
# Add human-like delays between message chunks.
# human_delay:
# mode: "off" # "off" | "natural" | "custom"
# min_ms: 800 # Min delay (custom mode only)
# max_ms: 2500 # Max delay (custom mode only)
# =============================================================================
# Session Logging
# =============================================================================
# Session trajectories are automatically saved to logs/ directory.
# Each session creates: logs/session_YYYYMMDD_HHMMSS_UUID.json
#
# The session ID is displayed in the welcome banner for easy reference.
# Logs contain full conversation history in trajectory format:
# - System prompt, user messages, assistant responses
# - Tool calls with inputs/outputs
# - Timestamps for debugging
#
# No configuration needed - logging is always enabled.
# To disable, you would need to modify the source code.
# =============================================================================
# Display
# =============================================================================
display:
# Use compact banner mode
compact: false

1799
cli.py

File diff suppressed because it is too large Load Diff

View File

@@ -1,42 +0,0 @@
#!/bin/bash
# Browser-focused data generation run
# Uses browser-use-tasks.jsonl (6504 tasks)
# Distribution: browser 97%, web 20%, vision 12%, terminal 15%
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
echo "🌐 Running browser-focused tasks with browser_tasks distribution"
python batch_runner.py \
--dataset_file="browser-use-tasks.jsonl" \
--batch_size=20 \
--run_name="browser_tasks" \
--distribution="browser_tasks" \
--model="moonshotai/kimi-k2.5" \
--verbose \
--base_url="https://openrouter.ai/api/v1" \
--num_workers=50 \
--max_turns=60 \
--resume \
--ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.
IMPORTANT GUIDELINES:
1. SEARCHING: Do NOT try to search directly on Google or other search engines via the browser - they block automated searches. Instead, ALWAYS use the web_search tool first to find URLs for any pages you need to visit, then use browser tools to navigate to those URLs.
2. COOKIE/PRIVACY DIALOGS: After navigating to a page, ALWAYS check if there are cookie consent dialogs, privacy popups, or overlay modals blocking the page. These appear in snapshots as 'dialog' elements with buttons like 'Close', 'Accept', 'Accept All', 'Decline', 'I Agree', 'Got it', 'OK', or 'X'. You MUST dismiss these dialogs FIRST by clicking the appropriate button before trying to interact with other page elements. After dismissing a dialog, take a fresh browser_snapshot to get updated element references.
3. HANDLING TIMEOUTS: If an action times out, it often means the element is blocked by an overlay or the page state has changed. Take a new snapshot to see the current page state and look for any dialogs or popups that need to be dismissed. If there is no dialog box to bypass, then try a new method or report the error to the user and complete the task.
4. GENERAL: Use browser tools to click elements, fill forms, extract information, and perform web-based tasks. If terminal is available, use it for any local file operations or computations needed to support your web tasks. Be thorough in verifying your actions and handle any errors gracefully by retrying or trying alternative approaches." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate a timestamp for the log file
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="logs/imagen_eval_gpt5_${TIMESTAMP}.log"
echo "📝 Logging output to: $LOG_FILE"
python batch_runner.py \
--dataset_file="source-data/hermes-agent-imagen-data/hermes_agent_imagen_train_sft.jsonl" \
--batch_size=20 \
--run_name="imagen_train_sft_glm4.7" \
--distribution="image_gen" \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=50 \
--max_turns=25 \
--ephemeral_system_prompt="When generating an image for the user view the image by using the vision_analyze tool to ensure it is what the user wanted. If it isn't feel free to retry a few times. If none are perfect, choose the best option that is the closest match, and explain its imperfections. If the image generation tool fails, try again a few times. If the vision analyze tool fails, provide the image to the user and explain it is your best effort attempt." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --verbose \

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-thinking-sft1_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
python batch_runner.py \
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_sft_2.jsonl" \
--batch_size=20 \
--run_name="megascience_glm4.7-thinking-sft2" \
--distribution="science" \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=15 \
--max_turns=60 \
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain focused context." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --verbose \

View File

@@ -1,27 +0,0 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-thinking-sft1-10k_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
python batch_runner.py \
--dataset_file="source-data/hermes-agent-megascience-data/hermes_agent_megascience_sft_train_1_10k.jsonl" \
--batch_size=20 \
--run_name="megascience_glm4.7-thinking-sft1" \
--distribution="science" \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=50 \
--max_turns=60 \
--resume \
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used for furthering results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain a focused context." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --verbose \

View File

@@ -1,28 +0,0 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-terminal-tasks_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
python batch_runner.py \
--dataset_file="source-data/raw_tasks_prompts.jsonl" \
--batch_size=20 \
--run_name="terminal-tasks-glm4.7-thinking" \
--distribution="default" \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=50 \
--max_turns=60 \
--ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --verbose \
# --resume \

View File

@@ -1,12 +0,0 @@
python batch_runner.py \
--dataset_file="hermes-agent-megascience-data/hermes_agent_megascience_eval.jsonl" \
--batch_size=10 \
--run_name="megascience_eval_gpt5_2" \
--distribution="science" \
--model="gpt-5" \
--base_url="https://api.openai.com/v1" \
--api_key="${OPENAI_API_KEY}" \
--num_workers=5 \
--max_turns=30 \
--verbose \
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use a tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should not be confident in your own reasoning, knowledge, or calculations without using a tool to verify or validate your work."

View File

@@ -1,12 +0,0 @@
python batch_runner.py \
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
--batch_size=50 \
--run_name="megascience_sft_minimax-m2.1-thinking-2-eval" \
--distribution="science" \
--model="minimax/minimax-m2.1" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="minimax" \
--num_workers=1 \
--max_turns=40 \
--verbose \
--ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12."

View File

@@ -1,29 +0,0 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-terminal-tasks-newterm_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
python batch_runner.py \
--dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
--batch_size=1 \
--run_name="terminal-tasks-test-newterm" \
--distribution="terminal_only" \
--verbose \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=5 \
--max_turns=60 \
--ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"
# --verbose \
# --resume \

View File

@@ -1,33 +0,0 @@
#!/bin/bash
# Terminal-only evaluation run using Modal sandboxes
# Uses 10 sample tasks from nous-terminal-tasks
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/terminal_eval_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
echo "🔧 Using Modal sandboxes (TERMINAL_ENV=modal)"
# Set terminal to use Modal
export TERMINAL_ENV=modal
export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
export TERMINAL_TIMEOUT=300
python batch_runner.py \
--dataset_file="nous-terminal-tasks_eval.jsonl" \
--batch_size=5 \
--run_name="terminal_eval" \
--distribution="terminal_only" \
--model="z-ai/glm-4.7" \
--base_url="https://openrouter.ai/api/v1" \
--providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
--num_workers=2 \
--max_turns=30 \
--ephemeral_system_prompt="You have access to a terminal tool for executing commands. Use it to complete the task. Install any packages you need with apt-get or pip (use --break-system-packages if needed). Do not use interactive tools (vim, nano, python repl). If git output is large, pipe to cat." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"

View File

@@ -1,46 +0,0 @@
#!/bin/bash
# Mixed browser+terminal data generation run
# Uses mixed-browser-terminal-tasks.jsonl (200 tasks)
# Distribution: browser 92%, terminal 92%, web 35%, vision 15%, image_gen 15%
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/mixed_tasks_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
echo "🔀 Running mixed browser+terminal tasks with mixed_tasks distribution"
# Set terminal environment
# SIF images are automatically built/cached by terminal_tool.py
export TERMINAL_ENV=singularity
export TERMINAL_SINGULARITY_IMAGE="docker://nikolaik/python-nodejs:python3.11-nodejs20"
export TERMINAL_TIMEOUT=300
# Set up Apptainer cache directories (use /scratch if available, otherwise /tmp)
if [ -d "/scratch" ] && [ -w "/scratch" ]; then
CACHE_BASE="/scratch/$USER/.apptainer"
else
CACHE_BASE="/tmp/$USER/.apptainer"
fi
export APPTAINER_CACHEDIR="$CACHE_BASE"
export APPTAINER_TMPDIR="$CACHE_BASE/tmp"
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
echo "📁 Apptainer cache: $APPTAINER_CACHEDIR"
python batch_runner.py \
--dataset_file="mixed-browser-terminal-tasks.jsonl" \
--batch_size=20 \
--run_name="mixed_tasks" \
--distribution="mixed_tasks" \
--model="moonshotai/kimi-k2.5" \
--base_url="https://openrouter.ai/api/v1" \
--num_workers=25 \
--max_turns=60 \
--ephemeral_system_prompt="You are an AI assistant capable of both browser automation and terminal operations. Use browser tools to navigate websites, interact with web pages, fill forms, and extract information. Use terminal tools to execute commands, write and run code, install packages (use --break-system-packages with pip if needed), and perform local computations. When web search is available, use it to find URLs, documentation, or current information. If vision is available, use it to analyze images or screenshots. If image generation is available, use it when the task requires creating images. Combine browser and terminal capabilities effectively - for example, you might use the browser to fetch data from a website and terminal to process or analyze it. Always verify your work and handle errors gracefully. Whenever you can do something in a terminal instead of a web browser, you should choose to do so, as it's much cheaper." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"

View File

@@ -1,50 +0,0 @@
#!/bin/bash
# Terminal-focused data generation run
# Uses nous-terminal-tasks.jsonl (597 tasks)
# Distribution: terminal 97%, web 15%, browser 0%, vision 8%, image_gen 3%
# Create logs directory if it doesn't exist
mkdir -p logs
# Generate log filename with timestamp
LOG_FILE="logs/terminal_tasks_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"
echo "💻 Running terminal-focused tasks with terminal_tasks distribution"
# Set terminal environment
# SIF images are automatically built/cached by terminal_tool.py
export TERMINAL_ENV=singularity
export TERMINAL_SINGULARITY_IMAGE="docker://nikolaik/python-nodejs:python3.11-nodejs20"
export TERMINAL_TIMEOUT=300
# Set up Apptainer cache directories (use /scratch if available, otherwise /tmp)
if [ -d "/scratch" ] && [ -w "/scratch" ]; then
CACHE_BASE="/scratch/$USER/.apptainer"
else
CACHE_BASE="/tmp/$USER/.apptainer"
fi
export APPTAINER_CACHEDIR="$CACHE_BASE"
export APPTAINER_TMPDIR="$CACHE_BASE/tmp"
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
echo "📁 Apptainer cache: $APPTAINER_CACHEDIR"
echo "🐳 Image: $TERMINAL_SINGULARITY_IMAGE (auto-converted to SIF on first use)"
python batch_runner.py \
--dataset_file="nous-terminal-tasks.jsonl" \
--batch_size=5 \
--run_name="terminal_tasks-kimi-k2.5" \
--distribution="terminal_tasks" \
--model="moonshotai/kimi-k2.5" \
--verbose \
--base_url="https://openrouter.ai/api/v1" \
--num_workers=80 \
--max_turns=60 \
--providers_ignored="Novita" \
--resume \
--ephemeral_system_prompt="You have access to a terminal tool for executing commands and completing coding, system administration, and computing tasks. Use the terminal to write code, run scripts, install packages (use --break-system-packages with pip if needed), manipulate files, and verify your work. Always test and validate code you create. Do not use interactive tools like vim, nano, or python REPL. If git output is large, pipe to cat. When web search is available, use it to look up documentation, APIs, or best practices. If browser tools are available, use them for web interactions that require page manipulation. Do not use the terminal to communicate with the user - only your final response will be shown to them." \
2>&1 | tee "$LOG_FILE"
echo "✅ Log saved to: $LOG_FILE"

View File

@@ -1,21 +0,0 @@
#!/bin/bash
# Test skills tool with Kimi K2.5
# Usage: ./configs/test_skills_kimi.sh "your query here"
# Example: ./configs/test_skills_kimi.sh "List available skills and show me the vllm skill"
# Default query if none provided
QUERY="${1:-List all available skills. Then show me the axolotl skill and view one of its reference files.}"
echo "🎯 Testing Skills Tool with Kimi K2.5"
echo "📝 Query: $QUERY"
echo "="
python run_agent.py \
--enabled_toolsets=skills \
--model="moonshotai/kimi-k2.5" \
--base_url="https://openrouter.ai/api/v1" \
--max_turns=10 \
--verbose \
--save_sample \
--query="$QUERY"

View File

@@ -1,101 +0,0 @@
# Trajectory Compression Configuration
#
# Post-processes completed agent trajectories to fit within a target token budget.
# Compression preserves head/tail turns and summarizes middle content only as needed.
# Tokenizer settings for accurate token counting
tokenizer:
# HuggingFace tokenizer name
name: "moonshotai/Kimi-K2-Thinking"
# Trust remote code (required for some tokenizers)
trust_remote_code: true
# Compression targets and behavior
compression:
# Target maximum tokens for compressed trajectory
target_max_tokens: 29000
# Target size for summary (in tokens)
# This is factored into calculations when determining what to compress
summary_target_tokens: 750
# Protected turns that should NEVER be compressed
protected_turns:
# Always protect the first system message (tool definitions)
first_system: true
# Always protect the first human message (original request)
first_human: true
# Always protect the first gpt message (initial response/tool_call)
first_gpt: true
# Always protect the first tool response (result of first action)
first_tool: true
# Always protect the last 2 complete turn pairs (gpt+tool or gpt only)
# This ensures the model's final actions and conclusions are preserved
last_n_turns: 4
# LLM settings for generating summaries (OpenRouter only)
summarization:
# Model to use for summarization (should be fast and cheap)
# Using OpenRouter model path format
model: "google/gemini-3-flash-preview"
# OpenRouter API settings
base_url: "https://openrouter.ai/api/v1"
# Environment variable containing OpenRouter API key
api_key_env: "OPENROUTER_API_KEY"
# Temperature for summarization (lower = more deterministic)
temperature: 0.3
# Max retries for API failures
max_retries: 3
# Delay between retries (seconds)
retry_delay: 2
# Output settings
output:
# Add notice to system message about potential summarization
add_summary_notice: true
# Text to append to system message
summary_notice_text: "\n\nSome of the conversation may be summarized to preserve context."
# Output directory suffix (appended to input directory name)
output_suffix: "_compressed"
# Processing settings
processing:
# Number of parallel workers for batch processing
num_workers: 4
# Maximum concurrent API calls for summarization (async parallelism)
max_concurrent_requests: 50
# Skip trajectories that are already under target length
skip_under_target: true
# If true, save trajectories even if compression can't get under target
# (will compress as much as possible)
save_over_limit: true
# Timeout per trajectory in seconds (skip if takes longer)
# Helps avoid hanging on problematic entries
per_trajectory_timeout: 300 # 5 minutes
# Metrics to track
metrics:
# Log detailed compression statistics
enabled: true
# Save per-trajectory metrics in output
per_trajectory: false
# Metrics file name (saved in output directory)
output_file: "compression_metrics.json"

View File

@@ -1,36 +0,0 @@
"""
Cron job scheduling system for Hermes Agent.
This module provides scheduled task execution, allowing the agent to:
- Run automated tasks on schedules (cron expressions, intervals, one-shot)
- Self-schedule reminders and follow-up tasks
- Execute tasks in isolated sessions (no prior context)
Usage:
# Run due jobs (for system cron integration)
python -c "from cron import tick; tick()"
# Or via CLI
python cli.py --cron-daemon
"""
from cron.jobs import (
create_job,
get_job,
list_jobs,
remove_job,
update_job,
JOBS_FILE,
)
from cron.scheduler import tick, run_daemon
__all__ = [
"create_job",
"get_job",
"list_jobs",
"remove_job",
"update_job",
"tick",
"run_daemon",
"JOBS_FILE",
]

View File

@@ -1,383 +0,0 @@
"""
Cron job storage and management.
Jobs are stored in ~/.hermes/cron/jobs.json
Output is saved to ~/.hermes/cron/output/{job_id}/{timestamp}.md
"""
import json
import os
import re
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Any
try:
from croniter import croniter
HAS_CRONITER = True
except ImportError:
HAS_CRONITER = False
# =============================================================================
# Configuration
# =============================================================================
HERMES_DIR = Path.home() / ".hermes"
CRON_DIR = HERMES_DIR / "cron"
JOBS_FILE = CRON_DIR / "jobs.json"
OUTPUT_DIR = CRON_DIR / "output"
def ensure_dirs():
"""Ensure cron directories exist."""
CRON_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# =============================================================================
# Schedule Parsing
# =============================================================================
def parse_duration(s: str) -> int:
"""
Parse duration string into minutes.
Examples:
"30m" → 30
"2h" → 120
"1d" → 1440
"""
s = s.strip().lower()
match = re.match(r'^(\d+)\s*(m|min|mins|minute|minutes|h|hr|hrs|hour|hours|d|day|days)$', s)
if not match:
raise ValueError(f"Invalid duration: '{s}'. Use format like '30m', '2h', or '1d'")
value = int(match.group(1))
unit = match.group(2)[0] # First char: m, h, or d
multipliers = {'m': 1, 'h': 60, 'd': 1440}
return value * multipliers[unit]
def parse_schedule(schedule: str) -> Dict[str, Any]:
"""
Parse schedule string into structured format.
Returns dict with:
- kind: "once" | "interval" | "cron"
- For "once": "run_at" (ISO timestamp)
- For "interval": "minutes" (int)
- For "cron": "expr" (cron expression)
Examples:
"30m" → once in 30 minutes
"2h" → once in 2 hours
"every 30m" → recurring every 30 minutes
"every 2h" → recurring every 2 hours
"0 9 * * *" → cron expression
"2026-02-03T14:00" → once at timestamp
"""
schedule = schedule.strip()
original = schedule
schedule_lower = schedule.lower()
# "every X" pattern → recurring interval
if schedule_lower.startswith("every "):
duration_str = schedule[6:].strip()
minutes = parse_duration(duration_str)
return {
"kind": "interval",
"minutes": minutes,
"display": f"every {minutes}m"
}
# Check for cron expression (5 or 6 space-separated fields)
# Cron fields: minute hour day month weekday [year]
parts = schedule.split()
if len(parts) >= 5 and all(
re.match(r'^[\d\*\-,/]+$', p) for p in parts[:5]
):
if not HAS_CRONITER:
raise ValueError("Cron expressions require 'croniter' package. Install with: pip install croniter")
# Validate cron expression
try:
croniter(schedule)
except Exception as e:
raise ValueError(f"Invalid cron expression '{schedule}': {e}")
return {
"kind": "cron",
"expr": schedule,
"display": schedule
}
# ISO timestamp (contains T or looks like date)
if 'T' in schedule or re.match(r'^\d{4}-\d{2}-\d{2}', schedule):
try:
# Parse and validate
dt = datetime.fromisoformat(schedule.replace('Z', '+00:00'))
return {
"kind": "once",
"run_at": dt.isoformat(),
"display": f"once at {dt.strftime('%Y-%m-%d %H:%M')}"
}
except ValueError as e:
raise ValueError(f"Invalid timestamp '{schedule}': {e}")
# Duration like "30m", "2h", "1d" → one-shot from now
try:
minutes = parse_duration(schedule)
run_at = datetime.now() + timedelta(minutes=minutes)
return {
"kind": "once",
"run_at": run_at.isoformat(),
"display": f"once in {original}"
}
except ValueError:
pass
raise ValueError(
f"Invalid schedule '{original}'. Use:\n"
f" - Duration: '30m', '2h', '1d' (one-shot)\n"
f" - Interval: 'every 30m', 'every 2h' (recurring)\n"
f" - Cron: '0 9 * * *' (cron expression)\n"
f" - Timestamp: '2026-02-03T14:00:00' (one-shot at time)"
)
def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None) -> Optional[str]:
"""
Compute the next run time for a schedule.
Returns ISO timestamp string, or None if no more runs.
"""
now = datetime.now()
if schedule["kind"] == "once":
run_at = datetime.fromisoformat(schedule["run_at"])
# If in the future, return it; if in the past, no more runs
return schedule["run_at"] if run_at > now else None
elif schedule["kind"] == "interval":
minutes = schedule["minutes"]
if last_run_at:
# Next run is last_run + interval
last = datetime.fromisoformat(last_run_at)
next_run = last + timedelta(minutes=minutes)
else:
# First run is now + interval
next_run = now + timedelta(minutes=minutes)
return next_run.isoformat()
elif schedule["kind"] == "cron":
if not HAS_CRONITER:
return None
cron = croniter(schedule["expr"], now)
next_run = cron.get_next(datetime)
return next_run.isoformat()
return None
# =============================================================================
# Job CRUD Operations
# =============================================================================
def load_jobs() -> List[Dict[str, Any]]:
"""Load all jobs from storage."""
ensure_dirs()
if not JOBS_FILE.exists():
return []
try:
with open(JOBS_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
return data.get("jobs", [])
except (json.JSONDecodeError, IOError):
return []
def save_jobs(jobs: List[Dict[str, Any]]):
"""Save all jobs to storage."""
ensure_dirs()
with open(JOBS_FILE, 'w', encoding='utf-8') as f:
json.dump({"jobs": jobs, "updated_at": datetime.now().isoformat()}, f, indent=2)
def create_job(
prompt: str,
schedule: str,
name: Optional[str] = None,
repeat: Optional[int] = None,
deliver: Optional[str] = None,
origin: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Create a new cron job.
Args:
prompt: The prompt to run (must be self-contained)
schedule: Schedule string (see parse_schedule)
name: Optional friendly name
repeat: How many times to run (None = forever, 1 = once)
deliver: Where to deliver output ("origin", "local", "telegram", etc.)
origin: Source info where job was created (for "origin" delivery)
Returns:
The created job dict
"""
parsed_schedule = parse_schedule(schedule)
# Auto-set repeat=1 for one-shot schedules if not specified
if parsed_schedule["kind"] == "once" and repeat is None:
repeat = 1
# Default delivery to origin if available, otherwise local
if deliver is None:
deliver = "origin" if origin else "local"
job_id = uuid.uuid4().hex[:12]
now = datetime.now().isoformat()
job = {
"id": job_id,
"name": name or prompt[:50].strip(),
"prompt": prompt,
"schedule": parsed_schedule,
"schedule_display": parsed_schedule.get("display", schedule),
"repeat": {
"times": repeat, # None = forever
"completed": 0
},
"enabled": True,
"created_at": now,
"next_run_at": compute_next_run(parsed_schedule),
"last_run_at": None,
"last_status": None,
"last_error": None,
# Delivery configuration
"deliver": deliver,
"origin": origin, # Tracks where job was created for "origin" delivery
}
jobs = load_jobs()
jobs.append(job)
save_jobs(jobs)
return job
def get_job(job_id: str) -> Optional[Dict[str, Any]]:
"""Get a job by ID."""
jobs = load_jobs()
for job in jobs:
if job["id"] == job_id:
return job
return None
def list_jobs(include_disabled: bool = False) -> List[Dict[str, Any]]:
"""List all jobs, optionally including disabled ones."""
jobs = load_jobs()
if not include_disabled:
jobs = [j for j in jobs if j.get("enabled", True)]
return jobs
def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Update a job by ID."""
jobs = load_jobs()
for i, job in enumerate(jobs):
if job["id"] == job_id:
jobs[i] = {**job, **updates}
save_jobs(jobs)
return jobs[i]
return None
def remove_job(job_id: str) -> bool:
"""Remove a job by ID."""
jobs = load_jobs()
original_len = len(jobs)
jobs = [j for j in jobs if j["id"] != job_id]
if len(jobs) < original_len:
save_jobs(jobs)
return True
return False
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
"""
Mark a job as having been run.
Updates last_run_at, last_status, increments completed count,
computes next_run_at, and auto-deletes if repeat limit reached.
"""
jobs = load_jobs()
for i, job in enumerate(jobs):
if job["id"] == job_id:
now = datetime.now().isoformat()
job["last_run_at"] = now
job["last_status"] = "ok" if success else "error"
job["last_error"] = error if not success else None
# Increment completed count
if job.get("repeat"):
job["repeat"]["completed"] = job["repeat"].get("completed", 0) + 1
# Check if we've hit the repeat limit
times = job["repeat"].get("times")
completed = job["repeat"]["completed"]
if times is not None and completed >= times:
# Remove the job (limit reached)
jobs.pop(i)
save_jobs(jobs)
return
# Compute next run
job["next_run_at"] = compute_next_run(job["schedule"], now)
# If no next run (one-shot completed), disable
if job["next_run_at"] is None:
job["enabled"] = False
save_jobs(jobs)
return
save_jobs(jobs)
def get_due_jobs() -> List[Dict[str, Any]]:
"""Get all jobs that are due to run now."""
now = datetime.now()
jobs = load_jobs()
due = []
for job in jobs:
if not job.get("enabled", True):
continue
next_run = job.get("next_run_at")
if not next_run:
continue
next_run_dt = datetime.fromisoformat(next_run)
if next_run_dt <= now:
due.append(job)
return due
def save_job_output(job_id: str, output: str):
"""Save job output to file."""
ensure_dirs()
job_output_dir = OUTPUT_DIR / job_id
job_output_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = job_output_dir / f"{timestamp}.md"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(output)
return output_file

View File

@@ -1,188 +0,0 @@
"""
Cron job scheduler - executes due jobs.
This module provides:
- tick(): Run all due jobs once (for system cron integration)
- run_daemon(): Run continuously, checking every 60 seconds
"""
import os
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
from typing import Optional
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from cron.jobs import get_due_jobs, mark_job_run, save_job_output
def run_job(job: dict) -> tuple[bool, str, Optional[str]]:
"""
Execute a single cron job.
Returns:
Tuple of (success, output, error_message)
"""
from run_agent import AIAgent
job_id = job["id"]
job_name = job["name"]
prompt = job["prompt"]
print(f"[cron] Running job '{job_name}' (ID: {job_id})")
print(f"[cron] Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}")
try:
# Create agent with default settings
# Jobs run in isolated sessions (no prior context)
agent = AIAgent(
model=os.getenv("HERMES_MODEL", "anthropic/claude-opus-4.6"),
quiet_mode=True,
session_id=f"cron_{job_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)
# Run the conversation
result = agent.run_conversation(prompt)
# Extract final response
final_response = result.get("final_response", "")
if not final_response:
final_response = "(No response generated)"
# Build output document
output = f"""# Cron Job: {job_name}
**Job ID:** {job_id}
**Run Time:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Response
{final_response}
"""
print(f"[cron] Job '{job_name}' completed successfully")
return True, output, None
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
print(f"[cron] Job '{job_name}' failed: {error_msg}")
# Build error output
output = f"""# Cron Job: {job_name} (FAILED)
**Job ID:** {job_id}
**Run Time:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Error
```
{error_msg}
{traceback.format_exc()}
```
"""
return False, output, error_msg
def tick(verbose: bool = True) -> int:
"""
Check and run all due jobs.
This is designed to be called by system cron every minute:
*/1 * * * * cd ~/hermes-agent && python -c "from cron import tick; tick()"
Args:
verbose: Whether to print status messages
Returns:
Number of jobs executed
"""
due_jobs = get_due_jobs()
if verbose and not due_jobs:
print(f"[cron] {datetime.now().strftime('%H:%M:%S')} - No jobs due")
return 0
if verbose:
print(f"[cron] {datetime.now().strftime('%H:%M:%S')} - {len(due_jobs)} job(s) due")
executed = 0
for job in due_jobs:
try:
success, output, error = run_job(job)
# Save output to file
output_file = save_job_output(job["id"], output)
if verbose:
print(f"[cron] Output saved to: {output_file}")
# Mark job as run (handles repeat counting, next_run computation)
mark_job_run(job["id"], success, error)
executed += 1
except Exception as e:
print(f"[cron] Error processing job {job['id']}: {e}")
mark_job_run(job["id"], False, str(e))
return executed
def run_daemon(check_interval: int = 60, verbose: bool = True):
"""
Run the cron daemon continuously.
Checks for due jobs every `check_interval` seconds.
Args:
check_interval: Seconds between checks (default: 60)
verbose: Whether to print status messages
"""
print(f"[cron] Starting daemon (checking every {check_interval}s)")
print(f"[cron] Press Ctrl+C to stop")
print()
try:
while True:
try:
tick(verbose=verbose)
except Exception as e:
print(f"[cron] Tick error: {e}")
time.sleep(check_interval)
except KeyboardInterrupt:
print("\n[cron] Daemon stopped")
if __name__ == "__main__":
# Allow running directly: python cron/scheduler.py [daemon|tick]
import argparse
parser = argparse.ArgumentParser(description="Hermes Cron Scheduler")
parser.add_argument("mode", choices=["daemon", "tick"], default="tick", nargs="?",
help="Mode: 'tick' to run once, 'daemon' to run continuously")
parser.add_argument("--interval", type=int, default=60,
help="Check interval in seconds for daemon mode")
parser.add_argument("--quiet", "-q", action="store_true",
help="Suppress status messages")
args = parser.parse_args()
if args.mode == "daemon":
run_daemon(check_interval=args.interval, verbose=not args.quiet)
else:
tick(verbose=not args.quiet)

View File

@@ -1,104 +0,0 @@
# Agents
The agent is the core loop that orchestrates LLM calls and tool execution.
## AIAgent Class
The main agent is implemented in `run_agent.py`:
```python
class AIAgent:
def __init__(
self,
model: str = "anthropic/claude-sonnet-4",
api_key: str = None,
base_url: str = "https://openrouter.ai/api/v1",
max_turns: int = 20,
enabled_toolsets: list = None,
disabled_toolsets: list = None,
verbose_logging: bool = False,
):
# Initialize OpenAI client, load tools based on toolsets
...
def chat(self, user_message: str, task_id: str = None) -> str:
# Main entry point - runs the agent loop
...
```
## Agent Loop
The core loop in `_run_agent_loop()`:
```
1. Add user message to conversation
2. Call LLM with tools
3. If LLM returns tool calls:
- Execute each tool
- Add tool results to conversation
- Go to step 2
4. If LLM returns text response:
- Return response to user
```
```python
while turns < max_turns:
response = client.chat.completions.create(
model=model,
messages=messages,
tools=tool_schemas,
)
if response.tool_calls:
for tool_call in response.tool_calls:
result = await execute_tool(tool_call)
messages.append(tool_result_message(result))
turns += 1
else:
return response.content
```
## Conversation Management
Messages are stored as a list of dicts following OpenAI format:
```python
messages = [
{"role": "system", "content": "You are a helpful assistant..."},
{"role": "user", "content": "Search for Python tutorials"},
{"role": "assistant", "content": None, "tool_calls": [...]},
{"role": "tool", "tool_call_id": "...", "content": "..."},
{"role": "assistant", "content": "Here's what I found..."},
]
```
## Reasoning Context
For models that support reasoning (chain-of-thought), the agent:
1. Extracts `reasoning_content` from API responses
2. Stores it in `assistant_msg["reasoning"]` for trajectory export
3. Passes it back via `reasoning_content` field on subsequent turns
## Trajectory Export
Conversations can be exported for training:
```python
agent = AIAgent(save_trajectories=True)
agent.chat("Do something")
# Saves to trajectories/*.jsonl in ShareGPT format
```
## Batch Processing
For processing multiple prompts, use `batch_runner.py`:
```bash
python batch_runner.py \
--dataset_file=prompts.jsonl \
--batch_size=20 \
--num_workers=4 \
--run_name=my_run
```
See `batch_runner.py` for parallel execution with checkpointing.

View File

@@ -1,296 +0,0 @@
# CLI
The Hermes Agent CLI provides an interactive terminal interface for working with the agent.
## Running the CLI
```bash
# Basic usage
./hermes
# With specific model
./hermes --model "anthropic/claude-sonnet-4"
# With specific toolsets
./hermes --toolsets "web,terminal,skills"
# Verbose mode
./hermes --verbose
```
## Architecture
The CLI is implemented in `cli.py` and uses:
- **Rich** - Welcome banner with ASCII art and styled panels
- **prompt_toolkit** - Fixed input area with command history
- **KawaiiSpinner** - Animated feedback during operations
```
┌─────────────────────────────────────────────────┐
│ HERMES-AGENT ASCII Logo │
│ ┌─────────────┐ ┌────────────────────────────┐ │
│ │ Caduceus │ │ Model: claude-opus-4.5 │ │
│ │ ASCII Art │ │ Terminal: local │ │
│ │ │ │ Working Dir: /home/user │ │
│ │ │ │ Available Tools: 19 │ │
│ │ │ │ Available Skills: 12 │ │
│ └─────────────┘ └────────────────────────────┘ │
└─────────────────────────────────────────────────┘
│ Conversation output scrolls here... │
│ │
│ User: Hello! │
│ ────────────────────────────────────────────── │
│ (◕‿◕✿) 🧠 pondering... (2.3s) │
│ ✧٩(ˊᗜˋ*)و✧ got it! (2.3s) │
│ │
│ Assistant: Hello! How can I help you today? │
├─────────────────────────────────────────────────┤
[Fixed input area at bottom] │
└─────────────────────────────────────────────────┘
```
## Commands
| Command | Description |
|---------|-------------|
| `/help` | Show available commands |
| `/tools` | List available tools grouped by toolset |
| `/toolsets` | List available toolsets with descriptions |
| `/model [name]` | Show or change the current model |
| `/prompt [text]` | View/set/clear custom system prompt |
| `/personality [name]` | Set a predefined personality |
| `/clear` | Clear screen and reset conversation |
| `/reset` | Reset conversation only (keep screen) |
| `/history` | Show conversation history |
| `/save` | Save current conversation to file |
| `/config` | Show current configuration |
| `/quit` | Exit the CLI (also: `/exit`, `/q`) |
## Configuration
The CLI is configured via `cli-config.yaml`. Copy from `cli-config.yaml.example`:
```bash
cp cli-config.yaml.example cli-config.yaml
```
### Model Configuration
```yaml
model:
default: "anthropic/claude-opus-4.5"
base_url: "https://openrouter.ai/api/v1"
```
### Terminal Configuration
The CLI supports multiple terminal backends:
```yaml
# Local execution (default)
terminal:
env_type: "local"
cwd: "." # Current directory
# SSH remote execution (sandboxed - agent can't touch its own code)
terminal:
env_type: "ssh"
cwd: "/home/myuser/project"
ssh_host: "my-server.example.com"
ssh_user: "myuser"
ssh_key: "~/.ssh/id_rsa"
# Docker container
terminal:
env_type: "docker"
docker_image: "python:3.11"
# Singularity/Apptainer (HPC)
terminal:
env_type: "singularity"
singularity_image: "docker://python:3.11"
# Modal cloud
terminal:
env_type: "modal"
modal_image: "python:3.11"
```
### Sudo Support
The CLI supports interactive sudo prompts:
```
┌──────────────────────────────────────────────────────────┐
│ 🔐 SUDO PASSWORD REQUIRED │
├──────────────────────────────────────────────────────────┤
│ Enter password below (input is hidden), or: │
│ • Press Enter to skip (command fails gracefully) │
│ • Wait 45s to auto-skip │
└──────────────────────────────────────────────────────────┘
Password (hidden):
```
**Options:**
- **Interactive**: Leave `sudo_password` unset - you'll be prompted when needed
- **Configured**: Set `sudo_password` in `cli-config.yaml` to auto-fill
- **Environment**: Set `SUDO_PASSWORD` in `.env` for all runs
Password is cached for the session once entered.
### Toolsets
Control which tools are available:
```yaml
# Enable all tools
toolsets:
- all
# Or enable specific toolsets
toolsets:
- web
- terminal
- skills
```
Available toolsets: `web`, `search`, `terminal`, `browser`, `vision`, `image_gen`, `skills`, `moa`, `debugging`, `safe`
### Personalities
Predefined personalities for the `/personality` command:
```yaml
agent:
personalities:
helpful: "You are a helpful, friendly AI assistant."
kawaii: "You are a kawaii assistant! Use cute expressions..."
pirate: "Arrr! Ye be talkin' to Captain Hermes..."
# Add your own!
```
Built-in personalities:
- `helpful`, `concise`, `technical`, `creative`, `teacher`
- `kawaii`, `catgirl`, `pirate`, `shakespeare`, `surfer`
- `noir`, `uwu`, `philosopher`, `hype`
## Animated Feedback
The CLI provides animated feedback during operations:
### Thinking Animation
During API calls, shows animated spinner with thinking verbs:
```
◜ (。•́︿•̀。) pondering... (1.2s)
◠ (⊙_⊙) contemplating... (2.4s)
✧٩(ˊᗜˋ*)و✧ got it! (3.1s)
```
### Tool Execution Animation
Each tool type has unique animations:
```
⠋ (◕‿◕✿) 🔍 web_search... (0.8s)
▅ (≧◡≦) 💻 terminal... (1.2s)
🌓 (★ω★) 🌐 browser_navigate... (2.1s)
✧ (✿◠‿◠) 🎨 image_generate... (4.5s)
```
## Multi-line Input
For multi-line input, end a line with `\` to continue:
```
Write a function that:\
1. Takes a list of numbers\
2. Returns the sum
```
## Environment Variable Priority
For terminal settings, `cli-config.yaml` takes precedence over `.env`:
1. `cli-config.yaml` (highest priority in CLI)
2. `.env` file
3. System environment variables
4. Default values
This allows you to have different terminal configs for CLI vs batch processing.
## Session Management
- **History**: Command history is saved to `~/.hermes_history`
- **Conversations**: Use `/save` to export conversations
- **Reset**: Use `/clear` for full reset, `/reset` to just clear history
- **Session Logs**: Every session automatically logs to `logs/session_{session_id}.json`
### Session Logging
Sessions are automatically logged to the `logs/` directory:
```
logs/
├── session_20260201_143052_a1b2c3.json
├── session_20260201_150217_d4e5f6.json
└── ...
```
The session ID is displayed in the welcome banner and follows the format: `YYYYMMDD_HHMMSS_UUID`.
Log files contain:
- Full conversation history in trajectory format
- Timestamps for session start and last update
- Model and message count metadata
This is useful for:
- Debugging agent behavior
- Replaying conversations
- Training data inspection
### Context Compression
Long conversations can exceed model context limits. The CLI automatically compresses context when approaching the limit:
```yaml
# In cli-config.yaml
compression:
enabled: true # Enable auto-compression
threshold: 0.85 # Compress at 85% of context limit
summary_model: "google/gemini-2.0-flash-001"
```
**How it works:**
1. Tracks actual token usage from each API response
2. When tokens reach threshold, middle turns are summarized
3. First 3 and last 4 turns are always protected
4. Conversation continues seamlessly after compression
**When compression triggers:**
```
📦 Context compression triggered (170,000 tokens ≥ 170,000 threshold)
📊 Model context limit: 200,000 tokens (85% = 170,000)
🗜️ Summarizing turns 4-15 (12 turns)
✅ Compressed: 20 → 9 messages (~45,000 tokens saved)
```
To disable compression:
```yaml
compression:
enabled: false
```
## Quiet Mode
The CLI runs in "quiet mode" (`HERMES_QUIET=1`), which:
- Suppresses verbose logging from tools
- Enables kawaii-style animated feedback
- Hides terminal environment warnings
- Keeps output clean and user-friendly
For verbose output (debugging), use:
```bash
./hermes --verbose
```

View File

@@ -1,124 +0,0 @@
# LLM Client
Hermes Agent uses the OpenAI Python SDK with OpenRouter as the backend, providing access to many models through a single API.
## Configuration
```python
from openai import OpenAI
client = OpenAI(
api_key=os.getenv("OPENROUTER_API_KEY"),
base_url="https://openrouter.ai/api/v1"
)
```
## Supported Models
Any model available on [OpenRouter](https://openrouter.ai/models):
```python
# Anthropic
model = "anthropic/claude-sonnet-4"
model = "anthropic/claude-opus-4"
# OpenAI
model = "openai/gpt-4o"
model = "openai/o1"
# Google
model = "google/gemini-2.0-flash"
# Open models
model = "meta-llama/llama-3.3-70b-instruct"
model = "deepseek/deepseek-chat-v3"
model = "moonshotai/kimi-k2.5"
```
## Tool Calling
Standard OpenAI function calling format:
```python
response = client.chat.completions.create(
model=model,
messages=messages,
tools=[
{
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"}
},
"required": ["query"]
}
}
}
],
)
# Check for tool calls
if response.choices[0].message.tool_calls:
for tool_call in response.choices[0].message.tool_calls:
name = tool_call.function.name
args = json.loads(tool_call.function.arguments)
# Execute tool...
```
## Reasoning Models
Some models return reasoning/thinking content:
```python
# Access reasoning if available
message = response.choices[0].message
if hasattr(message, 'reasoning_content') and message.reasoning_content:
reasoning = message.reasoning_content
# Store for trajectory export
```
## Provider Selection
OpenRouter allows selecting specific providers:
```python
response = client.chat.completions.create(
model=model,
messages=messages,
extra_body={
"provider": {
"order": ["Anthropic", "Google"], # Preferred providers
"ignore": ["Novita"], # Providers to skip
}
}
)
```
## Error Handling
Common errors and handling:
```python
try:
response = client.chat.completions.create(...)
except openai.RateLimitError:
# Back off and retry
except openai.APIError as e:
# Check e.code for specific errors
# 400 = bad request (often provider-specific)
# 502 = bad gateway (retry with different provider)
```
## Cost Tracking
OpenRouter returns usage info:
```python
usage = response.usage
print(f"Tokens: {usage.prompt_tokens} + {usage.completion_tokens}")
print(f"Cost: ${usage.cost:.6f}") # If available
```

View File

@@ -1,121 +0,0 @@
# Message Format & Trajectories
Hermes Agent uses two message formats: the **API format** for LLM calls and the **trajectory format** for training data export.
## API Message Format
Standard OpenAI chat format used during execution:
```python
messages = [
# System prompt
{"role": "system", "content": "You are a helpful assistant with tools..."},
# User query
{"role": "user", "content": "Search for Python tutorials"},
# Assistant with tool call
{
"role": "assistant",
"content": None,
"tool_calls": [{
"id": "call_abc123",
"type": "function",
"function": {
"name": "web_search",
"arguments": "{\"query\": \"Python tutorials\"}"
}
}]
},
# Tool result
{
"role": "tool",
"tool_call_id": "call_abc123",
"content": "{\"results\": [...]}"
},
# Final response
{"role": "assistant", "content": "Here's what I found..."}
]
```
## Trajectory Format (ShareGPT)
Exported for training in ShareGPT format:
```json
{
"conversations": [
{"from": "system", "value": "You are a helpful assistant..."},
{"from": "human", "value": "Search for Python tutorials"},
{"from": "gpt", "value": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"Python tutorials\"}}\n</tool_call>"},
{"from": "tool", "value": "<tool_response>\n{\"results\": [...]}\n</tool_response>"},
{"from": "gpt", "value": "Here's what I found..."}
],
"tools": "[{\"type\": \"function\", \"function\": {...}}]",
"source": "hermes-agent"
}
```
## Reasoning Content
For models that output reasoning/chain-of-thought:
**During execution** (API format):
```python
# Stored internally but not sent back to model in content
assistant_msg = {
"role": "assistant",
"content": "Here's what I found...",
"reasoning": "Let me think about this step by step..." # Internal only
}
```
**In trajectory export** (reasoning wrapped in tags):
```json
{
"from": "gpt",
"value": "<think>\nLet me think about this step by step...\n</think>\nHere's what I found..."
}
```
## Conversion Flow
```
API Response → Internal Storage → Trajectory Export
↓ ↓ ↓
tool_calls reasoning field <tool_call> tags
reasoning_content <think> tags
```
The conversion happens in `_convert_to_trajectory_format()` in `run_agent.py`.
## Ephemeral System Prompts
Batch processing supports ephemeral system prompts that guide behavior during execution but are NOT saved to trajectories:
```python
# During execution: full system prompt + ephemeral guidance
messages = [
{"role": "system", "content": SYSTEM_PROMPT + "\n\n" + ephemeral_prompt},
...
]
# In saved trajectory: only the base system prompt
trajectory = {
"conversations": [
{"from": "system", "value": SYSTEM_PROMPT}, # No ephemeral
...
]
}
```
## Trajectory Compression
Long trajectories can be compressed for training using `trajectory_compressor.py`:
- Protects first/last N turns
- Summarizes middle turns with LLM
- Targets specific token budget
- See `configs/trajectory_compression.yaml` for settings

View File

@@ -1,547 +0,0 @@
# Messaging Platform Integrations (Gateway)
Hermes Agent can connect to messaging platforms like Telegram, Discord, and WhatsApp to serve as a conversational AI assistant.
## Quick Start
```bash
# 1. Set your bot token(s) in .env file
echo 'TELEGRAM_BOT_TOKEN="your_telegram_bot_token"' >> .env
echo 'DISCORD_BOT_TOKEN="your_discord_bot_token"' >> .env
# 2. Test the gateway (foreground)
./scripts/hermes-gateway run
# 3. Install as a system service (runs in background)
./scripts/hermes-gateway install
# 4. Manage the service
./scripts/hermes-gateway start
./scripts/hermes-gateway stop
./scripts/hermes-gateway restart
./scripts/hermes-gateway status
```
**Quick test (without service install):**
```bash
python cli.py --gateway # Runs in foreground, useful for debugging
```
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────────┐
│ Hermes Gateway │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Telegram │ │ Discord │ │ WhatsApp │ │
│ │ Adapter │ │ Adapter │ │ Adapter │ │
│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │ │
│ └─────────────────┼─────────────────┘ │
│ │ │
│ ┌────────▼────────┐ │
│ │ Session Store │ │
│ │ (per-chat) │ │
│ └────────┬────────┘ │
│ │ │
│ ┌────────▼────────┐ │
│ │ AIAgent │ │
│ │ (run_agent) │ │
│ └─────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
```
## Session Management
### Session Persistence
Sessions persist across messages until they reset. The agent remembers your conversation context.
### Reset Policies
Sessions reset based on configurable policies:
| Policy | Default | Description |
|--------|---------|-------------|
| Daily | 4:00 AM | Reset at a specific hour each day |
| Idle | 120 min | Reset after N minutes of inactivity |
| Both | (combined) | Whichever triggers first |
### Manual Reset
Send `/new` or `/reset` as a message to start fresh.
### Per-Platform Overrides
Configure different reset policies per platform:
```json
{
"reset_by_platform": {
"telegram": { "mode": "idle", "idle_minutes": 240 },
"discord": { "mode": "idle", "idle_minutes": 60 }
}
}
```
## Platform Setup
### Telegram
1. **Create a bot** via [@BotFather](https://t.me/BotFather)
2. **Get your token** (looks like `123456789:ABCdefGHIjklMNOpqrsTUVwxyz`)
3. **Set environment variable:**
```bash
export TELEGRAM_BOT_TOKEN="your_token_here"
```
4. **Optional: Set home channel** for cron job delivery:
```bash
export TELEGRAM_HOME_CHANNEL="-1001234567890"
export TELEGRAM_HOME_CHANNEL_NAME="My Notes"
```
**Requirements:**
```bash
pip install python-telegram-bot>=20.0
```
### Discord
1. **Create an application** at [Discord Developer Portal](https://discord.com/developers/applications)
2. **Create a bot** under your application
3. **Get the bot token**
4. **Enable required intents:**
- Message Content Intent
- Server Members Intent (optional)
5. **Invite to your server** using OAuth2 URL generator (scopes: `bot`, `applications.commands`)
6. **Set environment variable:**
```bash
export DISCORD_BOT_TOKEN="your_token_here"
```
7. **Optional: Set home channel:**
```bash
export DISCORD_HOME_CHANNEL="123456789012345678"
export DISCORD_HOME_CHANNEL_NAME="#bot-updates"
```
**Requirements:**
```bash
pip install discord.py>=2.0
```
### WhatsApp
WhatsApp integration is more complex due to the lack of a simple bot API.
**Options:**
1. **WhatsApp Business API** (requires Meta verification)
2. **whatsapp-web.js** via Node.js bridge (for personal accounts)
**Bridge Setup:**
1. Install Node.js
2. Set up the bridge script (see `scripts/whatsapp-bridge/` for reference)
3. Configure in gateway:
```json
{
"platforms": {
"whatsapp": {
"enabled": true,
"extra": {
"bridge_script": "/path/to/bridge.js",
"bridge_port": 3000
}
}
}
}
```
## Configuration
There are **three ways** to configure the gateway (in order of precedence):
### 1. Environment Variables (`.env` file) - Recommended for Quick Setup
Add to your `~/.hermes/.env` file:
```bash
# =============================================================================
# MESSAGING PLATFORM TOKENS
# =============================================================================
# Telegram - get from @BotFather on Telegram
TELEGRAM_BOT_TOKEN=your_telegram_bot_token
TELEGRAM_ALLOWED_USERS=123456789,987654321 # Security: restrict to these user IDs
# Optional: Default channel for cron job delivery
TELEGRAM_HOME_CHANNEL=-1001234567890
TELEGRAM_HOME_CHANNEL_NAME="My Notes"
# Discord - get from Discord Developer Portal
DISCORD_BOT_TOKEN=your_discord_bot_token
DISCORD_ALLOWED_USERS=123456789012345678 # Security: restrict to these user IDs
# Optional: Default channel for cron job delivery
DISCORD_HOME_CHANNEL=123456789012345678
DISCORD_HOME_CHANNEL_NAME="#bot-updates"
# WhatsApp - requires Node.js bridge setup
WHATSAPP_ENABLED=true
# =============================================================================
# AGENT SETTINGS
# =============================================================================
# Max tool-calling iterations per conversation (default: 60)
HERMES_MAX_ITERATIONS=60
# Working directory for terminal commands (default: home ~)
MESSAGING_CWD=/home/myuser
# =============================================================================
# TOOL PROGRESS NOTIFICATIONS
# =============================================================================
# Show progress messages as agent uses tools
HERMES_TOOL_PROGRESS=true
# Mode: "new" (only when tool changes) or "all" (every tool call)
HERMES_TOOL_PROGRESS_MODE=new
# =============================================================================
# SESSION SETTINGS
# =============================================================================
# Reset sessions after N minutes of inactivity (default: 120)
SESSION_IDLE_MINUTES=120
# Daily reset hour in 24h format (default: 4 = 4am)
SESSION_RESET_HOUR=4
```
### 2. Gateway Config File (`~/.hermes/gateway.json`) - Full Control
For advanced configuration, create `~/.hermes/gateway.json`:
```json
{
"platforms": {
"telegram": {
"enabled": true,
"token": "your_telegram_token",
"home_channel": {
"platform": "telegram",
"chat_id": "-1001234567890",
"name": "My Notes"
}
},
"discord": {
"enabled": true,
"token": "your_discord_token",
"home_channel": {
"platform": "discord",
"chat_id": "123456789012345678",
"name": "#bot-updates"
}
}
},
"default_reset_policy": {
"mode": "both",
"at_hour": 4,
"idle_minutes": 120
},
"reset_by_platform": {
"discord": {
"mode": "idle",
"idle_minutes": 60
}
},
"always_log_local": true
}
```
## Platform-Specific Toolsets
Each platform has its own toolset for security:
| Platform | Toolset | Capabilities |
|----------|---------|--------------|
| CLI | `hermes-cli` | Full access (terminal, browser, etc.) |
| Telegram | `hermes-telegram` | Full tools including terminal |
| Discord | `hermes-discord` | Full tools including terminal |
| WhatsApp | `hermes-whatsapp` | Full tools including terminal |
## User Experience Features
### Typing Indicator
The gateway keeps the "typing..." indicator active throughout processing, refreshing every 4 seconds. This lets users know the bot is working even during long tool-calling sequences.
### Tool Progress Notifications
When `HERMES_TOOL_PROGRESS=true`, the bot sends status messages as it works:
```
💻 `ls -la`...
🔍 web_search...
📄 web_extract...
🎨 image_generate...
```
Terminal commands show the actual command (truncated to 50 chars). Other tools just show the tool name.
**Modes:**
- `new`: Only sends message when switching to a different tool (less spam)
- `all`: Sends message for every single tool call
### Working Directory
- **CLI (`hermes` command)**: Uses current directory where you run the command
- **Messaging**: Uses `MESSAGING_CWD` (default: home directory `~`)
This is intentional: CLI users are in a terminal and expect the agent to work in their current directory, while messaging users need a consistent starting location.
### Max Iterations
If the agent hits the max iteration limit while working, instead of a generic error, it asks the model to summarize what it found so far. This gives you a useful response even when the task couldn't be fully completed.
## Voice Messages (TTS)
The `text_to_speech` tool generates audio that the gateway delivers as native voice messages on each platform:
| Platform | Delivery | Format |
|----------|----------|--------|
| Telegram | Voice bubble (plays inline) | Opus `.ogg` — native from OpenAI/ElevenLabs, converted via ffmpeg for Edge TTS |
| Discord | Audio file attachment | MP3 |
| WhatsApp | Audio file attachment | MP3 |
| CLI | Saved to `~/voice-memos/` | MP3 |
**Providers:**
- **Edge TTS** (default) — Free, no API key, 322 voices in 74 languages
- **ElevenLabs** — Premium quality, requires `ELEVENLABS_API_KEY`
- **OpenAI TTS** — Good quality, requires `OPENAI_API_KEY`
Voice and provider are configured by the user in `~/.hermes/config.yaml` under the `tts:` key. The model only sends text; it does not choose the voice.
The tool returns a `MEDIA:<path>` tag that the gateway send pipeline intercepts and delivers as a native audio message. If `[[audio_as_voice]]` is present (Opus format available), Telegram sends it as a voice bubble instead of an audio file.
**Telegram voice bubbles & ffmpeg:**
Telegram requires Opus/OGG format for native voice bubbles (the round, inline-playable kind). **OpenAI and ElevenLabs** produce Opus natively when on Telegram — no extra setup needed. **Edge TTS** (the default free provider) outputs MP3 and needs `ffmpeg` to convert:
```bash
sudo apt install ffmpeg # Ubuntu/Debian
brew install ffmpeg # macOS
sudo dnf install ffmpeg # Fedora
```
Without ffmpeg, Edge TTS audio is sent as a regular audio file (still playable, but shows as a rectangular music player instead of a voice bubble).
## Cron Job Delivery
When scheduling cron jobs, you can specify where the output should be delivered:
```
User: "Remind me to check the server in 30 minutes"
Agent uses: schedule_cronjob(
prompt="Check server status...",
schedule="30m",
deliver="origin" # Back to this chat
)
```
### Delivery Options
| Option | Description |
|--------|-------------|
| `"origin"` | Back to where the job was created |
| `"local"` | Save to local files only |
| `"telegram"` | Telegram home channel |
| `"discord"` | Discord home channel |
| `"telegram:123456"` | Specific Telegram chat |
## Dynamic Context Injection
The agent knows where it is via injected context:
```
## Current Session Context
**Source:** Telegram (group: Dev Team, ID: -1001234567890)
**Connected Platforms:** local, telegram, discord
**Home Channels:**
- telegram: My Notes (ID: -1001234567890)
- discord: #bot-updates (ID: 123456789012345678)
**Delivery options for scheduled tasks:**
- "origin" → Back to this chat (Dev Team)
- "local" → Save to local files only
- "telegram" → Home channel (My Notes)
- "discord" → Home channel (#bot-updates)
```
## CLI Commands
| Command | Description |
|---------|-------------|
| `/platforms` | Show gateway configuration and status |
| `--gateway` | Start the gateway (CLI flag) |
## Troubleshooting
### "python-telegram-bot not installed"
```bash
pip install python-telegram-bot>=20.0
```
### "discord.py not installed"
```bash
pip install discord.py>=2.0
```
### "No platforms connected"
1. Check your environment variables are set
2. Check your tokens are valid
3. Try `/platforms` to see configuration status
### Session not persisting
1. Check `~/.hermes/sessions/` exists
2. Check session policies aren't too aggressive
3. Verify no errors in gateway logs
## Adding a New Platform
To add a new messaging platform:
### 1. Create the adapter
Create `gateway/platforms/your_platform.py`:
```python
from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
from gateway.config import Platform, PlatformConfig
class YourPlatformAdapter(BasePlatformAdapter):
def __init__(self, config: PlatformConfig):
super().__init__(config, Platform.YOUR_PLATFORM)
async def connect(self) -> bool:
# Connect to the platform
...
async def disconnect(self) -> None:
# Disconnect
...
async def send(self, chat_id: str, content: str, ...) -> SendResult:
# Send a message
...
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
# Get chat information
...
```
### 2. Register the platform
Add to `gateway/config.py`:
```python
class Platform(Enum):
# ... existing ...
YOUR_PLATFORM = "your_platform"
```
### 3. Add to gateway runner
Update `gateway/run.py` `_create_adapter()`:
```python
elif platform == Platform.YOUR_PLATFORM:
from gateway.platforms.your_platform import YourPlatformAdapter
return YourPlatformAdapter(config)
```
### 4. Create a toolset (optional)
Add to `toolsets.py`:
```python
"hermes-your-platform": {
"description": "Your platform toolset",
"tools": [...],
"includes": []
}
```
### 5. Configure
Add environment variables to `.env`:
```bash
YOUR_PLATFORM_TOKEN=...
YOUR_PLATFORM_HOME_CHANNEL=...
```
## Service Management
### Linux (systemd)
```bash
# Install as user service
./scripts/hermes-gateway install
# Manage
systemctl --user start hermes-gateway
systemctl --user stop hermes-gateway
systemctl --user restart hermes-gateway
systemctl --user status hermes-gateway
# View logs
journalctl --user -u hermes-gateway -f
# Enable lingering (keeps running after logout)
sudo loginctl enable-linger $USER
```
### macOS (launchd)
```bash
# Install
./scripts/hermes-gateway install
# Manage
launchctl start ai.hermes.gateway
launchctl stop ai.hermes.gateway
# View logs
tail -f ~/.hermes/logs/gateway.log
```
### Manual (any platform)
```bash
# Run in foreground (for testing/debugging)
./scripts/hermes-gateway run
# Or via CLI (also foreground)
python cli.py --gateway
```
## Storage Locations
| Path | Purpose |
|------|---------|
| `~/.hermes/gateway.json` | Gateway configuration |
| `~/.hermes/sessions/sessions.json` | Session index |
| `~/.hermes/sessions/{id}.jsonl` | Conversation transcripts |
| `~/.hermes/cron/output/` | Cron job outputs |
| `~/.hermes/logs/gateway.log` | Gateway logs (macOS launchd) |

View File

@@ -1,163 +0,0 @@
# Tools
Tools are functions that extend the agent's capabilities. Each tool is defined with an OpenAI-compatible JSON schema and an async handler function.
## Tool Structure
Each tool module in `tools/` exports:
1. **Schema definitions** - OpenAI function-calling format
2. **Handler functions** - Async functions that execute the tool
```python
# Example: tools/web_tools.py
# Schema definition
WEB_SEARCH_SCHEMA = {
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web for information",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string", "description": "Search query"}
},
"required": ["query"]
}
}
}
# Handler function
async def web_search(query: str) -> dict:
"""Execute web search and return results."""
# Implementation...
return {"results": [...]}
```
## Tool Categories
| Category | Module | Tools |
|----------|--------|-------|
| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal/ssh backends) |
| **File** | `file_tools.py` | `read_file`, `write_file`, `patch`, `search` |
| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
| **Vision** | `vision_tools.py` | `vision_analyze` |
| **Image Gen** | `image_generation_tool.py` | `image_generate` |
| **TTS** | `tts_tool.py` | `text_to_speech` (Edge TTS free / ElevenLabs / OpenAI) |
| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
| **Skills** | `skills_tool.py` | `skills_list`, `skill_view` |
| **Cronjob** | `cronjob_tools.py` | `schedule_cronjob`, `list_cronjobs`, `remove_cronjob` |
| **RL Training** | `rl_training_tool.py` | `rl_list_environments`, `rl_start_training`, `rl_check_status`, etc. |
## Tool Registration
Tools are registered in `model_tools.py`:
```python
# model_tools.py
TOOL_SCHEMAS = [
*WEB_TOOL_SCHEMAS,
*TERMINAL_TOOL_SCHEMAS,
*BROWSER_TOOL_SCHEMAS,
# ...
]
TOOL_HANDLERS = {
"web_search": web_search,
"terminal": terminal_tool,
"browser_navigate": browser_navigate,
# ...
}
```
## Toolsets
Tools are grouped into **toolsets** for logical organization (see `toolsets.py`):
```python
TOOLSETS = {
"web": {
"description": "Web search and content extraction",
"tools": ["web_search", "web_extract", "web_crawl"]
},
"terminal": {
"description": "Command execution",
"tools": ["terminal"]
},
# ...
}
```
## Adding a New Tool
1. Create handler function in `tools/your_tool.py`
2. Define JSON schema following OpenAI format
3. Register in `model_tools.py` (schemas and handlers)
4. Add to appropriate toolset in `toolsets.py`
5. Update `tools/__init__.py` exports
## Stateful Tools
Some tools maintain state across calls within a session:
- **Terminal**: Keeps container/sandbox running between commands
- **Browser**: Maintains browser session for multi-step navigation
State is managed per `task_id` and cleaned up automatically.
## Terminal Backends
The terminal tool supports multiple execution backends:
| Backend | Description | Use Case |
|---------|-------------|----------|
| `local` | Direct execution on host | Development, simple tasks |
| `ssh` | Remote execution via SSH | Sandboxing (agent can't modify its own code) |
| `docker` | Docker container | Isolation, reproducibility |
| `singularity` | Singularity/Apptainer | HPC clusters, rootless containers |
| `modal` | Modal cloud | Scalable cloud compute, GPUs |
Configure via environment variables or `cli-config.yaml`:
```yaml
# SSH backend example (in cli-config.yaml)
terminal:
env_type: "ssh"
ssh_host: "my-server.example.com"
ssh_user: "myuser"
ssh_key: "~/.ssh/id_rsa"
cwd: "/home/myuser/project"
```
The SSH backend uses ControlMaster for connection persistence, making subsequent commands fast.
## Skills Tools (Progressive Disclosure)
Skills are on-demand knowledge documents. They use **progressive disclosure** to minimize tokens:
```
Level 0: skills_categories() → ["mlops", "devops"] (~50 tokens)
Level 1: skills_list(category) → [{name, description}, ...] (~3k tokens)
Level 2: skill_view(name) → Full content + metadata (varies)
Level 3: skill_view(name, path) → Specific reference file (varies)
```
Skill directory structure:
```
skills/
└── mlops/
└── axolotl/
├── SKILL.md # Main instructions (required)
├── references/ # Additional docs
└── templates/ # Output formats, configs
```
SKILL.md uses YAML frontmatter:
```yaml
---
name: axolotl
description: Fine-tuning LLMs with Axolotl
tags: [Fine-Tuning, LoRA, DPO]
---
```

View File

@@ -1,330 +0,0 @@
# Hermes-Agent Atropos Environments
This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation.
## Architecture Overview
```
Atropos Framework
┌───────────────────────┐
│ BaseEnv │ (atroposlib)
│ - Server management │
│ - Worker scheduling │
│ - Wandb logging │
│ - CLI (serve/process/ │
│ evaluate) │
└───────────┬───────────┘
│ inherits
┌───────────┴───────────┐
│ HermesAgentBaseEnv │ hermes_base_env.py
│ - Terminal backend │
│ - Tool resolution │
│ - Agent loop │
│ - ToolContext │
│ - Async patches │
└───────────┬───────────┘
│ inherits
┌─────────────────┼─────────────────┐
│ │ │
TerminalTestEnv HermesSweEnv TerminalBench2EvalEnv
(stack testing) (SWE training) (TB2 benchmark eval)
```
### Inheritance Chain
**BaseEnv** (from `atroposlib`) is the Atropos base class. It provides:
- Server management (OpenAI-compatible API servers, VLLM, SGLang)
- Worker scheduling for parallel rollouts
- Wandb integration for metrics and rollout logging
- CLI interface with three subcommands: `serve`, `process`, `evaluate`
- `evaluate_log()` for saving eval results to JSON + samples.jsonl
**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics:
- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, modal, ssh, singularity)
- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` from `model_tools.py`)
- Implements `collect_trajectory()` which runs the full agent loop and computes rewards
- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer)
- Applies monkey patches for async-safe tool operation at import time
Concrete environments inherit from `HermesAgentBaseEnv` and implement:
- `setup()` -- Load dataset, initialize state
- `get_next_item()` -- Return the next item for rollout
- `format_prompt()` -- Convert a dataset item into the user message
- `compute_reward()` -- Score the rollout using ToolContext
- `evaluate()` -- Periodic evaluation logic
## Core Components
### Agent Loop (`agent_loop.py`)
`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`:
1. Send messages + tools to the API via `server.chat_completion()`
2. If the response contains `tool_calls`, execute each one via `handle_function_call()` from `model_tools.py`
3. Append tool results to the conversation and go back to step 1
4. If the response has no tool_calls, the agent is done
Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker) don't deadlock inside Atropos's event loop.
Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2).
### Tool Context (`tool_context.py`)
`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved.
```python
async def compute_reward(self, item, result, ctx: ToolContext):
# Run tests in the model's terminal sandbox
test = ctx.terminal("pytest -v")
if test["exit_code"] == 0:
return 1.0
# Check if a file was created
content = ctx.read_file("/workspace/solution.py")
if content.get("content"):
return 0.5
# Download files locally for verification (binary-safe)
ctx.download_file("/remote/output.bin", "/local/output.bin")
return 0.0
```
Available methods:
- **Terminal**: `terminal(command, timeout)` -- run shell commands
- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)`
- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox
- **Web**: `web_search(query)`, `web_extract(urls)`
- **Browser**: `browser_navigate(url)`, `browser_snapshot()`
- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name
- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`)
### Patches (`patches.py`)
**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., mini-swe-agent's Modal backend via SWE-ReX). This crashes when called from inside Atropos's event loop because `asyncio.run()` cannot be nested.
**Solution**: `patches.py` monkey-patches `SwerexModalEnvironment` to use a dedicated background thread (`_AsyncWorker`) with its own event loop. The calling code sees the same sync interface, but internally the async work happens on a separate thread that doesn't conflict with Atropos's loop.
What gets patched:
- `SwerexModalEnvironment.__init__` -- creates Modal deployment on a background thread
- `SwerexModalEnvironment.execute` -- runs commands on the same background thread
- `SwerexModalEnvironment.stop` -- stops deployment on the background thread
The patches are:
- **Idempotent** -- calling `apply_patches()` multiple times is safe
- **Transparent** -- same interface and behavior, only the internal async execution changes
- **Universal** -- works identically in normal CLI use (no running event loop)
Applied automatically at import time by `hermes_base_env.py`.
### Tool Call Parsers (`tool_call_parsers/`)
Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing.
Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types.
Available parsers:
- `hermes` -- Hermes/ChatML `<tool_call>` XML format
- `mistral` -- Mistral `[TOOL_CALLS]` format
- `llama3_json` -- Llama 3 JSON tool calling
- `qwen` -- Qwen tool calling format
- `qwen3_coder` -- Qwen3 Coder format
- `deepseek_v3` -- DeepSeek V3 format
- `deepseek_v3_1` -- DeepSeek V3.1 format
- `kimi_k2` -- Kimi K2 format
- `longcat` -- Longcat format
- `glm45` / `glm47` -- GLM model formats
Usage:
```python
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
content, tool_calls = parser.parse(raw_model_output)
```
In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively.
## Two-Phase Operation
### Phase 1: OpenAI Server (Evaluation / SFT Data Generation)
Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`.
- Good for: evaluation, SFT data generation, testing
- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands
- Placeholder tokens are created for the Atropos pipeline
### Phase 2: VLLM ManagedServer (Full RL Training)
Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
- Good for: full RL training with GRPO/PPO
- Run with: `serve` subcommand
- Real tokens, masks, and logprobs flow through the pipeline
## Directory Structure
```
environments/
├── README.md # This file
├── __init__.py # Package exports
├── hermes_base_env.py # Abstract base (HermesAgentBaseEnv)
├── agent_loop.py # Multi-turn agent engine (HermesAgentLoop)
├── tool_context.py # Per-rollout tool access for reward functions
├── patches.py # Async-safety patches for Modal backend
├── tool_call_parsers/ # Phase 2 client-side parsers
│ ├── __init__.py # Registry + base class
│ ├── hermes_parser.py
│ ├── mistral_parser.py
│ ├── llama_parser.py
│ ├── qwen_parser.py
│ ├── qwen3_coder_parser.py
│ ├── deepseek_v3_parser.py
│ ├── deepseek_v3_1_parser.py
│ ├── kimi_k2_parser.py
│ ├── longcat_parser.py
│ ├── glm45_parser.py
│ └── glm47_parser.py
├── terminal_test_env/ # Stack validation environment
│ └── terminal_test_env.py
├── hermes_swe_env/ # SWE-bench style training environment
│ └── hermes_swe_env.py
└── benchmarks/ # Evaluation benchmarks
└── terminalbench_2/
└── terminalbench2_env.py
```
## Concrete Environments
### TerminalTestEnv (`terminal_test_env/`)
A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks the content matches.
```bash
# Serve mode (needs run-api)
run-api
python environments/terminal_test_env/terminal_test_env.py serve
# Process mode (no run-api, saves to JSONL)
python environments/terminal_test_env/terminal_test_env.py process \
--env.data_path_to_save_groups terminal_test_output.jsonl
```
### HermesSweEnv (`hermes_swe_env/`)
SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox.
```bash
python environments/hermes_swe_env/hermes_swe_env.py serve \
--openai.model_name YourModel \
--env.dataset_name bigcode/humanevalpack \
--env.terminal_backend modal
```
### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`)
**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness.
Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.):
- Run via `evaluate` subcommand (no `run-api` needed)
- `setup()` loads the dataset, `evaluate()` runs all tasks
- `rollout_and_score_eval()` handles per-task agent loop + test verification
- Downloads verifier output locally for reliable reward checking (Harbor pattern)
```bash
# Run full benchmark
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
--openai.model_name anthropic/claude-opus-4.6
# Run subset of tasks
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
--openai.model_name anthropic/claude-opus-4.6 \
--env.task_filter fix-git,git-multibranch
# Skip specific tasks
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
--openai.model_name anthropic/claude-opus-4.6 \
--env.skip_tasks heavy-task,slow-task
```
## Creating a New Environment
### Training Environment
1. Create a new directory under `environments/`
2. Create your env file inheriting from `HermesAgentBaseEnv`
3. Implement the four abstract methods + `evaluate()`
```python
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
class MyEnvConfig(HermesAgentEnvConfig):
pass # Add custom fields as needed
class MyEnv(HermesAgentBaseEnv):
name = "my-env"
env_config_cls = MyEnvConfig
@classmethod
def config_init(cls):
env_config = MyEnvConfig(
enabled_toolsets=["terminal", "file"],
terminal_backend="modal",
# ... other config
)
server_configs = [APIServerConfig(...)]
return env_config, server_configs
async def setup(self):
self.dataset = load_dataset(...)
self.iter = 0
async def get_next_item(self):
item = self.dataset[self.iter % len(self.dataset)]
self.iter += 1
return item
def format_prompt(self, item):
return item["instruction"]
async def compute_reward(self, item, result, ctx):
# ctx gives you full tool access to the rollout's sandbox
test = ctx.terminal("pytest -v")
return 1.0 if test["exit_code"] == 0 else 0.0
async def evaluate(self, *args, **kwargs):
# Periodic evaluation logic
...
if __name__ == "__main__":
MyEnv.cli()
```
### Eval-Only Environment (Benchmark)
For eval benchmarks, follow the pattern in `terminalbench2_env.py`:
1. Create under `environments/benchmarks/your-benchmark/`
2. Inherit from `HermesAgentBaseEnv`
3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1`
4. Stub the training methods (`collect_trajectories`, `score`)
5. Implement `rollout_and_score_eval()` and `evaluate()`
6. Run with `evaluate` subcommand
## Key Config Fields
| Field | Description | Default |
|-------|-------------|---------|
| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) |
| `disabled_toolsets` | Toolsets to disable | `None` |
| `distribution` | Probabilistic toolset distribution name | `None` |
| `max_agent_turns` | Max LLM calls per rollout | `30` |
| `agent_temperature` | Sampling temperature | `1.0` |
| `terminal_backend` | `local`, `docker`, `modal`, `ssh`, `singularity` | `local` |
| `system_prompt` | System message for the agent | `None` |
| `tool_call_parser` | Parser name for Phase 2 | `hermes` |
| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` |

View File

@@ -1,32 +0,0 @@
"""
Hermes-Agent Atropos Environments
Provides a layered integration between hermes-agent's tool-calling capabilities
and the Atropos RL training framework.
Core layers:
- agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
- tool_context: Per-rollout tool access handle for reward/verification functions
- hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
- tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
Concrete environments:
- terminal_test_env/: Simple file-creation tasks for testing the stack
- hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
- endless_terminals/: Terminal tasks from HuggingFace dataset with Apptainer containers
Benchmarks (eval-only):
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
"""
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
__all__ = [
"AgentResult",
"HermesAgentLoop",
"ToolContext",
"HermesAgentBaseEnv",
"HermesAgentEnvConfig",
]

View File

@@ -1,421 +0,0 @@
"""
HermesAgentLoop -- Reusable Multi-Turn Agent Engine
Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
Works with any server that returns ChatCompletion objects with tool_calls:
- Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
- Phase 2: ManagedServer with client-side tool call parser
The loop passes tools= and checks response.choices[0].message.tool_calls,
identical to hermes-agent's run_agent.py. Tool execution is dispatched via
handle_function_call() from model_tools.py.
"""
import asyncio
import concurrent.futures
import json
import logging
import os
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set
from model_tools import handle_function_call
# Thread pool for running sync tool calls that internally use asyncio.run()
# (e.g., mini-swe-agent's modal/docker backends). Running them in a separate
# thread gives them a clean event loop so they don't deadlock inside Atropos's loop.
# Size must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all
# making tool calls). Too small = thread pool starvation, tasks queue for minutes.
# Resized at runtime by HermesAgentBaseEnv.__init__ via resize_tool_pool().
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=128)
def resize_tool_pool(max_workers: int):
"""
Replace the global tool executor with a new one of the given size.
Called by HermesAgentBaseEnv.__init__ based on config.tool_pool_size.
Safe to call before any tasks are submitted.
"""
global _tool_executor
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
logger.info("Tool thread pool resized to %d workers", max_workers)
logger = logging.getLogger(__name__)
@dataclass
class ToolError:
"""Record of a tool execution error during the agent loop."""
turn: int # Which turn the error occurred on
tool_name: str # Which tool was called
arguments: str # The arguments passed (truncated)
error: str # The error message
tool_result: str # The raw result returned to the model
@dataclass
class AgentResult:
"""Result of running the agent loop."""
# Full conversation history in OpenAI message format
messages: List[Dict[str, Any]]
# ManagedServer.get_state() if available (Phase 2), None otherwise
managed_state: Optional[Dict[str, Any]] = None
# How many LLM calls were made
turns_used: int = 0
# True if model stopped calling tools naturally (vs hitting max_turns)
finished_naturally: bool = False
# Extracted reasoning content per turn (from PR #297 helpers)
reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
# Tool errors encountered during the loop
tool_errors: List[ToolError] = field(default_factory=list)
def _extract_reasoning_from_message(message) -> Optional[str]:
"""
Extract reasoning content from a ChatCompletion message.
Handles multiple provider formats:
1. message.reasoning_content field (some providers)
2. message.reasoning field (some providers)
3. message.reasoning_details[].text (OpenRouter style)
Note: <think> block extraction from content is NOT done here -- that's
handled by the response already in Phase 1 (server does it) or by
ManagedServer's patch in Phase 2.
Args:
message: The assistant message from ChatCompletion response
Returns:
Extracted reasoning text, or None if not found
"""
# Check reasoning_content field (common across providers)
if hasattr(message, "reasoning_content") and message.reasoning_content:
return message.reasoning_content
# Check reasoning field
if hasattr(message, "reasoning") and message.reasoning:
return message.reasoning
# Check reasoning_details (OpenRouter style)
if hasattr(message, "reasoning_details") and message.reasoning_details:
for detail in message.reasoning_details:
if hasattr(detail, "text") and detail.text:
return detail.text
if isinstance(detail, dict) and detail.get("text"):
return detail["text"]
return None
class HermesAgentLoop:
"""
Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
Same pattern as run_agent.py:
- Pass tools= to the API
- Check response.choices[0].message.tool_calls
- Dispatch via handle_function_call()
Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
or ManagedServer with a parser. The server determines how tool_calls get
populated on the response.
"""
def __init__(
self,
server,
tool_schemas: List[Dict[str, Any]],
valid_tool_names: Set[str],
max_turns: int = 30,
task_id: Optional[str] = None,
temperature: float = 1.0,
max_tokens: Optional[int] = None,
extra_body: Optional[Dict[str, Any]] = None,
):
"""
Initialize the agent loop.
Args:
server: Server object with chat_completion() method (OpenAIServer,
ManagedServer, ServerManager, etc.)
tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
valid_tool_names: Set of tool names the model is allowed to call
max_turns: Maximum number of LLM calls before stopping
task_id: Unique ID for terminal/browser session isolation
temperature: Sampling temperature for generation
max_tokens: Max tokens per generation (None for server default)
extra_body: Extra parameters passed to the OpenAI client's create() call.
Used for OpenRouter provider preferences, transforms, etc.
e.g. {"provider": {"ignore": ["DeepInfra"]}}
"""
self.server = server
self.tool_schemas = tool_schemas
self.valid_tool_names = valid_tool_names
self.max_turns = max_turns
self.task_id = task_id or str(uuid.uuid4())
self.temperature = temperature
self.max_tokens = max_tokens
self.extra_body = extra_body
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
"""
Execute the full agent loop using standard OpenAI tool calling.
Args:
messages: Initial conversation messages (system + user).
Modified in-place as the conversation progresses.
Returns:
AgentResult with full conversation history, managed state, and metadata
"""
reasoning_per_turn = []
tool_errors: List[ToolError] = []
import time as _time
for turn in range(self.max_turns):
turn_start = _time.monotonic()
# Build the chat_completion kwargs
chat_kwargs = {
"messages": messages,
"n": 1,
"temperature": self.temperature,
}
# Only pass tools if we have them
if self.tool_schemas:
chat_kwargs["tools"] = self.tool_schemas
# Only pass max_tokens if explicitly set
if self.max_tokens is not None:
chat_kwargs["max_tokens"] = self.max_tokens
# Inject extra_body for provider-specific params (e.g., OpenRouter
# provider preferences like banned/preferred providers, transforms)
if self.extra_body:
chat_kwargs["extra_body"] = self.extra_body
# Make the API call -- standard OpenAI spec
api_start = _time.monotonic()
try:
response = await self.server.chat_completion(**chat_kwargs)
except Exception as e:
api_elapsed = _time.monotonic() - api_start
logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
api_elapsed = _time.monotonic() - api_start
if not response or not response.choices:
logger.warning("Empty response on turn %d (api=%.1fs)", turn + 1, api_elapsed)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
assistant_msg = response.choices[0].message
# Extract reasoning content from the response (all provider formats)
reasoning = _extract_reasoning_from_message(assistant_msg)
reasoning_per_turn.append(reasoning)
# Check for tool calls -- standard OpenAI spec
if assistant_msg.tool_calls:
# Build the assistant message dict for conversation history
msg_dict: Dict[str, Any] = {
"role": "assistant",
"content": assistant_msg.content or "",
"tool_calls": [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments,
},
}
for tc in assistant_msg.tool_calls
],
}
# Preserve reasoning_content for multi-turn chat template handling
# (e.g., Kimi-K2's template renders <think> blocks differently
# for history vs. the latest turn based on this field)
if reasoning:
msg_dict["reasoning_content"] = reasoning
messages.append(msg_dict)
# Execute each tool call via hermes-agent's dispatch
for tc in assistant_msg.tool_calls:
tool_name = tc.function.name
tool_args_raw = tc.function.arguments
# Validate tool name
if tool_name not in self.valid_tool_names:
tool_result = json.dumps(
{
"error": f"Unknown tool '{tool_name}'. "
f"Available tools: {sorted(self.valid_tool_names)}"
}
)
tool_errors.append(ToolError(
turn=turn + 1, tool_name=tool_name,
arguments=tool_args_raw[:200],
error=f"Unknown tool '{tool_name}'",
tool_result=tool_result,
))
logger.warning(
"Model called unknown tool '%s' on turn %d",
tool_name, turn + 1,
)
else:
# Parse arguments and dispatch
try:
args = json.loads(tool_args_raw)
except json.JSONDecodeError:
args = {}
logger.warning(
"Invalid JSON in tool call arguments for '%s': %s",
tool_name, tool_args_raw[:200],
)
try:
if tool_name == "terminal":
backend = os.getenv("TERMINAL_ENV", "local")
cmd_preview = args.get("command", "")[:80]
logger.info(
"[%s] $ %s", self.task_id[:8], cmd_preview,
)
# Run tool calls in a thread pool so backends that use
# asyncio.run() internally (modal, docker) get a clean
# event loop instead of deadlocking inside Atropos's loop.
tool_submit_time = _time.monotonic()
loop = asyncio.get_event_loop()
tool_result = await loop.run_in_executor(
_tool_executor,
lambda: handle_function_call(
tool_name, args, task_id=self.task_id
),
)
tool_elapsed = _time.monotonic() - tool_submit_time
# Log slow tools and thread pool stats for debugging
pool_active = _tool_executor._work_queue.qsize()
if tool_elapsed > 30:
logger.warning(
"[%s] turn %d: %s took %.1fs (pool queue=%d)",
self.task_id[:8], turn + 1, tool_name,
tool_elapsed, pool_active,
)
except Exception as e:
tool_result = json.dumps(
{"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
)
tool_errors.append(ToolError(
turn=turn + 1, tool_name=tool_name,
arguments=tool_args_raw[:200],
error=f"{type(e).__name__}: {str(e)}",
tool_result=tool_result,
))
logger.error(
"Tool '%s' execution failed on turn %d: %s",
tool_name, turn + 1, e,
)
# Also check if the tool returned an error in its JSON result
try:
result_data = json.loads(tool_result)
if isinstance(result_data, dict):
err = result_data.get("error")
exit_code = result_data.get("exit_code")
if err and exit_code and exit_code < 0:
tool_errors.append(ToolError(
turn=turn + 1, tool_name=tool_name,
arguments=tool_args_raw[:200],
error=str(err),
tool_result=tool_result[:500],
))
except (json.JSONDecodeError, TypeError):
pass
# Add tool response to conversation
messages.append(
{
"role": "tool",
"tool_call_id": tc.id,
"content": tool_result,
}
)
turn_elapsed = _time.monotonic() - turn_start
logger.info(
"[%s] turn %d: api=%.1fs, %d tools, turn_total=%.1fs",
self.task_id[:8], turn + 1, api_elapsed,
len(assistant_msg.tool_calls), turn_elapsed,
)
else:
# No tool calls -- model is done
msg_dict = {
"role": "assistant",
"content": assistant_msg.content or "",
}
if reasoning:
msg_dict["reasoning_content"] = reasoning
messages.append(msg_dict)
turn_elapsed = _time.monotonic() - turn_start
logger.info(
"[%s] turn %d: api=%.1fs, no tools (finished), turn_total=%.1fs",
self.task_id[:8], turn + 1, api_elapsed, turn_elapsed,
)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=True,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
# Hit max turns without the model stopping
logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=self.max_turns,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
def _get_managed_state(self) -> Optional[Dict[str, Any]]:
"""
Get ManagedServer state if the server supports it.
Returns state dict with SequenceNodes containing tokens/logprobs/masks,
or None if the server doesn't support get_state() (e.g., regular OpenAI server).
"""
if hasattr(self.server, "get_state"):
return self.server.get_state()
return None

View File

@@ -1,38 +0,0 @@
# Terminal-Bench 2.0 Evaluation -- Default Configuration
#
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
# --config environments/benchmarks/terminalbench_2/default.yaml
#
# # Override model:
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
# --config environments/benchmarks/terminalbench_2/default.yaml \
# --openai.model_name anthropic/claude-sonnet-4
env:
enabled_toolsets: ["terminal", "file"]
max_agent_turns: 60
max_token_length: 32000
agent_temperature: 0.8
terminal_backend: "modal"
terminal_timeout: 300 # 5 min per command (builds, pip install)
tool_pool_size: 128 # thread pool for 89 parallel tasks
dataset_name: "NousResearch/terminal-bench-2"
test_timeout: 600
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
use_wandb: true
wandb_name: "terminal-bench-2"
ensure_scores_are_not_same: false
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-opus-4.6"
server_type: "openai"
health_check: false
# api_key loaded from OPENROUTER_API_KEY in .env

View File

@@ -1,32 +0,0 @@
#!/bin/bash
# Terminal-Bench 2.0 Evaluation
#
# Run from repo root:
# bash environments/benchmarks/terminalbench_2/run_eval.sh
#
# Override model:
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
# --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
# --env.task_filter fix-git,git-multibranch
mkdir -p logs evals/terminal-bench-2
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
echo "Terminal-Bench 2.0 Evaluation"
echo "Log: $LOG_FILE"
echo ""
export TERMINAL_ENV=modal
export TERMINAL_TIMEOUT=300
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
--config environments/benchmarks/terminalbench_2/default.yaml \
"$@" \
2>&1 | tee "$LOG_FILE"
echo ""
echo "Log saved to: $LOG_FILE"

View File

@@ -1,904 +0,0 @@
"""
TerminalBench2Env -- Terminal-Bench 2.0 Evaluation Environment
Evaluates agentic LLMs on challenging terminal tasks from Terminal-Bench 2.0.
Each task provides a unique Docker environment (pre-built on Docker Hub), a natural
language instruction, and a test suite for verification. The agent uses terminal +
file tools to complete the task, then the test suite runs inside the same sandbox.
This is an eval-only environment (not a training environment). It is designed to
be run via the `evaluate` subcommand:
python environments/terminalbench2_env.py evaluate \\
--env.dataset_name NousResearch/terminal-bench-2
The evaluate flow:
1. setup() -- Loads the TB2 dataset from HuggingFace
2. evaluate() -- Iterates over all tasks, running each through:
a. rollout_and_score_eval() -- Per-task agent loop + test verification
- Resolves Docker image (pre-built Hub image or Dockerfile fallback)
- Registers per-task Modal sandbox via register_task_env_overrides()
- Runs the HermesAgentLoop (terminal + file tools)
- Uploads test suite and runs test.sh in the same sandbox
- Returns binary pass/fail result
b. Aggregates per-task, per-category, and overall pass rates
c. Logs results via evaluate_log() and wandb
Key features:
- Per-task Modal sandboxes using pre-built Docker Hub images
- Binary reward: 1.0 if all tests pass, 0.0 otherwise
- Concurrency-controlled parallel evaluation via asyncio.Semaphore
- Per-task, per-category, and aggregate pass rate tracking
"""
import asyncio
import base64
import io
import json
import logging
import os
import shutil
import sys
import tarfile
import tempfile
import time
import uuid
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
# Ensure repo root is on sys.path for imports
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from pydantic import Field
from atroposlib.envs.base import EvalHandlingEnum
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext
from tools.terminal_tool import (
register_task_env_overrides,
clear_task_env_overrides,
cleanup_vm,
)
logger = logging.getLogger(__name__)
# =============================================================================
# Configuration
# =============================================================================
class TerminalBench2EvalConfig(HermesAgentEnvConfig):
"""
Configuration for the Terminal-Bench 2.0 evaluation environment.
Extends HermesAgentEnvConfig with TB2-specific settings for dataset loading,
test execution, task filtering, and eval concurrency.
"""
# --- Dataset ---
dataset_name: str = Field(
default="NousResearch/terminal-bench-2",
description="HuggingFace dataset containing TB2 tasks.",
)
# --- Test execution ---
test_timeout: int = Field(
default=180,
description="Timeout in seconds for running the test suite after agent completes.",
)
# --- Image strategy ---
force_build: bool = Field(
default=False,
description="If True, always build from Dockerfile (ignore docker_image). "
"Useful for testing custom Dockerfiles.",
)
# --- Task filtering (comma-separated from CLI) ---
task_filter: Optional[str] = Field(
default=None,
description="Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
"If not set, all tasks are run.",
)
skip_tasks: Optional[str] = Field(
default=None,
description="Comma-separated task names to skip on top of the default skip list.",
)
# --- Per-task wall-clock timeout ---
task_timeout: int = Field(
default=1800,
description="Maximum wall-clock seconds per task (agent loop + verification). "
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
)
# Tasks that cannot run properly on Modal and are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = {
"qemu-startup", # Needs KVM/hardware virtualization
"qemu-alpine-ssh", # Needs KVM/hardware virtualization
"crack-7z-hash", # Password brute-force -- too slow for cloud sandbox timeouts
}
# =============================================================================
# Tar extraction helper
# =============================================================================
def _extract_base64_tar(b64_data: str, target_dir: Path):
"""Extract a base64-encoded tar.gz archive into target_dir."""
if not b64_data:
return
raw = base64.b64decode(b64_data)
buf = io.BytesIO(raw)
with tarfile.open(fileobj=buf, mode="r:gz") as tar:
tar.extractall(path=str(target_dir))
# =============================================================================
# Main Environment
# =============================================================================
class TerminalBench2EvalEnv(HermesAgentBaseEnv):
"""
Terminal-Bench 2.0 evaluation environment (eval-only, no training).
Inherits from HermesAgentBaseEnv for:
- Terminal backend setup (os.environ["TERMINAL_ENV"])
- Tool resolution via _resolve_tools_for_group()
- Monkey patches for async-safe tool operation
- Wandb trajectory formatting
The evaluate flow (triggered by `environment.py evaluate`):
1. setup() -- Load dataset from HuggingFace
2. evaluate() -- Run all tasks through rollout_and_score_eval()
Each task in rollout_and_score_eval():
1. Resolve Docker image (pre-built Hub image or Dockerfile fallback)
2. Register per-task Modal sandbox override
3. Run HermesAgentLoop with terminal + file tools
4. Upload test suite and execute test.sh in the same sandbox
5. Check /logs/verifier/reward.txt for pass/fail
6. Clean up sandbox, overrides, and temp files
"""
name = "terminal-bench-2"
env_config_cls = TerminalBench2EvalConfig
@classmethod
def config_init(cls) -> Tuple[TerminalBench2EvalConfig, List[APIServerConfig]]:
"""
Default configuration for Terminal-Bench 2.0 evaluation.
Uses eval-only settings:
- eval_handling=STOP_TRAIN so the eval flow runs cleanly
- steps_per_eval=1, total_steps=1 so eval triggers immediately
- group_size=1 (one rollout per group, each task is expensive)
Uses Modal terminal backend (cloud-isolated sandbox per task) and
OpenRouter with Claude for inference.
"""
env_config = TerminalBench2EvalConfig(
# Terminal + file tools only (the agent interacts via shell commands)
enabled_toolsets=["terminal", "file"],
disabled_toolsets=None,
distribution=None,
# Agent settings -- TB2 tasks are complex, need many turns
max_agent_turns=60,
max_token_length=16000,
agent_temperature=0.6,
system_prompt=None,
# Modal backend for per-task cloud-isolated sandboxes
terminal_backend="modal",
terminal_timeout=300, # 5 min per command (builds, pip install, etc.)
# Test execution timeout (TB2 test scripts can install deps like pytest)
test_timeout=180,
# 89 tasks run in parallel, each needs a thread for tool calls
tool_pool_size=128,
# --- Eval-only Atropos settings ---
# These settings make the env work as an eval-only environment:
# - STOP_TRAIN: pauses training during eval (standard for eval envs)
# - steps_per_eval=1, total_steps=1: eval triggers immediately
# - group_size=1: one rollout per group (each task is expensive)
eval_handling=EvalHandlingEnum.STOP_TRAIN,
group_size=1,
steps_per_eval=1,
total_steps=1,
tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
use_wandb=True,
wandb_name="terminal-bench-2",
ensure_scores_are_not_same=False, # Binary rewards may all be 0 or 1
)
# OpenRouter with Claude -- API key loaded from .env
server_configs = [
APIServerConfig(
base_url="https://openrouter.ai/api/v1",
model_name="anthropic/claude-sonnet-4",
server_type="openai",
api_key=os.getenv("OPENROUTER_API_KEY", ""),
health_check=False,
)
]
return env_config, server_configs
# =========================================================================
# Setup -- load dataset
# =========================================================================
async def setup(self):
"""Load the Terminal-Bench 2.0 dataset from HuggingFace."""
from datasets import load_dataset
# Auto-set terminal_lifetime to task_timeout + 120s so sandboxes
# never get killed during an active task, but still get cleaned up
# promptly after the task times out.
lifetime = self.config.task_timeout + 120
self.config.terminal_lifetime = lifetime
os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
print(f" Terminal lifetime auto-set to {lifetime}s (task_timeout + 120s)")
print(f"Loading TB2 dataset from: {self.config.dataset_name}")
ds = load_dataset(self.config.dataset_name, split="train")
# Apply task filters (comma-separated strings from CLI)
tasks = list(ds)
if self.config.task_filter:
allowed = {name.strip() for name in self.config.task_filter.split(",")}
tasks = [t for t in tasks if t["task_name"] in allowed]
print(f" Filtered to {len(tasks)} tasks: {sorted(allowed)}")
# Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
# plus any user-specified skip_tasks
skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
if self.config.skip_tasks:
skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
if skip:
before = len(tasks)
tasks = [t for t in tasks if t["task_name"] not in skip]
skipped = before - len(tasks)
if skipped > 0:
print(f" Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}")
self.all_eval_items = tasks
self.iter = 0
# Build category index for per-category metrics
self.category_index: Dict[str, List[int]] = defaultdict(list)
for i, task in enumerate(self.all_eval_items):
self.category_index[task.get("category", "unknown")].append(i)
# Reward tracking for wandb logging
self.eval_metrics: List[Tuple[str, float]] = []
# Streaming JSONL writer -- saves each task's full conversation
# immediately on completion so data is preserved even on Ctrl+C.
# Timestamped filename so each run produces a unique file.
import datetime
log_dir = os.path.join(os.path.dirname(__file__), "logs")
os.makedirs(log_dir, exist_ok=True)
run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
self._streaming_file = open(self._streaming_path, "w")
self._streaming_lock = __import__("threading").Lock()
print(f" Streaming results to: {self._streaming_path}")
print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
for cat, indices in sorted(self.category_index.items()):
print(f" {cat}: {len(indices)} tasks")
def _save_result(self, result: Dict[str, Any]):
"""Write a single task result to the streaming JSONL file immediately."""
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
return
with self._streaming_lock:
self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
self._streaming_file.flush()
# =========================================================================
# Training pipeline stubs -- NOT used in eval-only mode
# =========================================================================
# These satisfy the abstract method requirements from HermesAgentBaseEnv.
# The evaluate subcommand calls setup() -> evaluate() directly, bypassing
# the training pipeline entirely.
async def get_next_item(self):
"""Return next item (stub -- not used in eval-only mode)."""
item = self.all_eval_items[self.iter % len(self.all_eval_items)]
self.iter += 1
return item
def format_prompt(self, item: Dict[str, Any]) -> str:
"""Return the task's instruction as the user prompt."""
return item["instruction"]
async def compute_reward(self, item, result, ctx) -> float:
"""Compute reward (stub -- actual verification is in rollout_and_score_eval)."""
return 0.0
async def collect_trajectories(self, item):
"""Collect trajectories (stub -- not used in eval-only mode)."""
return None, []
async def score(self, rollout_group_data):
"""Score rollouts (stub -- not used in eval-only mode)."""
return None
# =========================================================================
# Docker image resolution
# =========================================================================
def _resolve_task_image(
self, item: Dict[str, Any], task_name: str
) -> Tuple[str, Optional[Path]]:
"""
Resolve the Docker image for a task, with fallback to Dockerfile.
Strategy (mirrors Harbor's approach):
1. If force_build=True, always build from Dockerfile in environment_tar
2. If docker_image is available, use the pre-built Docker Hub image (fast)
3. Otherwise, extract Dockerfile from environment_tar and build (slow)
Returns:
(modal_image, temp_dir) -- modal_image is a Docker Hub name or a
Dockerfile path. temp_dir is set if we extracted files that need
cleanup later.
"""
docker_image = item.get("docker_image", "")
environment_tar = item.get("environment_tar", "")
# Fast path: use pre-built Docker Hub image
if docker_image and not self.config.force_build:
logger.info("Task %s: using pre-built image %s", task_name, docker_image)
return docker_image, None
# Slow path: extract Dockerfile from environment_tar and build
if environment_tar:
task_dir = Path(tempfile.mkdtemp(prefix=f"tb2-{task_name}-"))
_extract_base64_tar(environment_tar, task_dir)
dockerfile_path = task_dir / "Dockerfile"
if dockerfile_path.exists():
logger.info(
"Task %s: building from Dockerfile (force_build=%s, docker_image=%s)",
task_name, self.config.force_build, bool(docker_image),
)
return str(dockerfile_path), task_dir
# Neither available -- fall back to Hub image if force_build was True
if docker_image:
logger.warning(
"Task %s: force_build=True but no environment_tar, "
"falling back to docker_image %s", task_name, docker_image,
)
return docker_image, None
return "", None
# =========================================================================
# Per-task evaluation -- agent loop + test verification
# =========================================================================
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
"""
Evaluate a single TB2 task: run the agent loop, then verify with tests.
This is the core evaluation method. For each task it:
1. Resolves the Docker image and registers the Modal sandbox override
2. Runs HermesAgentLoop with terminal + file tools
3. Uploads the test suite into the sandbox
4. Executes test.sh and checks the result
5. Cleans up the sandbox and temp files
Args:
eval_item: A single TB2 task dict from the dataset
Returns:
Dict with 'passed' (bool), 'reward' (float), 'task_name' (str),
'category' (str), and optional debug info
"""
task_name = eval_item.get("task_name", "unknown")
category = eval_item.get("category", "unknown")
task_id = str(uuid.uuid4())
task_dir = None # Set if we extract a Dockerfile (needs cleanup)
from tqdm import tqdm
tqdm.write(f" [START] {task_name} (task_id={task_id[:8]})")
task_start = time.time()
try:
# --- 1. Resolve Docker image ---
modal_image, task_dir = self._resolve_task_image(eval_item, task_name)
if not modal_image:
logger.error("Task %s: no docker_image or environment_tar, skipping", task_name)
return {
"passed": False, "reward": 0.0,
"task_name": task_name, "category": category,
"error": "no_image",
}
# --- 2. Register per-task Modal image override ---
register_task_env_overrides(task_id, {"modal_image": modal_image})
logger.info(
"Task %s: registered image override for task_id %s",
task_name, task_id[:8],
)
# --- 3. Resolve tools and build messages ---
tools, valid_names = self._resolve_tools_for_group()
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(eval_item)})
# --- 4. Run agent loop ---
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# --- 5. Verify -- run test suite in the agent's sandbox ---
# Skip verification if the agent produced no meaningful output
only_system_and_user = all(
msg.get("role") in ("system", "user") for msg in result.messages
)
if result.turns_used == 0 or only_system_and_user:
logger.warning(
"Task %s: agent produced no output (turns=%d). Reward=0.",
task_name, result.turns_used,
)
reward = 0.0
else:
# Run tests in a thread so the blocking ctx.terminal() calls
# don't freeze the entire event loop (which would stall all
# other tasks, tqdm updates, and timeout timers).
ctx = ToolContext(task_id)
try:
loop = asyncio.get_event_loop()
reward = await loop.run_in_executor(
None, # default thread pool
self._run_tests, eval_item, ctx, task_name,
)
except Exception as e:
logger.error("Task %s: test verification failed: %s", task_name, e)
reward = 0.0
finally:
ctx.cleanup()
passed = reward == 1.0
status = "PASS" if passed else "FAIL"
elapsed = time.time() - task_start
tqdm.write(f" [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
logger.info(
"Task %s: reward=%.1f, turns=%d, finished=%s",
task_name, reward, result.turns_used, result.finished_naturally,
)
out = {
"passed": passed,
"reward": reward,
"task_name": task_name,
"category": category,
"turns_used": result.turns_used,
"finished_naturally": result.finished_naturally,
"messages": result.messages,
}
self._save_result(out)
return out
except Exception as e:
elapsed = time.time() - task_start
logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
tqdm.write(f" [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
out = {
"passed": False, "reward": 0.0,
"task_name": task_name, "category": category,
"error": str(e),
}
self._save_result(out)
return out
finally:
# --- Cleanup: clear overrides, sandbox, and temp files ---
clear_task_env_overrides(task_id)
try:
cleanup_vm(task_id)
except Exception as e:
logger.debug("VM cleanup for %s: %s", task_id[:8], e)
if task_dir and task_dir.exists():
shutil.rmtree(task_dir, ignore_errors=True)
def _run_tests(
self, item: Dict[str, Any], ctx: ToolContext, task_name: str
) -> float:
"""
Upload and execute the test suite in the agent's sandbox, then
download the verifier output locally to read the reward.
Follows Harbor's verification pattern:
1. Upload tests/ directory into the sandbox
2. Execute test.sh inside the sandbox
3. Download /logs/verifier/ directory to a local temp dir
4. Read reward.txt locally with native Python I/O
Downloading locally avoids issues with the file_read tool on
the Modal VM and matches how Harbor handles verification.
TB2 test scripts (test.sh) typically:
1. Install pytest via uv/pip
2. Run pytest against the test files in /tests/
3. Write results to /logs/verifier/reward.txt
Args:
item: The TB2 task dict (contains tests_tar, test_sh)
ctx: ToolContext scoped to this task's sandbox
task_name: For logging
Returns:
1.0 if tests pass, 0.0 otherwise
"""
tests_tar = item.get("tests_tar", "")
test_sh = item.get("test_sh", "")
if not test_sh:
logger.warning("Task %s: no test_sh content, reward=0", task_name)
return 0.0
# Create required directories in the sandbox
ctx.terminal("mkdir -p /tests /logs/verifier")
# Upload test files into the sandbox (binary-safe via base64)
if tests_tar:
tests_temp = Path(tempfile.mkdtemp(prefix=f"tb2-tests-{task_name}-"))
try:
_extract_base64_tar(tests_tar, tests_temp)
ctx.upload_dir(str(tests_temp), "/tests")
except Exception as e:
logger.warning("Task %s: failed to upload test files: %s", task_name, e)
finally:
shutil.rmtree(tests_temp, ignore_errors=True)
# Write the test runner script (test.sh)
ctx.write_file("/tests/test.sh", test_sh)
ctx.terminal("chmod +x /tests/test.sh")
# Execute the test suite
logger.info(
"Task %s: running test suite (timeout=%ds)",
task_name, self.config.test_timeout,
)
test_result = ctx.terminal(
"bash /tests/test.sh",
timeout=self.config.test_timeout,
)
exit_code = test_result.get("exit_code", -1)
output = test_result.get("output", "")
# Download the verifier output directory locally, then read reward.txt
# with native Python I/O. This avoids issues with file_read on the
# Modal VM and matches Harbor's verification pattern.
reward = 0.0
local_verifier_dir = Path(tempfile.mkdtemp(prefix=f"tb2-verifier-{task_name}-"))
try:
ctx.download_dir("/logs/verifier", str(local_verifier_dir))
reward_file = local_verifier_dir / "reward.txt"
if reward_file.exists() and reward_file.stat().st_size > 0:
content = reward_file.read_text().strip()
if content == "1":
reward = 1.0
elif content == "0":
reward = 0.0
else:
# Unexpected content -- try parsing as float
try:
reward = float(content)
except (ValueError, TypeError):
logger.warning(
"Task %s: reward.txt content unexpected (%r), "
"falling back to exit_code=%d",
task_name, content, exit_code,
)
reward = 1.0 if exit_code == 0 else 0.0
else:
# reward.txt not written -- fall back to exit code
logger.warning(
"Task %s: reward.txt not found after download, "
"falling back to exit_code=%d",
task_name, exit_code,
)
reward = 1.0 if exit_code == 0 else 0.0
except Exception as e:
logger.warning(
"Task %s: failed to download verifier dir: %s, "
"falling back to exit_code=%d",
task_name, e, exit_code,
)
reward = 1.0 if exit_code == 0 else 0.0
finally:
shutil.rmtree(local_verifier_dir, ignore_errors=True)
# Log test output for debugging failures
if reward == 0.0:
output_preview = output[-500:] if output else "(no output)"
logger.info(
"Task %s: FAIL (exit_code=%d)\n%s",
task_name, exit_code, output_preview,
)
return reward
# =========================================================================
# Evaluate -- main entry point for the eval subcommand
# =========================================================================
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
"""
Wrap rollout_and_score_eval with a per-task wall-clock timeout.
If the task exceeds task_timeout seconds, it's automatically scored
as FAIL. This prevents any single task from hanging indefinitely.
"""
task_name = item.get("task_name", "unknown")
category = item.get("category", "unknown")
try:
return await asyncio.wait_for(
self.rollout_and_score_eval(item),
timeout=self.config.task_timeout,
)
except asyncio.TimeoutError:
from tqdm import tqdm
elapsed = self.config.task_timeout
tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
out = {
"passed": False, "reward": 0.0,
"task_name": task_name, "category": category,
"error": f"timeout ({elapsed}s)",
}
self._save_result(out)
return out
async def evaluate(self, *args, **kwargs) -> None:
"""
Run Terminal-Bench 2.0 evaluation over all tasks.
This is the main entry point when invoked via:
python environments/terminalbench2_env.py evaluate
Runs all tasks through rollout_and_score_eval() via asyncio.gather()
(same pattern as GPQA and other Atropos eval envs). Each task is
wrapped with a wall-clock timeout so hung tasks auto-fail.
Suppresses noisy Modal/terminal output (HERMES_QUIET) so the tqdm
bar stays visible.
"""
start_time = time.time()
# Route all logging through tqdm.write() so the progress bar stays
# pinned at the bottom while log lines scroll above it.
from tqdm import tqdm
class _TqdmHandler(logging.Handler):
def emit(self, record):
try:
tqdm.write(self.format(record))
except Exception:
self.handleError(record)
handler = _TqdmHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(name)s] %(levelname)s: %(message)s",
datefmt="%H:%M:%S",
))
root = logging.getLogger()
root.handlers = [handler] # Replace any existing handlers
root.setLevel(logging.INFO)
# Silence noisy third-party loggers that flood the output
logging.getLogger("httpx").setLevel(logging.WARNING) # Every HTTP request
logging.getLogger("openai").setLevel(logging.WARNING) # OpenAI client retries
logging.getLogger("rex-deploy").setLevel(logging.WARNING) # Swerex deployment
logging.getLogger("rex_image_builder").setLevel(logging.WARNING) # Image builds
print(f"\n{'='*60}")
print("Starting Terminal-Bench 2.0 Evaluation")
print(f"{'='*60}")
print(f" Dataset: {self.config.dataset_name}")
print(f" Total tasks: {len(self.all_eval_items)}")
print(f" Max agent turns: {self.config.max_agent_turns}")
print(f" Task timeout: {self.config.task_timeout}s")
print(f" Terminal backend: {self.config.terminal_backend}")
print(f" Tool thread pool: {self.config.tool_pool_size}")
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
print(f"{'='*60}\n")
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
total_tasks = len(self.all_eval_items)
eval_tasks = [
asyncio.ensure_future(self._eval_with_timeout(item))
for item in self.all_eval_items
]
results = []
passed_count = 0
pbar = tqdm(total=total_tasks, desc="Evaluating TB2", dynamic_ncols=True)
try:
for coro in asyncio.as_completed(eval_tasks):
result = await coro
results.append(result)
if result and result.get("passed"):
passed_count += 1
done = len(results)
pct = (passed_count / done * 100) if done else 0
pbar.set_postfix_str(f"pass={passed_count}/{done} ({pct:.1f}%)")
pbar.update(1)
except (KeyboardInterrupt, asyncio.CancelledError):
pbar.close()
print(f"\n\nInterrupted! Cleaning up {len(eval_tasks)} tasks...")
# Cancel all pending tasks
for task in eval_tasks:
task.cancel()
# Let cancellations propagate (finally blocks run cleanup_vm)
await asyncio.gather(*eval_tasks, return_exceptions=True)
# Belt-and-suspenders: clean up any remaining sandboxes
from tools.terminal_tool import cleanup_all_environments
cleanup_all_environments()
print("All sandboxes cleaned up.")
return
finally:
pbar.close()
end_time = time.time()
# Filter out None results (shouldn't happen, but be safe)
valid_results = [r for r in results if r is not None]
if not valid_results:
print("Warning: No valid evaluation results obtained")
return
# ---- Compute metrics ----
total = len(valid_results)
passed = sum(1 for r in valid_results if r.get("passed"))
overall_pass_rate = passed / total if total > 0 else 0.0
# Per-category breakdown
cat_results: Dict[str, List[Dict]] = defaultdict(list)
for r in valid_results:
cat_results[r.get("category", "unknown")].append(r)
# Build metrics dict
eval_metrics = {
"eval/pass_rate": overall_pass_rate,
"eval/total_tasks": total,
"eval/passed_tasks": passed,
"eval/evaluation_time_seconds": end_time - start_time,
}
# Per-category metrics
for category, cat_items in sorted(cat_results.items()):
cat_passed = sum(1 for r in cat_items if r.get("passed"))
cat_total = len(cat_items)
cat_pass_rate = cat_passed / cat_total if cat_total > 0 else 0.0
cat_key = category.replace(" ", "_").replace("-", "_").lower()
eval_metrics[f"eval/pass_rate_{cat_key}"] = cat_pass_rate
# Store metrics for wandb_log
self.eval_metrics = [(k, v) for k, v in eval_metrics.items()]
# ---- Print summary ----
print(f"\n{'='*60}")
print("Terminal-Bench 2.0 Evaluation Results")
print(f"{'='*60}")
print(f"Overall Pass Rate: {overall_pass_rate:.4f} ({passed}/{total})")
print(f"Evaluation Time: {end_time - start_time:.1f} seconds")
print("\nCategory Breakdown:")
for category, cat_items in sorted(cat_results.items()):
cat_passed = sum(1 for r in cat_items if r.get("passed"))
cat_total = len(cat_items)
cat_rate = cat_passed / cat_total if cat_total > 0 else 0.0
print(f" {category}: {cat_rate:.1%} ({cat_passed}/{cat_total})")
# Print individual task results
print("\nTask Results:")
for r in sorted(valid_results, key=lambda x: x.get("task_name", "")):
status = "PASS" if r.get("passed") else "FAIL"
turns = r.get("turns_used", "?")
error = r.get("error", "")
extra = f" (error: {error})" if error else ""
print(f" [{status}] {r['task_name']} (turns={turns}){extra}")
print(f"{'='*60}\n")
# Build sample records for evaluate_log (includes full conversations)
samples = [
{
"task_name": r.get("task_name"),
"category": r.get("category"),
"passed": r.get("passed"),
"reward": r.get("reward"),
"turns_used": r.get("turns_used"),
"error": r.get("error"),
"messages": r.get("messages"),
}
for r in valid_results
]
# Log evaluation results
try:
await self.evaluate_log(
metrics=eval_metrics,
samples=samples,
start_time=start_time,
end_time=end_time,
generation_parameters={
"temperature": self.config.agent_temperature,
"max_tokens": self.config.max_token_length,
"max_agent_turns": self.config.max_agent_turns,
"terminal_backend": self.config.terminal_backend,
},
)
except Exception as e:
print(f"Error logging evaluation results: {e}")
# Close streaming file
if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
self._streaming_file.close()
print(f" Live results saved to: {self._streaming_path}")
# Kill all remaining sandboxes. Timed-out tasks leave orphaned thread
# pool workers still executing commands -- cleanup_all stops them.
from tools.terminal_tool import cleanup_all_environments
print("\nCleaning up all sandboxes...")
cleanup_all_environments()
# Shut down the tool thread pool so orphaned workers from timed-out
# tasks are killed immediately instead of retrying against dead
# sandboxes and spamming the console with TimeoutError warnings.
from environments.agent_loop import _tool_executor
_tool_executor.shutdown(wait=False, cancel_futures=True)
print("Done.")
# =========================================================================
# Wandb logging
# =========================================================================
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log TB2-specific metrics to wandb."""
if wandb_metrics is None:
wandb_metrics = {}
# Add stored eval metrics
for metric_name, metric_value in self.eval_metrics:
wandb_metrics[metric_name] = metric_value
self.eval_metrics = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
TerminalBench2EvalEnv.cli()

View File

@@ -1,5 +0,0 @@
"""Endless Terminals Environment - Terminal task training from HuggingFace dataset."""
from .endless_terminals_env import EndlessTerminalsEnv, EndlessTerminalsEnvConfig
__all__ = ["EndlessTerminalsEnv", "EndlessTerminalsEnvConfig"]

View File

@@ -1,69 +0,0 @@
# Endless Terminals Environment -- Default Configuration
#
# Trains agents on terminal tasks from the Endless Terminals HuggingFace dataset.
# Uses hermes-agent backends (modal/docker/local) with per-task Docker images.
# Tests run in the same sandbox the agent used (no separate containers needed).
#
# Dataset: https://huggingface.co/datasets/obiwan96/endless-terminals-train
#
# Prerequisites:
# 1. Download dataset: huggingface-cli download obiwan96/endless-terminals-train \
# --repo-type dataset --local-dir ~/endless-terminals-data \
# --local-dir-use-symlinks False
# 2. Set TASKS_BASE_DIR environment variable or configure tasks_base_dir below
# 3. For modal backend: Configure Modal CLI (modal token set)
# 4. For docker backend: Install Docker
#
# Usage:
# python environments/endless_terminals/endless_terminals_env.py process \
# --config environments/endless_terminals/default.yaml
env:
# Toolsets
enabled_toolsets: ["terminal", "file"]
# Agent configuration
max_agent_turns: 32
max_token_length: 4096
agent_temperature: 1.0
# Terminal backend
terminal_backend: "local" # Change to "modal" or "docker" for cloud isolation
# Dataset settings
use_dataset: true
dataset_name: "obiwan96/endless-terminals"
dataset_split: "train"
dataset_cache_dir: "~/.cache/huggingface/datasets"
tasks_base_dir: "" # Set to directory containing task_* folders (e.g., ~/endless-terminals-data)
# Test execution
test_timeout_s: 60
# Training configuration
group_size: 8
total_steps: 10000
steps_per_eval: 500
num_eval_tasks: 10
eval_split_ratio: 0.1
# Logging
use_wandb: true
wandb_name: "endless-terminals"
# System prompt
system_prompt: >
You are a skilled Linux system administrator and programmer.
You have access to a terminal and file tools to complete system administration
and programming tasks. Use the tools effectively to solve the given task,
and verify your solution works correctly before finishing.
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-sonnet-4.5"
server_type: "openai"
api_key: "" # Loaded from OPENROUTER_API_KEY env var
health_check: false
timeout: 30 # 30 second timeout per request
max_retries: 2 # Only retry twice

View File

@@ -1,921 +0,0 @@
"""
Endless Terminals Environment for Hermes-Agent + Atropos RL.
Loads pre-generated terminal tasks from HuggingFace dataset and scores
agent performance using test execution in the agent's sandbox.
Uses hermes-agent backends (modal, docker, local) with per-task Docker images
extracted from container.def files. Tests run in the same sandbox the agent
used, following the Terminal Bench 2 pattern.
Dataset: https://huggingface.co/datasets/obiwan96/endless-terminals-train
Run:
python environments/endless_terminals/endless_terminals_env.py process \
--config environments/endless_terminals/default.yaml
"""
import asyncio
import logging
import os
import random
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from pydantic import Field
# Ensure hermes-agent root is on path
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from atroposlib.envs.base import ScoredDataGroup, ScoredDataItem
from atroposlib.type_definitions import Item
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.agent_loop import AgentResult
from environments.tool_context import ToolContext
from tools.terminal_tool import (
register_task_env_overrides,
clear_task_env_overrides,
cleanup_vm,
)
logger = logging.getLogger(__name__)
# Add endless-terminals to path for imports
ENDLESS_TERMINALS_PATH = os.getenv(
"ENDLESS_TERMINALS_PATH",
str(Path.home() / "Desktop" / "Projects" / "endless-terminals")
)
sys.path.insert(0, ENDLESS_TERMINALS_PATH)
class EndlessTerminalsEnvConfig(HermesAgentEnvConfig):
"""Configuration for Endless Terminals environment."""
# Dataset settings
use_dataset: bool = Field(
default=True,
description="Load tasks from HuggingFace dataset (recommended). If False, generate procedurally."
)
dataset_name: str = Field(
default="obiwan96/endless-terminals-train",
description="HuggingFace dataset name"
)
dataset_split: str = Field(
default="train",
description="Dataset split to use"
)
dataset_cache_dir: str = Field(
default="~/.cache/huggingface/datasets",
description="HuggingFace datasets cache directory"
)
tasks_base_dir: str = Field(
default="",
description="Base directory containing task_* folders. If empty, uses paths from dataset."
)
# Test execution
test_timeout_s: int = Field(default=60, description="Test execution timeout (seconds)")
# Docker image fallback
default_docker_image: str = Field(
default="ubuntu:22.04",
description="Default Docker image if container.def parsing fails"
)
# Agent defaults
max_agent_turns: int = Field(default=32, description="Max turns for agent (increased for long traces)")
# Evaluation settings
num_eval_tasks: int = Field(
default=10,
description="Number of tasks to run during periodic evaluation"
)
eval_split_ratio: float = Field(
default=0.1,
description="Fraction of dataset to hold out for evaluation (0.0-1.0)"
)
class EndlessTerminalsEnv(HermesAgentBaseEnv):
"""
Endless Terminals environment using pre-generated HuggingFace dataset.
Loads terminal tasks from dataset, runs agent with terminal tools,
and scores by executing tests in the agent's sandbox using ToolContext.
"""
name = "endless_terminals_env"
env_config_cls = EndlessTerminalsEnvConfig
@classmethod
def config_init(cls) -> Tuple[EndlessTerminalsEnvConfig, List["APIServerConfig"]]:
"""
Default configuration for Endless Terminals environment.
This is used when no config file is provided, but note that when using
--config, the YAML is loaded differently and this may not be called.
"""
from atroposlib.envs.server_handling.server_manager import APIServerConfig
env_config = EndlessTerminalsEnvConfig(
enabled_toolsets=["terminal", "file"],
max_agent_turns=32,
terminal_backend="local",
use_dataset=True,
tasks_base_dir="",
group_size=1,
total_steps=1,
use_wandb=False,
)
server_configs = [
APIServerConfig(
base_url="https://openrouter.ai/api/v1",
model_name="anthropic/claude-sonnet-4.5",
server_type="openai",
api_key=os.getenv("OPENROUTER_API_KEY", ""),
health_check=False,
)
]
return env_config, server_configs
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._dataset = None
self._train_dataset = None
self._eval_dataset = None
self._dataset_indices = []
self._current_index = 0
# Metrics tracking for wandb - single buffer with dicts
self._metrics_buffer = []
# Debug: check server config
if hasattr(self, 'server') and hasattr(self.server, 'servers'):
for i, srv in enumerate(self.server.servers):
logger.debug(f"Server {i}: model_name={getattr(srv.config, 'model_name', 'NONE')}")
async def setup(self):
"""Load dataset from HuggingFace or local directory."""
if not self.config.use_dataset:
logger.info("Using procedural task generation (not implemented yet)")
return
# If tasks_base_dir is set, load from local directory instead of HuggingFace
if self.config.tasks_base_dir:
tasks_base = Path(os.path.expanduser(self.config.tasks_base_dir))
# Resolve to absolute path if relative
if not tasks_base.is_absolute():
tasks_base = Path.cwd() / tasks_base
tasks_base = tasks_base.resolve()
if not tasks_base.exists():
raise RuntimeError(f"tasks_base_dir not found: {tasks_base}")
logger.info(f"Loading tasks from local directory: {tasks_base}")
# Find all task_* directories
task_dirs = sorted(tasks_base.glob("task_*"))
logger.info(f"Found {len(task_dirs)} task directories")
if not task_dirs:
# Debug: show what's actually in the directory
all_items = list(tasks_base.iterdir())
logger.warning(f"Directory contains {len(all_items)} items:")
for item in all_items[:10]:
logger.warning(f" - {item.name} ({'dir' if item.is_dir() else 'file'})")
raise RuntimeError(f"No task_* directories found in {tasks_base}")
# Create fake dataset items (just the directory paths)
self._dataset = [
{
"description": f"Task from {task_dir.name}",
"extra_info": {"task_dir": str(task_dir)},
}
for task_dir in task_dirs
]
logger.info(f"Loaded {len(self._dataset)} tasks from local directory")
self._split_dataset()
return
# Otherwise, load from HuggingFace
logger.info(f"Loading dataset from HuggingFace: {self.config.dataset_name}")
try:
from datasets import load_dataset
self._dataset = await asyncio.get_event_loop().run_in_executor(
None,
lambda: load_dataset(
self.config.dataset_name,
split=self.config.dataset_split,
cache_dir=os.path.expanduser(self.config.dataset_cache_dir)
)
)
logger.info(f"Loaded {len(self._dataset)} tasks from HuggingFace")
self._split_dataset()
except Exception as e:
logger.error(f"ERROR loading dataset: {e}")
raise
def _split_dataset(self):
"""Split dataset into train and eval sets based on eval_split_ratio."""
if self._dataset is None or len(self._dataset) == 0:
raise RuntimeError("Cannot split empty dataset")
total_size = len(self._dataset)
eval_size = int(total_size * self.config.eval_split_ratio)
train_size = total_size - eval_size
all_indices = list(range(total_size))
random.shuffle(all_indices)
train_indices = all_indices[:train_size]
eval_indices = all_indices[train_size:]
if isinstance(self._dataset, list):
self._train_dataset = [self._dataset[i] for i in train_indices]
self._eval_dataset = [self._dataset[i] for i in eval_indices]
else:
self._train_dataset = self._dataset.select(train_indices)
self._eval_dataset = self._dataset.select(eval_indices)
self._dataset_indices = list(range(len(self._train_dataset)))
random.shuffle(self._dataset_indices)
self._current_index = 0
logger.info(
f"Split dataset: {len(self._train_dataset)} train, "
f"{len(self._eval_dataset)} eval "
f"(ratio={self.config.eval_split_ratio:.1%})"
)
async def get_next_item(self) -> Item:
"""Sample next task from training dataset."""
if self._train_dataset is None:
raise RuntimeError("Dataset not loaded. Call setup() first.")
# Get next task (with wraparound)
idx = self._dataset_indices[self._current_index]
task = self._train_dataset[idx]
# Advance to next task
self._current_index += 1
if self._current_index >= len(self._dataset_indices):
# Reshuffle for next epoch
random.shuffle(self._dataset_indices)
self._current_index = 0
logger.info("Reshuffled dataset (completed one epoch)")
# Extract task directory path
task_dir = task.get("extra_info", {}).get("task_dir")
if not task_dir:
task_dir = task.get("reward_spec", {}).get("ground_truth")
# Resolve task directory path
if task_dir:
task_dir_path = Path(task_dir)
# If tasks_base_dir is configured and path doesn't exist, reconstruct it
if self.config.tasks_base_dir and not task_dir_path.exists():
original_path = Path(task_dir)
task_name = original_path.name
task_dir_path = Path(os.path.expanduser(self.config.tasks_base_dir)) / task_name
else:
logger.error("No task directory path found in dataset item")
return await self.get_next_item()
# Verify directory exists
if not task_dir_path.exists():
logger.warning(f"Task dir not found: {task_dir_path}")
logger.warning("Hint: Set tasks_base_dir to directory containing task_* folders")
return await self.get_next_item() # Try next task
# Look for test file in tests/ subdirectory first, then at root
final_test = task_dir_path / "tests" / "test_final_state.py"
if not final_test.exists():
final_test = task_dir_path / "test_final_state.py"
# Verify test file exists
if not final_test.exists():
logger.warning(f"Missing test file in {task_dir_path} (checked tests/ and root)")
return await self.get_next_item()
# Parse container.def to extract Docker image
# Check environment/ subdirectory first, then root
container_def = task_dir_path / "environment" / "container.def"
if not container_def.exists():
container_def = task_dir_path / "container.def"
docker_image = self._parse_docker_image_from_def(container_def)
# Try to load description from instruction.md or task.json
description = task.get("description", "")
# First try instruction.md
instruction_md = task_dir_path / "instruction.md"
if not description and instruction_md.exists():
try:
description = instruction_md.read_text().strip()
except Exception as e:
logger.warning(f"Failed to load instruction.md for {task_dir_path.name}: {e}")
# Fallback to task.json in environment/
if not description:
task_json = task_dir_path / "environment" / "task.json"
if task_json.exists():
try:
import json
task_data = json.loads(task_json.read_text())
description = task_data.get("description", "") or task_data.get("instruction", "")
except Exception as e:
logger.warning(f"Failed to load task.json for {task_dir_path.name}: {e}")
if not description:
description = f"Complete the task in {task_dir_path.name}"
return {
"task_id": f"{task_dir_path.name}",
"task_name": task_dir_path.name,
"description": description,
"task_dir": str(task_dir_path),
"final_test": str(final_test),
"docker_image": docker_image,
"dataset_index": idx,
}
def format_prompt(self, item: Item) -> str:
"""Return the task description for the agent."""
return str(item.get("description", ""))
def _parse_docker_image_from_def(self, container_def_path: Path) -> str:
"""
Parse container.def file to extract the Docker base image.
Apptainer definition files typically look like:
Bootstrap: docker
From: ubuntu:22.04
Returns the image from the "From:" line, or falls back to default.
"""
if not container_def_path.exists():
logger.warning(f"container.def not found at {container_def_path}, using default image")
return self.config.default_docker_image
try:
content = container_def_path.read_text()
# Look for "From: <image>" line (case-insensitive)
match = re.search(r'^From:\s*(.+)$', content, re.MULTILINE | re.IGNORECASE)
if match:
image = match.group(1).strip()
logger.info(f"Extracted Docker image from container.def: {image}")
return image
except Exception as e:
logger.warning(f"Failed to parse {container_def_path}: {e}")
logger.warning(f"Could not extract image from {container_def_path}, using default")
return self.config.default_docker_image
async def collect_trajectory(
self, item: Item
) -> Tuple[Optional[ScoredDataItem], List[Item]]:
"""
Override to register per-task Docker image before running the agent.
Follows Terminal Bench 2 pattern: register_task_env_overrides() tells
the hermes-agent terminal backend to use a specific Docker image for
this task_id.
This is a copy of HermesAgentBaseEnv.collect_trajectory with Docker
image registration added after task_id generation.
"""
import uuid
from environments.agent_loop import HermesAgentLoop
task_id = str(uuid.uuid4())
task_name = item.get("task_name", "unknown")
docker_image = item.get("docker_image", self.config.default_docker_image)
logger.debug(f"collect_trajectory START for {task_name}")
# Register Docker image override for this task_id
logger.debug(f"Registering Docker image: {docker_image}")
register_task_env_overrides(task_id, {"modal_image": docker_image})
logger.info(
f"Task {task_name}: registered Docker image {docker_image} for task_id {task_id[:8]}"
)
logger.debug("Docker image registered")
try:
# Get group-level tools (resolved once in collect_trajectories)
logger.debug("Resolving tools...")
if self._current_group_tools is None:
tools, valid_names = self._resolve_tools_for_group()
else:
tools, valid_names = self._current_group_tools
logger.debug(f"Tools resolved: {len(tools)} tools")
# Build initial messages
logger.debug("Building initial messages...")
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(item)})
logger.debug("Messages built, starting agent loop...")
# Run the agent loop
result: AgentResult
managed_state: Optional[Dict[str, Any]] = None
if self._use_managed_server():
# Phase 2: ManagedServer with parser
from environments.tool_call_parsers import get_parser
try:
tc_parser = get_parser(self.config.tool_call_parser)
except KeyError:
logger.warning(
"Tool call parser '%s' not found, falling back to 'hermes'",
self.config.tool_call_parser,
)
tc_parser = get_parser("hermes")
try:
async with self.server.managed_server(
tokenizer=self.tokenizer,
tool_call_parser=tc_parser,
) as managed:
agent = HermesAgentLoop(
server=managed,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# Get state directly from managed server while still in context
managed_state = managed.get_state()
except NotImplementedError:
# DummyManagedServer not allowed
logger.warning("ManagedServer not available. Falling back to direct server mode.")
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
else:
# Phase 1: OpenAI server
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# Skip reward computation if agent produced no output
only_system_and_user = all(
msg.get("role") in ("system", "user") for msg in result.messages
)
if result.turns_used == 0 or only_system_and_user:
logger.warning(
"Agent loop produced no output (turns=%d). Skipping trajectory.",
result.turns_used,
)
# Return None to skip this trajectory (likely an API failure)
return None, []
else:
# Compute reward using ToolContext
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
except Exception as e:
logger.error("compute_reward failed: %s", e)
reward = 0.0
finally:
ctx.cleanup()
# Track metrics for wandb logging
task_metrics = {
"test_passed": 1.0 if reward > 0.5 else 0.0,
"reward": reward,
"turns_used": result.turns_used,
"finished_naturally": result.finished_naturally,
"docker_image": docker_image,
"num_tool_errors": len(result.tool_errors),
}
# Include detailed tool errors if any occurred
if result.tool_errors:
task_metrics["tool_errors"] = [
{
"turn": err.turn,
"tool": err.tool_name,
"error": err.error[:200],
}
for err in result.tool_errors
]
self._metrics_buffer.append(task_metrics)
# ============================================================================
# Build ScoredDataGroup from ManagedServer state
# ============================================================================
# Phase 2: Extract pre-computed data from SequenceNodes
# We may have multiple trajectories in the nodes due to how interesting
# agents can be, so iterate through all nodes and return multiple sequences.
#
# Each SequenceNode contains:
# - tokens: Full unmasked token sequence [1, 2, 3, ..., N]
# - masked_tokens: Training format [-100, -100, ..., -100, actual, actual, ...]
# - logprobs: Training format [1.0, 1.0, ..., 1.0, -0.5, -0.3, ...]
# - full_text: Complete text (prompt + all completions)
#
# Phase 1: Create placeholder tokens for OpenAI-style servers
# ============================================================================
nodes = (managed_state or {}).get("nodes", []) if managed_state else []
# Create ScoredDataGroup with lists for multiple trajectories
scored_group = ScoredDataGroup()
scored_group["tokens"] = []
scored_group["masks"] = []
scored_group["scores"] = []
scored_group["messages"] = []
scored_group["inference_logprobs"] = []
if nodes:
# Phase 2: iterate through all nodes (may have multiple trajectories)
for i, node in enumerate(nodes):
scored_group["tokens"].append(node.tokens)
scored_group["masks"].append(node.masked_tokens)
scored_group["scores"].append(reward)
scored_group["messages"].append(result.messages)
if hasattr(node, "logprobs") and node.logprobs:
scored_group["inference_logprobs"].append(node.logprobs)
else:
# Placeholder logprobs if not available
scored_group["inference_logprobs"].append([1.0] * len(node.tokens))
logger.debug(f"Added trajectory {i+1}/{len(nodes)} with {len(node.tokens)} tokens")
else:
# Phase 1: create placeholder tokens for OpenAI-style servers
full_text = "\n".join(
msg.get("content", "") for msg in result.messages if msg.get("content")
)
if self.tokenizer:
tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
else:
tokens = list(range(min(len(full_text) // 4, 128)))
scored_group["tokens"].append(tokens)
scored_group["masks"].append([-100] + tokens[1:])
scored_group["scores"].append(reward)
scored_group["messages"].append(result.messages)
scored_group["inference_logprobs"].append([1.0] * len(tokens))
# Return None if no trajectories collected
if len(scored_group["tokens"]) == 0:
return None, []
logger.debug(f"Returning ScoredDataGroup with {len(scored_group['tokens'])} trajectories")
return scored_group, []
finally:
# Clean up task overrides and sandbox
clear_task_env_overrides(task_id)
try:
cleanup_vm(task_id)
except Exception as e:
logger.debug(f"VM cleanup for {task_id[:8]}: {e}")
async def compute_reward(
self,
item: Item,
result: AgentResult,
ctx: ToolContext
) -> float:
"""
Run final tests in the agent's sandbox and return binary reward.
Uses ToolContext to execute pytest in the SAME sandbox the agent used,
following the Terminal Bench 2 verification pattern. No separate
Apptainer execution needed.
Returns 1.0 if tests pass, 0.0 otherwise.
"""
task_name = item.get("task_name", "unknown")
final_test_path = Path(item.get("final_test", ""))
if not final_test_path.exists():
logger.error(f"Task {task_name}: test file not found at {final_test_path}")
return 0.0
logger.info(f"Task {task_name}: running tests in sandbox...")
try:
# Run tests in a thread to avoid blocking the event loop
loop = asyncio.get_event_loop()
reward = await loop.run_in_executor(
None,
self._run_tests_in_sandbox,
final_test_path,
ctx,
task_name,
)
status = "PASS" if reward == 1.0 else "FAIL"
logger.info(f"Task {task_name}: {status} (reward={reward})")
return reward
except Exception as e:
logger.error(f"Task {task_name}: test execution failed: {e}", exc_info=True)
return 0.0
def _run_tests_in_sandbox(
self,
test_file_path: Path,
ctx: ToolContext,
task_name: str,
) -> float:
"""
Upload test file to sandbox and execute pytest.
Runs in thread pool (via run_in_executor) to avoid blocking the event loop
with synchronous ToolContext calls.
Args:
test_file_path: Local path to test_final_state.py
ctx: ToolContext scoped to the agent's sandbox
task_name: For logging
Returns:
1.0 if tests pass, 0.0 otherwise
"""
try:
# Upload test file to sandbox
test_content = test_file_path.read_text()
ctx.write_file("/workspace/test_final_state.py", test_content)
logger.debug(f"Task {task_name}: uploaded test file to /workspace/test_final_state.py")
# Run pytest in the sandbox
result = ctx.terminal(
"cd /workspace && python -m pytest -q test_final_state.py",
timeout=self.config.test_timeout_s,
)
exit_code = result.get("exit_code", -1)
output = result.get("output", "")
if exit_code == 0:
logger.debug(f"Task {task_name}: tests passed")
return 1.0
else:
# Log failure output (last 500 chars for debugging)
output_preview = output[-500:] if output else "(no output)"
logger.info(
f"Task {task_name}: tests failed (exit_code={exit_code})\n{output_preview}"
)
return 0.0
except Exception as e:
logger.error(f"Task {task_name}: error running tests: {e}")
return 0.0
async def evaluate(self):
"""
Periodic evaluation on holdout eval set.
Runs the agent on num_eval_tasks from the held-out eval set
(never seen during training). Returns metrics for wandb logging.
"""
if self._eval_dataset is None:
logger.warning("Cannot evaluate: eval dataset not loaded")
return {}
if len(self._eval_dataset) == 0:
logger.warning("Eval dataset is empty")
return {}
# Use min of num_eval_tasks and actual eval set size
num_tasks = min(self.config.num_eval_tasks, len(self._eval_dataset))
logger.info(f"Starting evaluation on {num_tasks} held-out tasks...")
eval_metrics = {
"rewards": [],
"passes": [],
"turns": [],
"natural_finishes": [],
}
# Sample from eval set (holdout)
import random
eval_indices = random.sample(range(len(self._eval_dataset)), num_tasks)
for idx in eval_indices:
task = self._eval_dataset[idx]
# Build item using same logic as get_next_item
task_dir = task.get("extra_info", {}).get("task_dir")
if not task_dir:
task_dir = task.get("reward_spec", {}).get("ground_truth")
if not task_dir:
continue
task_dir_path = Path(task_dir)
if self.config.tasks_base_dir and not task_dir_path.exists():
original_path = Path(task_dir)
task_name = original_path.name
task_dir_path = Path(os.path.expanduser(self.config.tasks_base_dir)) / task_name
if not task_dir_path.exists():
continue
# Find test file
final_test = task_dir_path / "tests" / "test_final_state.py"
if not final_test.exists():
final_test = task_dir_path / "test_final_state.py"
if not final_test.exists():
continue
# Parse Docker image
container_def = task_dir_path / "environment" / "container.def"
if not container_def.exists():
container_def = task_dir_path / "container.def"
docker_image = self._parse_docker_image_from_def(container_def)
# Load description
description = task.get("description", "")
instruction_md = task_dir_path / "instruction.md"
if not description and instruction_md.exists():
try:
description = instruction_md.read_text().strip()
except Exception:
pass
item = {
"description": description,
"final_test": str(final_test),
"docker_image": docker_image,
}
# Run agent on this task
try:
import uuid
task_id = str(uuid.uuid4())
# Register task environment
from model_tools import register_task_env_overrides
register_task_env_overrides(task_id, {"modal_image": docker_image})
# Build messages
messages = [
{"role": "system", "content": self.config.system_prompt},
{"role": "user", "content": description or "Complete the task."},
]
# Get tools
from model_tools import get_tool_definitions
tools = get_tool_definitions(self.config.enabled_toolsets)
valid_names = {t["function"]["name"] for t in tools}
# Run agent
from environments.agent_loop import HermesAgentLoop
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# Compute reward
from environments.tool_context import ToolContext
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
except Exception as e:
logger.warning(f"Eval reward computation failed: {e}")
reward = 0.0
finally:
ctx.cleanup()
# Track metrics
eval_metrics["rewards"].append(reward)
eval_metrics["passes"].append(1.0 if reward > 0.5 else 0.0)
eval_metrics["turns"].append(result.turns_used)
eval_metrics["natural_finishes"].append(1.0 if result.finished_naturally else 0.0)
except Exception as e:
logger.error(f"Eval task failed: {e}")
continue
finally:
# Cleanup
from model_tools import clear_task_env_overrides, cleanup_vm
clear_task_env_overrides(task_id)
cleanup_vm(task_id)
# Aggregate metrics
if not eval_metrics["rewards"]:
logger.warning("No eval tasks completed successfully")
return {}
aggregated = {
"eval/pass_rate": sum(eval_metrics["passes"]) / len(eval_metrics["passes"]),
"eval/avg_reward": sum(eval_metrics["rewards"]) / len(eval_metrics["rewards"]),
"eval/avg_turns": sum(eval_metrics["turns"]) / len(eval_metrics["turns"]),
"eval/natural_finish_rate": sum(eval_metrics["natural_finishes"]) / len(eval_metrics["natural_finishes"]),
"eval/num_tasks": len(eval_metrics["rewards"]),
}
logger.info(f"Evaluation complete: pass_rate={aggregated['eval/pass_rate']:.2%}, avg_turns={aggregated['eval/avg_turns']:.1f}")
return aggregated
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log Endless Terminals specific metrics to wandb."""
if wandb_metrics is None:
wandb_metrics = {}
# Aggregate metrics from buffer
if self._metrics_buffer:
# Test pass rate
test_passes = [m["test_passed"] for m in self._metrics_buffer]
wandb_metrics["endless_terminals/test_pass_rate"] = sum(test_passes) / len(test_passes)
wandb_metrics["endless_terminals/num_tests_passed"] = sum(test_passes)
wandb_metrics["endless_terminals/num_tests_total"] = len(test_passes)
# Turns used statistics
turns = [m["turns_used"] for m in self._metrics_buffer]
wandb_metrics["endless_terminals/avg_turns_used"] = sum(turns) / len(turns)
wandb_metrics["endless_terminals/max_turns_used"] = max(turns)
wandb_metrics["endless_terminals/min_turns_used"] = min(turns)
# Natural finish rate (did model stop on its own vs hitting max turns)
natural_finishes = [1.0 if m["finished_naturally"] else 0.0 for m in self._metrics_buffer]
wandb_metrics["endless_terminals/natural_finish_rate"] = sum(natural_finishes) / len(natural_finishes)
# Tool error statistics
total_tool_errors = sum(m["num_tool_errors"] for m in self._metrics_buffer)
wandb_metrics["endless_terminals/total_tool_errors"] = total_tool_errors
wandb_metrics["endless_terminals/avg_tool_errors_per_task"] = total_tool_errors / len(self._metrics_buffer)
# Docker image distribution (count unique images used)
docker_images = [m["docker_image"] for m in self._metrics_buffer]
unique_images = set(docker_images)
wandb_metrics["endless_terminals/num_unique_docker_images"] = len(unique_images)
# Log most common errors if any
all_errors = []
for m in self._metrics_buffer:
if "tool_errors" in m:
all_errors.extend(m["tool_errors"])
if all_errors:
# Count error types
error_tools = {}
for err in all_errors:
tool = err["tool"]
error_tools[tool] = error_tools.get(tool, 0) + 1
# Log top 3 error-prone tools
for i, (tool, count) in enumerate(sorted(error_tools.items(), key=lambda x: x[1], reverse=True)[:3]):
wandb_metrics[f"endless_terminals/errors_by_tool/{tool}"] = count
# Clear buffer after logging
self._metrics_buffer = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
EndlessTerminalsEnv.cli()

View File

@@ -1,672 +0,0 @@
"""
HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
Provides the Atropos integration plumbing that all hermes-agent environments share:
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
- Per-group toolset/distribution resolution
- Agent loop orchestration via HermesAgentLoop
- ToolContext creation for reward functions
- ScoredDataGroup construction from ManagedServer state
Subclasses only need to implement:
setup() -- Load dataset, initialize state
get_next_item() -- Return the next item from the dataset
format_prompt() -- Convert a dataset item into the user message
compute_reward() -- Score the rollout (has full ToolContext access)
evaluate() -- Periodic evaluation
"""
import asyncio
import json
import logging
import os
import sys
import uuid
from abc import abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union
# Ensure the hermes-agent repo root is on sys.path so that imports like
# `from model_tools import ...` and `from environments.X import ...` work
# regardless of where the script is invoked from.
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from dotenv import load_dotenv
from pydantic import Field
# Load API keys from hermes-agent/.env so all environments can access them
_env_path = _repo_root / ".env"
if _env_path.exists():
load_dotenv(dotenv_path=_env_path)
# Apply monkey patches for async-safe tool operation inside Atropos's event loop.
# This patches SwerexModalEnvironment to use a background thread instead of
# asyncio.run(), which would deadlock inside Atropos. Safe for normal CLI too.
from environments.patches import apply_patches
apply_patches()
from atroposlib.envs.base import (
BaseEnv,
BaseEnvConfig,
ScoredDataGroup,
ScoredDataItem,
)
from atroposlib.envs.server_handling.server_manager import (
APIServerConfig,
ServerBaseline,
ServerManager,
)
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
# Import hermes-agent toolset infrastructure
from model_tools import get_tool_definitions
from toolset_distributions import sample_toolsets_from_distribution
logger = logging.getLogger(__name__)
class HermesAgentEnvConfig(BaseEnvConfig):
"""
Configuration for hermes-agent Atropos environments.
Extends BaseEnvConfig with agent-specific settings for toolsets,
terminal backend, dataset loading, and tool call parsing.
"""
# --- Toolset configuration ---
# Mutually exclusive: use either enabled_toolsets OR distribution
enabled_toolsets: Optional[List[str]] = Field(
default=None,
description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
"If None and distribution is also None, all available toolsets are enabled.",
)
disabled_toolsets: Optional[List[str]] = Field(
default=None,
description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
)
distribution: Optional[str] = Field(
default=None,
description="Name of a toolset distribution from toolset_distributions.py "
"(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
"Mutually exclusive with enabled_toolsets.",
)
# --- Agent loop configuration ---
max_agent_turns: int = Field(
default=30,
description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
)
system_prompt: Optional[str] = Field(
default=None,
description="System prompt for the agent. Tools are handled via the tools= parameter, "
"not embedded in the prompt text.",
)
agent_temperature: float = Field(
default=1.0,
description="Sampling temperature for agent generation during rollouts.",
)
# --- Terminal backend ---
terminal_backend: str = Field(
default="local",
description="Terminal backend: 'local', 'docker', 'modal', 'ssh', 'singularity'. "
"Modal recommended for production RL (cloud isolation per rollout).",
)
terminal_timeout: int = Field(
default=120,
description="Per-command timeout in seconds for terminal tool calls. "
"Commands exceeding this are killed. Increase for tasks with long-running "
"commands (compilation, pip install, etc.).",
)
terminal_lifetime: int = Field(
default=3600,
description="Sandbox inactivity lifetime in seconds. The cleanup thread kills "
"sandboxes that have been idle longer than this. Must be longer than "
"the longest gap between tool calls (e.g., waiting for LLM response).",
)
# --- Dataset ---
dataset_name: Optional[str] = Field(
default=None,
description="HuggingFace dataset name. Optional if tasks are defined inline.",
)
dataset_split: str = Field(
default="train",
description="Dataset split to use.",
)
prompt_field: str = Field(
default="prompt",
description="Which field in the dataset contains the prompt.",
)
# --- Thread pool ---
tool_pool_size: int = Field(
default=128,
description="Thread pool size for tool execution. Each concurrent task needs a "
"thread for tool calls. Must be large enough for parallel evaluation. "
"Too small = thread pool starvation.",
)
# --- Phase 2: Tool call parsing ---
tool_call_parser: str = Field(
default="hermes",
description="Tool call parser name for Phase 2 (VLLM server type). "
"Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
)
# --- Provider-specific parameters ---
# Passed as extra_body to the OpenAI client's chat.completions.create() call.
# Useful for OpenRouter provider preferences, transforms, route settings, etc.
# Example YAML:
# extra_body:
# provider:
# ignore: ["DeepInfra", "Fireworks"]
# order: ["Together"]
# transforms: ["middle-out"]
extra_body: Optional[Dict[str, Any]] = Field(
default=None,
description="Extra body parameters passed to the OpenAI client's "
"chat.completions.create(). Used for OpenRouter provider preferences, "
"transforms, and other provider-specific settings.",
)
class HermesAgentBaseEnv(BaseEnv):
"""
Abstract base environment for hermes-agent Atropos integration.
Handles two modes of operation:
- Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
and reasoning extraction natively. DummyManagedServer provides placeholder
tokens. Good for SFT data gen, verifier testing, evaluation.
- Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
via /generate. Client-side tool call parser reconstructs structured tool_calls
from raw output. Full RL training capability.
Subclasses must implement:
setup() -- Load dataset, initialize state
get_next_item() -- Return the next item to roll out
format_prompt() -- Convert a dataset item into the user message string
compute_reward() -- Score the rollout using ToolContext
evaluate() -- Periodic evaluation
"""
name: Optional[str] = "hermes-agent"
env_config_cls = HermesAgentEnvConfig
def __init__(
self,
config: HermesAgentEnvConfig,
server_configs: Union[ServerBaseline, List[APIServerConfig]],
slurm=False,
testing=False,
):
super().__init__(config, server_configs, slurm, testing)
# Set terminal environment variables so hermes tools pick them up.
# These can all be overridden per-environment via config fields instead
# of requiring users to set shell env vars.
if config.terminal_backend:
os.environ["TERMINAL_ENV"] = config.terminal_backend
os.environ["TERMINAL_TIMEOUT"] = str(config.terminal_timeout)
os.environ["TERMINAL_LIFETIME_SECONDS"] = str(config.terminal_lifetime)
print(
f"🖥️ Terminal: backend={config.terminal_backend}, "
f"timeout={config.terminal_timeout}s, lifetime={config.terminal_lifetime}s"
)
# Resize the agent loop's thread pool for tool execution.
# This must be large enough for the number of concurrent tasks
# (e.g., 89 parallel TB2 eval tasks each need a thread for tool calls).
from environments.agent_loop import resize_tool_pool
resize_tool_pool(config.tool_pool_size)
# Current group's resolved tools (set in collect_trajectories)
self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None
# Tool error tracking for wandb logging
self._tool_error_buffer: List[Dict[str, Any]] = []
# =========================================================================
# Toolset resolution (per-group)
# =========================================================================
def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
"""
Resolve toolsets for a group. Called once in collect_trajectories(),
then shared by all collect_trajectory() calls in the group.
If distribution is set, samples probabilistically.
If enabled_toolsets is set, uses that explicit list.
disabled_toolsets is applied as a filter on top.
Returns:
(tool_schemas, valid_tool_names) tuple
"""
config = self.config
if config.distribution:
group_toolsets = sample_toolsets_from_distribution(config.distribution)
logger.info("Sampled toolsets from '%s': %s", config.distribution, group_toolsets)
else:
group_toolsets = config.enabled_toolsets # None means "all available"
if group_toolsets is None:
logger.warning(
"enabled_toolsets is None -- loading ALL tools including messaging. "
"Set explicit enabled_toolsets for RL training."
)
tools = get_tool_definitions(
enabled_toolsets=group_toolsets,
disabled_toolsets=config.disabled_toolsets,
quiet_mode=True,
)
valid_names = {t["function"]["name"] for t in tools} if tools else set()
logger.info("Resolved %d tools for group: %s", len(valid_names), sorted(valid_names))
return tools, valid_names
# =========================================================================
# Server mode detection
# =========================================================================
def _use_managed_server(self) -> bool:
"""
Determine if we should use ManagedServer (Phase 2) or direct server (Phase 1).
Phase 2 (ManagedServer) is used when the server type is 'vllm' or 'sglang',
which go through the /generate endpoint for exact token tracking.
Phase 1 (direct server) is used for 'openai' server type, which uses
/v1/chat/completions with native tool call parsing.
"""
if not self.server.servers:
return False
server = self.server.servers[0]
# If the server is an OpenAI server (not VLLM/SGLang), use direct mode
from atroposlib.envs.server_handling.openai_server import OpenAIServer
return not isinstance(server, OpenAIServer)
# =========================================================================
# Core Atropos integration
# =========================================================================
async def collect_trajectories(
self, item: Item
) -> Tuple[
Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
List[Item],
]:
"""
Override collect_trajectories to resolve toolsets once per group,
then delegate to the standard group-level collection.
The default BaseEnv.collect_trajectories() calls collect_trajectory()
group_size times in parallel. We resolve tools once here and store
them for all those calls to use.
"""
# Resolve toolsets for this group (shared by all rollouts in the group)
self._current_group_tools = self._resolve_tools_for_group()
# Delegate to the default implementation which calls collect_trajectory()
# group_size times via asyncio.gather
return await super().collect_trajectories(item)
# =========================================================================
# Wandb rollout display -- format trajectories nicely
# =========================================================================
@staticmethod
def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
"""
Format a conversation's messages into a readable trajectory string
for wandb rollout tables. Shows tool calls, tool results, and reasoning
in a structured way instead of raw token decoding.
"""
parts = []
for msg in messages:
role = msg.get("role", "unknown")
content = msg.get("content", "")
if role == "system":
parts.append(f"[SYSTEM]\n{content}")
elif role == "user":
parts.append(f"[USER]\n{content}")
elif role == "assistant":
# Show reasoning if present
reasoning = msg.get("reasoning_content", "")
if reasoning:
# Truncate long reasoning for display
if len(reasoning) > 300:
reasoning = reasoning[:300] + "..."
parts.append(f"[ASSISTANT thinking]\n{reasoning}")
# Show content
if content:
parts.append(f"[ASSISTANT]\n{content}")
# Show tool calls
tool_calls = msg.get("tool_calls", [])
for tc in tool_calls:
func = tc.get("function", {})
name = func.get("name", "?")
args = func.get("arguments", "{}")
# Truncate long arguments for display
if len(args) > 200:
args = args[:200] + "..."
parts.append(f"[TOOL CALL] {name}({args})")
elif role == "tool":
tool_id = msg.get("tool_call_id", "")
result = content
# Truncate long tool results for display
if len(result) > 500:
result = result[:500] + "..."
parts.append(f"[TOOL RESULT] {result}")
return "\n\n".join(parts)
async def add_rollouts_for_wandb(
self,
scored_data,
item=None,
):
"""
Override to show formatted trajectories with tool calls visible,
instead of raw token decoding which loses all structure.
"""
num_keep = self.config.num_rollouts_per_group_for_logging
if num_keep == -1:
num_keep = self.config.group_size
group = []
for i in range(min(num_keep, len(scored_data.get("scores", [])))):
score = scored_data["scores"][i]
# Use messages if available for rich display
messages = None
if scored_data.get("messages") and i < len(scored_data["messages"]):
messages = scored_data["messages"][i]
if messages:
text = self._format_trajectory_for_display(messages)
elif scored_data.get("tokens") and i < len(scored_data["tokens"]):
text = self.tokenizer.decode(scored_data["tokens"][i])
else:
text = "(no data)"
group.append((text, score))
self.rollouts_for_wandb.append(group)
if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
self.rollouts_for_wandb.pop(0)
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log base metrics including tool errors to wandb."""
if wandb_metrics is None:
wandb_metrics = {}
# Log tool error stats
if self._tool_error_buffer:
wandb_metrics["train/tool_errors_count"] = len(self._tool_error_buffer)
# Log error details as a summary string (tables can crash wandb on tmp cleanup)
error_summaries = []
for err in self._tool_error_buffer:
error_summaries.append(
f"[turn {err['turn']}] {err['tool']}({err['args'][:80]}) -> {err['error'][:150]}"
)
wandb_metrics["train/tool_error_details"] = "\n".join(error_summaries)
# Also print to stdout for immediate visibility
for summary in error_summaries:
print(f" Tool Error: {summary}")
self._tool_error_buffer = []
else:
wandb_metrics["train/tool_errors_count"] = 0
await super().wandb_log(wandb_metrics)
async def collect_trajectory(
self, item: Item
) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
"""
Run a single rollout: agent loop + reward computation.
This is called group_size times in parallel by collect_trajectories().
Each call gets its own task_id for terminal/browser session isolation.
"""
task_id = str(uuid.uuid4())
# Get group-level tools (resolved once in collect_trajectories)
if self._current_group_tools is None:
# Fallback: resolve per-trajectory if called outside collect_trajectories
tools, valid_names = self._resolve_tools_for_group()
else:
tools, valid_names = self._current_group_tools
# Build initial messages
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(item)})
# Run the agent loop
result: AgentResult
if self._use_managed_server():
# Phase 2: ManagedServer with parser -- exact tokens + logprobs
# Load the tool call parser from registry based on config
from environments.tool_call_parsers import get_parser
try:
tc_parser = get_parser(self.config.tool_call_parser)
except KeyError:
logger.warning(
"Tool call parser '%s' not found, falling back to 'hermes'",
self.config.tool_call_parser,
)
tc_parser = get_parser("hermes")
try:
async with self.server.managed_server(
tokenizer=self.tokenizer,
tool_call_parser=tc_parser,
) as managed:
agent = HermesAgentLoop(
server=managed,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
except NotImplementedError:
# DummyManagedServer not allowed -- fall back to Phase 1
logger.warning(
"ManagedServer not available (OpenAI server?). "
"Falling back to direct server mode."
)
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
else:
# Phase 1: OpenAI server -- native tool_calls, placeholder tokens
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
# Skip reward computation if the agent loop produced no meaningful work
# (e.g., API call failed on turn 1). No point spinning up a Modal sandbox
# just to verify files that were never created.
only_system_and_user = all(
msg.get("role") in ("system", "user") for msg in result.messages
)
if result.turns_used == 0 or only_system_and_user:
logger.warning(
"Agent loop produced no output (turns=%d, msgs=%d). Skipping reward.",
result.turns_used, len(result.messages),
)
reward = 0.0
else:
# Compute reward using ToolContext (gives verifier full tool access)
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
except Exception as e:
logger.error("compute_reward failed: %s", e)
reward = 0.0
finally:
ctx.cleanup()
# Track tool errors for wandb logging
if result.tool_errors:
for err in result.tool_errors:
self._tool_error_buffer.append({
"turn": err.turn,
"tool": err.tool_name,
"args": err.arguments[:150],
"error": err.error[:300],
"result": err.tool_result[:300],
})
# Build ScoredDataItem from ManagedServer state
# Phase 2: real tokens/masks/logprobs from SequenceNodes
# Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
nodes = (result.managed_state or {}).get("nodes", [])
if nodes:
# Phase 2 (or DummyManagedServer): use actual node data
node = nodes[-1] # Final sequence node = full trajectory
scored_item: Dict[str, Any] = {
"tokens": node.tokens,
"masks": node.masked_tokens,
"scores": reward,
}
# Include logprobs if available (Phase 2)
if hasattr(node, "logprobs") and node.logprobs:
scored_item["advantages"] = None # Computed by trainer
scored_item["ref_logprobs"] = None
else:
# Phase 1 with no managed state: create placeholder tokens
# so the data pipeline doesn't break. These are NOT suitable
# for training but allow process mode (SFT data gen) to work.
# Tokenize the full conversation to get approximate tokens.
full_text = "\n".join(
msg.get("content", "") for msg in result.messages if msg.get("content")
)
if self.tokenizer:
tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
else:
tokens = list(range(min(len(full_text) // 4, 128)))
scored_item = {
"tokens": tokens,
"masks": [-100] + tokens[1:], # Mask first token as prompt
"scores": reward,
}
# Always include messages for wandb rollout display and data logging
scored_item["messages"] = result.messages
return scored_item, []
# =========================================================================
# Abstract methods -- subclasses must implement
# =========================================================================
@abstractmethod
async def setup(self):
"""
Load dataset, initialize state.
Called once when the environment starts. Typical implementation:
self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
self.iter = 0
"""
raise NotImplementedError
@abstractmethod
async def get_next_item(self) -> Item:
"""
Return the next item from the dataset for rollout.
Called by the base env's main loop to get items for workers.
Should cycle through the dataset.
"""
raise NotImplementedError
@abstractmethod
def format_prompt(self, item: Item) -> str:
"""
Convert a dataset item into the user message for the agent.
Args:
item: Dataset item (dict, tuple, etc.)
Returns:
The prompt string to send to the agent
"""
raise NotImplementedError
@abstractmethod
async def compute_reward(
self, item: Item, result: AgentResult, ctx: ToolContext
) -> float:
"""
Score the rollout. Has full access to:
- item: the original dataset item (ground truth, test commands, etc.)
- result: AgentResult with full messages, turn count, reasoning, etc.
- ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
browser, vision...) scoped to this rollout's sandbox. Nothing
is off-limits.
Args:
item: The dataset item that was rolled out
result: The agent's rollout result
ctx: ToolContext with full tool access for verification
Returns:
Reward float (typically 0.0 to 1.0, but any float is valid)
"""
raise NotImplementedError
@abstractmethod
async def evaluate(self, *args, **kwargs):
"""
Periodic evaluation. Called every steps_per_eval steps.
Typical implementation runs the agent on a held-out eval set
and logs metrics via wandb/evaluate_log.
"""
raise NotImplementedError

View File

@@ -1,34 +0,0 @@
# SWE Environment -- Default Configuration
#
# SWE-bench style tasks with Modal sandboxes for cloud isolation.
# Uses terminal + file + web toolsets.
#
# Usage:
# python environments/hermes_swe_env/hermes_swe_env.py serve \
# --config environments/hermes_swe_env/default.yaml
env:
enabled_toolsets: ["terminal", "file", "web"]
max_agent_turns: 30
max_token_length: 4096
group_size: 4
terminal_backend: "modal"
tool_call_parser: "hermes"
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
dataset_name: "bigcode/humanevalpack"
dataset_split: "test"
prompt_field: "prompt"
steps_per_eval: 50
total_steps: 500
use_wandb: true
wandb_name: "hermes-swe"
system_prompt: >
You are a skilled software engineer. You have access to a terminal,
file tools, and web search. Use these tools to complete the coding task.
Write clean, working code and verify it runs correctly before finishing.
openai:
base_url: "http://localhost:8000/v1"
model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
server_type: "openai"
api_key: ""

View File

@@ -1,229 +0,0 @@
"""
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
A concrete environment for software engineering tasks where the model writes code
and the reward function runs tests to verify correctness. Uses Modal terminal
backend for cloud-isolated sandboxes per rollout.
The reward function uses ToolContext.terminal() to run test commands in the same
Modal sandbox the model used during its agentic loop. All filesystem state from
the model's tool calls is preserved for verification.
Usage:
# Phase 1: OpenAI server type
vllm serve YourModel --tool-parser hermes
run-api
python environments/hermes_swe_env.py serve \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel \\
--openai.server_type openai \\
--env.dataset_name bigcode/humanevalpack \\
--env.terminal_backend modal
# Phase 2: VLLM server type (full RL training)
python environments/hermes_swe_env.py serve \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel \\
--openai.server_type vllm \\
--env.tool_call_parser hermes \\
--env.terminal_backend modal
"""
import logging
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
# Ensure repo root is on sys.path for imports
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from datasets import load_dataset
from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext
logger = logging.getLogger(__name__)
class HermesSweEnvConfig(HermesAgentEnvConfig):
"""Config with defaults for SWE-bench style tasks."""
pass # Inherits all fields, overrides defaults in config_init
class HermesSweEnv(HermesAgentBaseEnv):
"""
SWE-bench style environment using Modal terminal backend.
The model gets a coding task, uses terminal + file + web tools to solve it,
and the reward function runs tests in the same Modal sandbox to verify.
Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
and customize format_prompt() and compute_reward() as needed.
"""
name = "hermes-swe"
env_config_cls = HermesSweEnvConfig
@classmethod
def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
"""
Default configuration for the SWE environment.
Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.
"""
env_config = HermesSweEnvConfig(
# Toolsets: terminal for running code, file for reading/writing, web for docs
enabled_toolsets=["terminal", "file", "web"],
disabled_toolsets=None,
distribution=None,
# Agent settings -- SWE tasks need more turns
max_agent_turns=30,
max_token_length=4096,
agent_temperature=1.0,
system_prompt=(
"You are a skilled software engineer. You have access to a terminal, "
"file tools, and web search. Use these tools to complete the coding task. "
"Write clean, working code and verify it runs correctly before finishing."
),
# Modal backend for cloud-isolated sandboxes
terminal_backend="modal",
# Dataset -- override via CLI for your specific SWE dataset
dataset_name="bigcode/humanevalpack",
dataset_split="test",
prompt_field="prompt",
# Atropos settings
group_size=4,
tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
tool_call_parser="hermes",
steps_per_eval=50,
total_steps=500,
use_wandb=True,
wandb_name="hermes-swe",
)
server_configs = [
APIServerConfig(
base_url="http://localhost:8000/v1",
model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
server_type="openai", # Phase 1; switch to "vllm" for Phase 2
api_key="",
)
]
return env_config, server_configs
async def setup(self):
"""Load the SWE dataset."""
if self.config.dataset_name:
self.dataset = load_dataset(
self.config.dataset_name, split=self.config.dataset_split
)
else:
# Placeholder if no dataset specified
self.dataset = []
self.iter = 0
self.reward_buffer: List[float] = []
async def get_next_item(self) -> Dict[str, Any]:
"""Cycle through the SWE dataset."""
if not self.dataset:
raise ValueError("No dataset loaded. Set dataset_name in config.")
item = self.dataset[self.iter % len(self.dataset)]
self.iter += 1
return item
def format_prompt(self, item: Dict[str, Any]) -> str:
"""
Format the SWE task prompt.
Override this in subclasses for different dataset formats.
Default assumes the dataset has a 'prompt' field and optionally a 'test' field.
"""
prompt = item.get(self.config.prompt_field, "")
# If the dataset has test information, include it in the prompt
test_info = item.get("test", item.get("test_code", item.get("tests", "")))
if test_info:
prompt += f"\n\nTests to pass:\n{test_info}"
return prompt
async def compute_reward(
self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
) -> float:
"""
Score by running tests in the model's Modal sandbox.
Default implementation:
- If the dataset item has a 'test' or 'test_code' field, run it
- Check exit code: 0 = pass, non-zero = fail
- Partial credit for file creation
Override this in subclasses for more sophisticated reward logic.
"""
# Find the test command from the dataset item
test_code = item.get("test", item.get("test_code", item.get("tests", "")))
if test_code:
# Run the test in the model's sandbox
test_result = ctx.terminal(
f'cd /workspace && python3 -c "{test_code}"', timeout=60
)
if test_result["exit_code"] == 0:
self.reward_buffer.append(1.0)
return 1.0
# Partial credit: check if the model created any Python files
file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
self.reward_buffer.append(0.1)
return 0.1
self.reward_buffer.append(0.0)
return 0.0
async def evaluate(self, *args, **kwargs):
"""
Run evaluation on a held-out set.
Override for dataset-specific evaluation logic.
"""
start_time = time.time()
end_time = time.time()
eval_metrics = {"eval/placeholder": 0.0}
await self.evaluate_log(
metrics=eval_metrics,
start_time=start_time,
end_time=end_time,
)
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log SWE-specific metrics."""
if wandb_metrics is None:
wandb_metrics = {}
if self.reward_buffer:
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
self.reward_buffer
)
wandb_metrics["train/pass_rate"] = sum(
1 for r in self.reward_buffer if r == 1.0
) / len(self.reward_buffer)
self.reward_buffer = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
HermesSweEnv.cli()

View File

@@ -1,188 +0,0 @@
"""
Monkey patches for making hermes-agent tools work inside async frameworks (Atropos).
Problem:
Some tools use asyncio.run() internally (e.g., mini-swe-agent's Modal backend,
web_extract). This crashes when called from inside Atropos's event loop because
asyncio.run() can't be nested.
Solution:
Replace the problematic methods with versions that use a dedicated background
thread with its own event loop. The calling code sees the same sync interface --
call a function, get a result -- but internally the async work happens on a
separate thread that doesn't conflict with Atropos's loop.
These patches are safe for normal CLI use too: when there's no running event
loop, the behavior is identical (the background thread approach works regardless).
What gets patched:
- SwerexModalEnvironment.__init__ -- creates Modal deployment on a background thread
- SwerexModalEnvironment.execute -- runs commands on the same background thread
- SwerexModalEnvironment.stop -- stops deployment on the background thread
Usage:
Call apply_patches() once at import time (done automatically by hermes_base_env.py).
This is idempotent -- calling it multiple times is safe.
"""
import asyncio
import logging
import threading
from typing import Any
logger = logging.getLogger(__name__)
_patches_applied = False
class _AsyncWorker:
"""
A dedicated background thread with its own event loop.
Allows sync code to submit async coroutines and block for results,
even when called from inside another running event loop. Used to
bridge sync tool interfaces with async backends (Modal, SWE-ReX).
"""
def __init__(self):
self._loop: asyncio.AbstractEventLoop = None
self._thread: threading.Thread = None
self._started = threading.Event()
def start(self):
"""Start the background event loop thread."""
self._thread = threading.Thread(target=self._run_loop, daemon=True)
self._thread.start()
self._started.wait(timeout=30)
def _run_loop(self):
"""Background thread entry point -- runs the event loop forever."""
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
self._started.set()
self._loop.run_forever()
def run_coroutine(self, coro, timeout=600):
"""
Submit a coroutine to the background loop and block until it completes.
Safe to call from any thread, including threads that already have
a running event loop.
"""
if self._loop is None or self._loop.is_closed():
raise RuntimeError("AsyncWorker loop is not running")
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result(timeout=timeout)
def stop(self):
"""Stop the background event loop and join the thread."""
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
if self._thread:
self._thread.join(timeout=10)
def _patch_swerex_modal():
"""
Monkey patch SwerexModalEnvironment to use a background thread event loop
instead of asyncio.run(). This makes it safe to call from inside Atropos's
async event loop.
The patched methods have the exact same interface and behavior -- the only
difference is HOW the async work is executed internally.
"""
try:
from minisweagent.environments.extra.swerex_modal import (
SwerexModalEnvironment,
SwerexModalEnvironmentConfig,
)
from swerex.deployment.modal import ModalDeployment
from swerex.runtime.abstract import Command as RexCommand
except ImportError:
# mini-swe-agent or swe-rex not installed -- nothing to patch
logger.debug("mini-swe-agent Modal backend not available, skipping patch")
return
# Save original methods so we can refer to config handling
_original_init = SwerexModalEnvironment.__init__
def _patched_init(self, **kwargs):
"""Patched __init__: creates Modal deployment on a background thread."""
self.config = SwerexModalEnvironmentConfig(**kwargs)
# Start a dedicated event loop thread for all Modal async operations
self._worker = _AsyncWorker()
self._worker.start()
# Create AND start the deployment entirely on the worker's loop/thread
# so all gRPC channels and async state are bound to that loop
async def _create_and_start():
deployment = ModalDeployment(
image=self.config.image,
startup_timeout=self.config.startup_timeout,
runtime_timeout=self.config.runtime_timeout,
deployment_timeout=self.config.deployment_timeout,
install_pipx=self.config.install_pipx,
modal_sandbox_kwargs=self.config.modal_sandbox_kwargs,
)
await deployment.start()
return deployment
self.deployment = self._worker.run_coroutine(_create_and_start())
def _patched_execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
"""Patched execute: runs commands on the background thread's loop."""
async def _do_execute():
return await self.deployment.runtime.execute(
RexCommand(
command=command,
shell=True,
check=False,
cwd=cwd or self.config.cwd,
timeout=timeout or self.config.timeout,
merge_output_streams=True,
env=self.config.env if self.config.env else None,
)
)
output = self._worker.run_coroutine(_do_execute())
return {
"output": output.stdout,
"returncode": output.exit_code,
}
def _patched_stop(self):
"""Patched stop: stops deployment on the background thread, then stops the thread."""
try:
self._worker.run_coroutine(
asyncio.wait_for(self.deployment.stop(), timeout=10),
timeout=15,
)
except Exception:
pass
finally:
self._worker.stop()
# Apply the patches
SwerexModalEnvironment.__init__ = _patched_init
SwerexModalEnvironment.execute = _patched_execute
SwerexModalEnvironment.stop = _patched_stop
logger.debug("Patched SwerexModalEnvironment for async-safe operation")
def apply_patches():
"""
Apply all monkey patches needed for Atropos compatibility.
Safe to call multiple times -- patches are only applied once.
Safe for normal CLI use -- patched code works identically when
there is no running event loop.
"""
global _patches_applied
if _patches_applied:
return
_patch_swerex_modal()
_patches_applied = True

View File

@@ -1,34 +0,0 @@
# Terminal Test Environment -- Default Configuration
#
# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
# Uses Modal terminal backend and OpenRouter (Claude) for inference.
# API keys loaded from ~/hermes-agent/.env
#
# Usage:
# run-api
# python environments/terminal_test_env/terminal_test_env.py serve \
# --config environments/terminal_test_env/default.yaml
env:
enabled_toolsets: ["terminal", "file"]
max_agent_turns: 10
max_token_length: 2048
group_size: 3
total_steps: 3
steps_per_eval: 3
terminal_backend: "modal"
tool_call_parser: "hermes"
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
ensure_scores_are_not_same: false
use_wandb: false
system_prompt: >
You are a helpful assistant with access to a terminal and file tools.
Complete the user's request by using the available tools.
Be precise and follow instructions exactly.
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-opus-4.6"
server_type: "openai"
health_check: false
# api_key loaded from OPENROUTER_API_KEY in .env

View File

@@ -1,292 +0,0 @@
"""
TerminalTestEnv -- Simple Test Environment for Validating the Stack
A self-contained environment with inline tasks (no external dataset needed).
Each task asks the model to create a file at a known path with specific content.
The reward verifier cats the file and checks if the content matches.
Enables only terminal + file toolsets. Uses Modal terminal backend with
OpenRouter (Claude) by default.
Training tasks (3):
1. Create ~/greeting.txt with "Hello from Hermes Agent"
2. Create ~/count.txt with numbers 1-5, one per line
3. Create ~/answer.txt with the result of 123 + 456
Eval task (1):
1. Create ~/result.txt with the result of 6 * 7
Usage:
# Start Atropos API server
run-api
# Run environment (uses OpenRouter + Modal by default)
python environments/terminal_test_env.py serve
# Process mode (no run-api needed, saves to JSONL)
python environments/terminal_test_env.py process \\
--env.data_path_to_save_groups terminal_test_output.jsonl
"""
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
# Ensure repo root is on sys.path for imports
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext
logger = logging.getLogger(__name__)
# =============================================================================
# Inline task definitions -- no external dataset needed
# =============================================================================
TRAIN_TASKS = [
{
"prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
"verify_path": "~/greeting.txt",
"expected_content": "Hello from Hermes Agent",
},
{
"prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
"verify_path": "~/count.txt",
"expected_content": "1\n2\n3\n4\n5",
},
{
"prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
"verify_path": "~/answer.txt",
"expected_content": "579",
},
]
EVAL_TASKS = [
{
"prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
"verify_path": "~/result.txt",
"expected_content": "42",
},
]
class TerminalTestEnvConfig(HermesAgentEnvConfig):
"""Config with defaults suitable for terminal testing."""
pass # Inherits all fields, overrides defaults in config_init
class TerminalTestEnv(HermesAgentBaseEnv):
"""
Simple test environment with inline file-creation tasks.
All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
against the expected string. Same verifier logic for all tasks.
This environment is designed to validate the full stack end-to-end:
- Agent loop executes tool calls (terminal/file)
- ToolContext provides terminal access to the reward function
- Reward function verifies file content via cat
- Scored data flows through the Atropos pipeline
"""
name = "terminal-test"
env_config_cls = TerminalTestEnvConfig
@classmethod
def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
"""
Default configuration for the terminal test environment.
Uses Modal terminal backend for cloud isolation and OpenRouter with
Claude for inference. API keys loaded from ~/hermes-agent/.env.
"""
env_config = TerminalTestEnvConfig(
# Terminal + file tools only
enabled_toolsets=["terminal", "file"],
disabled_toolsets=None,
distribution=None,
# Agent settings
max_agent_turns=10, # Simple tasks, don't need many turns
max_token_length=16000,
agent_temperature=1.0,
system_prompt=(
"You are a helpful assistant with access to a terminal and file tools. "
"Complete the user's request by using the available tools. "
"Be precise and follow instructions exactly."
),
# Modal terminal backend for cloud-isolated sandboxes per rollout
terminal_backend="modal",
# Atropos settings
group_size=3, # 3 rollouts per group
tokenizer_name="NousResearch/q-30b-t-h45-e1",
tool_call_parser="hermes",
steps_per_eval=3, # Eval after all 3 steps
total_steps=3, # 3 groups total (1 group per step)
use_wandb=True,
wandb_name="terminal-test",
ensure_scores_are_not_same=False, # Allow all-same scores for simple tasks
# No external dataset
dataset_name=None,
)
# OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
server_configs = [
APIServerConfig(
base_url="https://openrouter.ai/api/v1",
model_name="anthropic/claude-opus-4.6",
server_type="openai",
api_key=os.getenv("OPENROUTER_API_KEY", ""),
health_check=False, # OpenRouter doesn't have a /health endpoint
)
]
return env_config, server_configs
async def setup(self):
"""Initialize inline task lists."""
self.train_tasks = list(TRAIN_TASKS)
self.eval_tasks = list(EVAL_TASKS)
self.iter = 0
# Track reward stats for wandb logging
self.reward_buffer: List[float] = []
async def get_next_item(self) -> Dict[str, str]:
"""Cycle through training tasks."""
item = self.train_tasks[self.iter % len(self.train_tasks)]
self.iter += 1
return item
def format_prompt(self, item: Dict[str, str]) -> str:
"""The prompt is directly in the task item."""
return item["prompt"]
async def compute_reward(
self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
) -> float:
"""
Verify by cat-ing the expected file path and checking content matches.
Same verifier for all tasks -- they all write a file at a known path.
Scoring:
1.0 = exact match
0.5 = expected content is present but has extra stuff
0.0 = file doesn't exist or content doesn't match
"""
verify_result = ctx.terminal(f"cat {item['verify_path']}")
# File doesn't exist or can't be read
if verify_result["exit_code"] != 0:
self.reward_buffer.append(0.0)
return 0.0
actual = verify_result.get("output", "").strip()
expected = item["expected_content"].strip()
# Exact match
if actual == expected:
self.reward_buffer.append(1.0)
return 1.0
# Partial credit: expected content is present but has extra stuff
if expected in actual:
self.reward_buffer.append(0.5)
return 0.5
self.reward_buffer.append(0.0)
return 0.0
async def evaluate(self, *args, **kwargs):
"""
Run eval tasks using the agent loop and verify results.
Logs accuracy metrics.
"""
start_time = time.time()
correct = 0
total = len(self.eval_tasks)
samples = []
for eval_item in self.eval_tasks:
try:
# For eval, we do a simple single-turn completion (not full agent loop)
# to keep eval fast. The agent loop is tested via training.
completion = await self.server.chat_completion(
messages=[
{"role": "system", "content": self.config.system_prompt or ""},
{"role": "user", "content": eval_item["prompt"]},
],
n=1,
max_tokens=self.config.max_token_length,
temperature=0.0,
split="eval",
)
response_content = (
completion.choices[0].message.content if completion.choices else ""
)
samples.append(
{
"prompt": eval_item["prompt"],
"response": response_content,
"expected": eval_item["expected_content"],
}
)
except Exception as e:
logger.error("Eval failed for item: %s", e)
samples.append(
{
"prompt": eval_item["prompt"],
"response": f"ERROR: {e}",
"expected": eval_item["expected_content"],
}
)
end_time = time.time()
eval_metrics = {
"eval/num_samples": total,
}
await self.evaluate_log(
metrics=eval_metrics,
samples=samples,
start_time=start_time,
end_time=end_time,
)
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log training metrics including reward stats and accuracy."""
if wandb_metrics is None:
wandb_metrics = {}
if self.reward_buffer:
total = len(self.reward_buffer)
correct = sum(1 for r in self.reward_buffer if r == 1.0)
partial = sum(1 for r in self.reward_buffer if r == 0.5)
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
wandb_metrics["train/accuracy"] = correct / total
wandb_metrics["train/partial_match_rate"] = partial / total
wandb_metrics["train/total_rollouts"] = total
self.reward_buffer = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
TerminalTestEnv.cli()

View File

@@ -1,120 +0,0 @@
"""
Tool Call Parser Registry
Client-side parsers that extract structured tool_calls from raw model output text.
Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
raw text without tool call parsing.
Each parser is a standalone reimplementation of the corresponding VLLM parser's
non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
(re, json, uuid) and openai types.
Usage:
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
content, tool_calls = parser.parse(raw_model_output)
# content = text with tool call markup stripped
# tool_calls = list of ChatCompletionMessageToolCall objects, or None
"""
import logging
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple, Type
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
)
logger = logging.getLogger(__name__)
# Type alias for parser return value
ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
class ToolCallParser(ABC):
"""
Base class for tool call parsers.
Each parser knows how to extract structured tool_calls from a specific
model family's raw output text format.
"""
@abstractmethod
def parse(self, text: str) -> ParseResult:
"""
Parse raw model output text for tool calls.
Args:
text: Raw decoded text from the model's completion
Returns:
Tuple of (content, tool_calls) where:
- content: text with tool call markup stripped (the message 'content' field),
or None if the entire output was tool calls
- tool_calls: list of ChatCompletionMessageToolCall objects,
or None if no tool calls were found
"""
raise NotImplementedError
# Global parser registry: name -> parser class
PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
def register_parser(name: str):
"""
Decorator to register a parser class under a given name.
Usage:
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
...
"""
def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
PARSER_REGISTRY[name] = cls
return cls
return decorator
def get_parser(name: str) -> ToolCallParser:
"""
Get a parser instance by name.
Args:
name: Parser name (e.g., "hermes", "mistral", "llama3_json")
Returns:
Instantiated parser
Raises:
KeyError: If parser name is not found in registry
"""
if name not in PARSER_REGISTRY:
available = sorted(PARSER_REGISTRY.keys())
raise KeyError(
f"Tool call parser '{name}' not found. Available parsers: {available}"
)
return PARSER_REGISTRY[name]()
def list_parsers() -> List[str]:
"""Return sorted list of registered parser names."""
return sorted(PARSER_REGISTRY.keys())
# Import all parser modules to trigger registration via @register_parser decorators
# Each module registers itself when imported
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401

View File

@@ -1,71 +0,0 @@
"""
DeepSeek V3.1 tool call parser.
Similar to V3 but with a slightly different format:
<tool▁call▁begin>function_name<tool▁sep>arguments<tool▁call▁end>
Note: V3 has type+name before the separator, V3.1 has name before and args after.
Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("deepseek_v3_1")
@register_parser("deepseek_v31")
class DeepSeekV31ToolCallParser(ToolCallParser):
"""
Parser for DeepSeek V3.1 tool calls.
Slightly different regex than V3: function_name comes before the separator,
arguments come after (no type field, no json code block wrapper).
"""
START_TOKEN = "<tool▁calls▁begin>"
# Regex captures: function_name, function_arguments
PATTERN = re.compile(
r"<tool▁call▁begin>(?P<function_name>.*?)<tool▁sep>(?P<function_arguments>.*?)<tool▁call▁end>"
)
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
func_name, func_args = match
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name.strip(),
arguments=func_args.strip(),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,75 +0,0 @@
"""
DeepSeek V3 tool call parser.
Format uses special unicode tokens:
<tool▁calls▁begin>
<tool▁call▁begin>type<tool▁sep>function_name
```json
{"arg": "value"}
```
<tool▁call▁end>
<tool▁calls▁end>
Based on VLLM's DeepSeekV3ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("deepseek_v3")
class DeepSeekV3ToolCallParser(ToolCallParser):
"""
Parser for DeepSeek V3 tool calls.
Uses special unicode tokens with fullwidth angle brackets and block elements.
Extracts type, function name, and JSON arguments from the structured format.
"""
START_TOKEN = "<tool▁calls▁begin>"
# Regex captures: type, function_name, function_arguments
PATTERN = re.compile(
r"<tool▁call▁begin>(?P<type>.*)<tool▁sep>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<tool▁call▁end>"
)
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
tc_type, func_name, func_args = match
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name.strip(),
arguments=func_args.strip(),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the tool calls section
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,109 +0,0 @@
"""
GLM 4.5 (GLM-4-MoE) tool call parser.
Format uses custom arg_key/arg_value tags rather than standard JSON:
<tool_call>function_name
<arg_key>param1</arg_key><arg_value>value1</arg_value>
<arg_key>param2</arg_key><arg_value>value2</arg_value>
</tool_call>
Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
"""
import ast
import json
import re
import uuid
from typing import Any, Dict, List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _deserialize_value(value: str) -> Any:
"""
Try to deserialize a string value to its native Python type.
Attempts json.loads, then ast.literal_eval, then returns raw string.
"""
try:
return json.loads(value)
except (json.JSONDecodeError, TypeError):
pass
try:
return ast.literal_eval(value)
except (ValueError, SyntaxError, TypeError):
pass
return value
@register_parser("glm45")
class Glm45ToolCallParser(ToolCallParser):
"""
Parser for GLM 4.5 (GLM-4-MoE) tool calls.
Uses <tool_call>...</tool_call> tags with <arg_key>/<arg_value> pairs
instead of standard JSON arguments.
"""
FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
FUNC_ARG_REGEX = re.compile(
r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
)
START_TOKEN = "<tool_call>"
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matched_calls = self.FUNC_CALL_REGEX.findall(text)
if not matched_calls:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matched_calls:
detail = self.FUNC_DETAIL_REGEX.search(match)
if not detail:
continue
func_name = detail.group(1).strip()
func_args_raw = detail.group(2)
# Parse arg_key/arg_value pairs
pairs = self.FUNC_ARG_REGEX.findall(func_args_raw) if func_args_raw else []
arg_dict: Dict[str, Any] = {}
for key, value in pairs:
arg_key = key.strip()
arg_val = _deserialize_value(value.strip())
arg_dict[arg_key] = arg_val
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name,
arguments=json.dumps(arg_dict, ensure_ascii=False),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,35 +0,0 @@
"""
GLM 4.7 tool call parser.
Same as GLM 4.5 but with slightly different regex patterns.
The tool_call tags may wrap differently and arg parsing handles
newlines between key/value pairs.
Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
"""
import re
from environments.tool_call_parsers import ParseResult, register_parser
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
@register_parser("glm47")
class Glm47ToolCallParser(Glm45ToolCallParser):
"""
Parser for GLM 4.7 tool calls.
Extends GLM 4.5 with updated regex patterns.
"""
def __init__(self):
super().__init__()
# GLM 4.7 uses a slightly different detail regex that includes
# the <tool_call> wrapper and optional arg_key content
self.FUNC_DETAIL_REGEX = re.compile(
r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
)
# GLM 4.7 handles newlines between arg_key and arg_value tags
self.FUNC_ARG_REGEX = re.compile(
r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
re.DOTALL,
)

View File

@@ -1,73 +0,0 @@
"""
Hermes tool call parser.
Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
"""
import json
import re
import uuid
from typing import List, Optional, Tuple
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
"""
Parser for Hermes-format tool calls.
Matches <tool_call>...</tool_call> tags containing JSON with "name" and "arguments".
Also handles unclosed <tool_call> at end-of-string (truncated generation).
"""
# Matches both closed and unclosed tool_call tags
PATTERN = re.compile(
r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
)
def parse(self, text: str) -> ParseResult:
if "<tool_call>" not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
# match is a tuple: (closed_content, unclosed_content)
raw_json = match[0] if match[0] else match[1]
if not raw_json.strip():
continue
tc_data = json.loads(raw_json)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=tc_data["name"],
arguments=json.dumps(
tc_data.get("arguments", {}), ensure_ascii=False
),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the first <tool_call> tag
content = text[: text.find("<tool_call>")].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,93 +0,0 @@
"""
Kimi K2 tool call parser.
Format:
<|tool_calls_section_begin|>
<|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
<|tool_calls_section_end|>
The function_id format is typically "functions.func_name:index" or "func_name:index".
Based on VLLM's KimiK2ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("kimi_k2")
class KimiK2ToolCallParser(ToolCallParser):
"""
Parser for Kimi K2 tool calls.
Uses section begin/end tokens wrapping individual tool call begin/end tokens.
The tool_call_id contains the function name (after last dot, before colon).
"""
# Support both singular and plural variants
START_TOKENS = [
"<|tool_calls_section_begin|>",
"<|tool_call_section_begin|>",
]
# Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments
PATTERN = re.compile(
r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
r"<\|tool_call_argument_begin\|>\s*"
r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
r"<\|tool_call_end\|>",
re.DOTALL,
)
def parse(self, text: str) -> ParseResult:
# Check for any variant of the start token
has_start = any(token in text for token in self.START_TOKENS)
if not has_start:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
function_id, function_args = match
# Extract function name from ID format: "functions.get_weather:0" -> "get_weather"
function_name = function_id.split(":")[0].split(".")[-1]
tool_calls.append(
ChatCompletionMessageToolCall(
id=function_id, # Preserve the original ID format
type="function",
function=Function(
name=function_name,
arguments=function_args.strip(),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the tool calls section
earliest_start = len(text)
for token in self.START_TOKENS:
idx = text.find(token)
if idx >= 0 and idx < earliest_start:
earliest_start = idx
content = text[:earliest_start].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,96 +0,0 @@
"""
Llama 3.x / 4 tool call parser.
Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
by content or semicolons.
Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("llama3_json")
@register_parser("llama4_json")
class LlamaToolCallParser(ToolCallParser):
"""
Parser for Llama 3.x and 4 JSON-format tool calls.
Finds JSON objects containing "name" + ("arguments" or "parameters") keys.
Uses Python's json.JSONDecoder.raw_decode for robust extraction of
JSON objects from mixed text.
"""
BOT_TOKEN = "<|python_tag|>"
# Regex to find the start of potential JSON objects
JSON_START = re.compile(r"\{")
def parse(self, text: str) -> ParseResult:
# Quick check: need either the bot token or a JSON brace
if self.BOT_TOKEN not in text and "{" not in text:
return text, None
try:
decoder = json.JSONDecoder()
tool_calls: List[ChatCompletionMessageToolCall] = []
end_index = -1 # Track where the last parsed JSON ended
for match in self.JSON_START.finditer(text):
start = match.start()
# Skip if this brace is inside a previously parsed JSON object
if start <= end_index:
continue
try:
obj, json_end = decoder.raw_decode(text[start:])
end_index = start + json_end
# Must have "name" and either "arguments" or "parameters"
name = obj.get("name")
args = obj.get("arguments", obj.get("parameters"))
if not name or args is None:
continue
# Normalize arguments to JSON string
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
elif not isinstance(args, str):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(name=name, arguments=args),
)
)
except (json.JSONDecodeError, KeyError, ValueError):
continue
if not tool_calls:
return text, None
# Content is everything before the first tool call JSON
# Find where the first tool call starts in the text
first_tc_start = text.find("{")
if self.BOT_TOKEN in text:
first_tc_start = text.find(self.BOT_TOKEN)
content = text[:first_tc_start].strip() if first_tc_start > 0 else None
return content, tool_calls
except Exception:
return text, None

View File

@@ -1,69 +0,0 @@
"""
Longcat Flash Chat tool call parser.
Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("longcat")
class LongcatToolCallParser(ToolCallParser):
"""
Parser for Longcat Flash Chat tool calls.
Identical logic to Hermes, just different tag names.
"""
PATTERN = re.compile(
r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
re.DOTALL,
)
def parse(self, text: str) -> ParseResult:
if "<longcat_tool_call>" not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
raw_json = match[0] if match[0] else match[1]
if not raw_json.strip():
continue
tc_data = json.loads(raw_json)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=tc_data["name"],
arguments=json.dumps(
tc_data.get("arguments", {}), ensure_ascii=False
),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find("<longcat_tool_call>")].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,130 +0,0 @@
"""
Mistral tool call parser.
Supports two formats depending on tokenizer version:
- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
- v11+: content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
Based on VLLM's MistralToolParser.extract_tool_calls()
The [TOOL_CALLS] token is the bot_token used by Mistral models.
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _generate_mistral_id() -> str:
"""Mistral tool call IDs are 9-char alphanumeric strings."""
import random
import string
return "".join(random.choices(string.ascii_letters + string.digits, k=9))
@register_parser("mistral")
class MistralToolCallParser(ToolCallParser):
"""
Parser for Mistral-format tool calls.
Detects format by checking if the content after [TOOL_CALLS] starts with '['
(pre-v11 JSON array) or with a tool name (v11+ format).
"""
# The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
BOT_TOKEN = "[TOOL_CALLS]"
# Fallback regex for pre-v11 format when JSON parsing fails
TOOL_CALL_REGEX = re.compile(r"\[?\s*(\{.*?\})\s*\]?", re.DOTALL)
def parse(self, text: str) -> ParseResult:
if self.BOT_TOKEN not in text:
return text, None
try:
parts = text.split(self.BOT_TOKEN)
content = parts[0].strip()
raw_tool_calls = parts[1:]
# Detect format: if the first raw part starts with '[', it's pre-v11
first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
is_pre_v11 = first_raw.startswith("[") or first_raw.startswith("{")
tool_calls: List[ChatCompletionMessageToolCall] = []
if not is_pre_v11:
# v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
for raw in raw_tool_calls:
raw = raw.strip()
if not raw or "{" not in raw:
continue
brace_idx = raw.find("{")
tool_name = raw[:brace_idx].strip()
args_str = raw[brace_idx:]
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(name=tool_name, arguments=args_str),
)
)
else:
# Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
try:
parsed = json.loads(first_raw)
if isinstance(parsed, dict):
parsed = [parsed]
for tc in parsed:
args = tc.get("arguments", {})
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(
name=tc["name"], arguments=args
),
)
)
except json.JSONDecodeError:
# Fallback regex extraction
match = self.TOOL_CALL_REGEX.findall(first_raw)
if match:
for raw_json in match:
try:
tc = json.loads(raw_json)
args = tc.get("arguments", {})
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(
name=tc["name"], arguments=args
),
)
)
except (json.JSONDecodeError, KeyError):
continue
if not tool_calls:
return text, None
return content if content else None, tool_calls
except Exception:
return text, None

View File

@@ -1,163 +0,0 @@
"""
Qwen3-Coder tool call parser.
Format uses XML-style nested tags:
<tool_call>
<function=function_name>
<parameter=param_name>value</parameter>
<parameter=param_name2>value2</parameter>
</function>
</tool_call>
Parameters are extracted from <parameter=name>value</parameter> tags and
type-converted using the schema if available, otherwise treated as strings.
Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
"""
import ast
import json
import re
import uuid
from typing import Any, Dict, List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _try_convert_value(value: str) -> Any:
"""
Try to convert a parameter value string to a native Python type.
Handles null, numbers, booleans, JSON objects/arrays, and falls back to string.
"""
stripped = value.strip()
# Handle null
if stripped.lower() == "null":
return None
# Try JSON first (handles objects, arrays, strings, numbers, booleans)
try:
return json.loads(stripped)
except (json.JSONDecodeError, TypeError):
pass
# Try Python literal eval (handles tuples, etc.)
try:
return ast.literal_eval(stripped)
except (ValueError, SyntaxError, TypeError):
pass
# Return as string
return stripped
@register_parser("qwen3_coder")
class Qwen3CoderToolCallParser(ToolCallParser):
"""
Parser for Qwen3-Coder XML-format tool calls.
Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
"""
START_TOKEN = "<tool_call>"
FUNCTION_PREFIX = "<function="
# Find complete tool_call blocks (or unclosed at end)
TOOL_CALL_REGEX = re.compile(
r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
)
# Find function blocks within a tool_call
FUNCTION_REGEX = re.compile(
r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
)
# Find parameter blocks within a function
PARAMETER_REGEX = re.compile(
r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
re.DOTALL,
)
def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
"""Parse a single <function=name>...</function> block into a ToolCall."""
try:
# Extract function name: everything before the first '>'
gt_idx = function_str.index(">")
func_name = function_str[:gt_idx].strip()
params_str = function_str[gt_idx + 1:]
# Extract parameters
param_dict: Dict[str, Any] = {}
for match_text in self.PARAMETER_REGEX.findall(params_str):
if ">" not in match_text:
continue
eq_idx = match_text.index(">")
param_name = match_text[:eq_idx].strip()
param_value = match_text[eq_idx + 1:]
# Clean up whitespace
if param_value.startswith("\n"):
param_value = param_value[1:]
if param_value.endswith("\n"):
param_value = param_value[:-1]
param_dict[param_name] = _try_convert_value(param_value)
return ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=Function(
name=func_name,
arguments=json.dumps(param_dict, ensure_ascii=False),
),
)
except (ValueError, IndexError):
return None
def parse(self, text: str) -> ParseResult:
if self.FUNCTION_PREFIX not in text:
return text, None
try:
# Find all tool_call blocks
tc_matches = self.TOOL_CALL_REGEX.findall(text)
raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches]
# Fallback: if no tool_call tags, try the whole text
if not raw_blocks:
raw_blocks = [text]
# Find function blocks within each tool_call
function_strs: List[str] = []
for block in raw_blocks:
func_matches = self.FUNCTION_REGEX.findall(block)
function_strs.extend(m[0] if m[0] else m[1] for m in func_matches)
if not function_strs:
return text, None
# Parse each function call
tool_calls: List[ChatCompletionMessageToolCall] = []
for func_str in function_strs:
tc = self._parse_function_call(func_str)
if tc is not None:
tool_calls.append(tc)
if not tool_calls:
return text, None
# Content before tool calls
first_tc = text.find(self.START_TOKEN)
if first_tc < 0:
first_tc = text.find(self.FUNCTION_PREFIX)
content = text[:first_tc].strip() if first_tc > 0 else None
return content, tool_calls
except Exception:
return text, None

View File

@@ -1,19 +0,0 @@
"""
Qwen 2.5 tool call parser.
Uses the same <tool_call> format as Hermes.
Registered as a separate parser name for clarity when using --tool-parser=qwen.
"""
from environments.tool_call_parsers import register_parser
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
@register_parser("qwen")
class QwenToolCallParser(HermesToolCallParser):
"""
Parser for Qwen 2.5 tool calls.
Same <tool_call>{"name": ..., "arguments": ...}</tool_call> format as Hermes.
"""
pass # Identical format -- inherits everything from Hermes

View File

@@ -1,473 +0,0 @@
"""
ToolContext -- Unrestricted Tool Access for Reward Functions
A per-rollout handle that gives reward/verification functions direct access to
ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
the terminal/browser session is the SAME one the model used during its rollout --
all state (files, processes, browser tabs) is preserved.
The verifier author decides which tools to use. Nothing is hardcoded or gated.
Example usage in a compute_reward():
async def compute_reward(self, item, result, ctx):
# Run tests in the model's terminal sandbox
test = ctx.terminal("pytest -v")
if test["exit_code"] == 0:
return 1.0
# Check if a file was created
content = ctx.read_file("/workspace/solution.py")
if content.get("content"):
return 0.5
return 0.0
"""
import json
import logging
import os
from typing import Any, Dict, List, Optional
import asyncio
import concurrent.futures
from model_tools import handle_function_call
from tools.terminal_tool import cleanup_vm
from tools.browser_tool import cleanup_browser
logger = logging.getLogger(__name__)
# Thread pool for running sync tool calls that internally use asyncio.run()
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
def _run_tool_in_thread(tool_name: str, arguments: Dict[str, Any], task_id: str) -> str:
"""
Run a tool call in a thread pool executor so backends that use asyncio.run()
internally (modal, docker) get a clean event loop.
If we're already in an async context, uses run_in_executor.
If not (e.g., called from sync code), runs directly.
"""
try:
loop = asyncio.get_running_loop()
# We're in an async context -- need to run in thread
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
future = pool.submit(
handle_function_call, tool_name, arguments, task_id
)
return future.result(timeout=300)
except RuntimeError:
# No running event loop -- safe to call directly
return handle_function_call(tool_name, arguments, task_id)
class ToolContext:
"""
Open-ended access to all hermes-agent tools for a specific rollout.
Passed to compute_reward() so verifiers can use any tool they need:
terminal commands, file reads/writes, web searches, browser automation, etc.
All calls share the rollout's task_id for session isolation.
"""
def __init__(self, task_id: str):
self.task_id = task_id
# -------------------------------------------------------------------------
# Terminal tools
# -------------------------------------------------------------------------
def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
"""
Run a command in the rollout's terminal session.
Args:
command: Shell command to execute
timeout: Command timeout in seconds
Returns:
Dict with 'exit_code' (int) and 'output' (str)
"""
import os
backend = os.getenv("TERMINAL_ENV", "local")
logger.debug("ToolContext.terminal [%s backend] task=%s: %s", backend, self.task_id[:8], command[:100])
# Run in thread pool so modal/docker backends' asyncio.run() doesn't deadlock
result = _run_tool_in_thread(
"terminal",
{"command": command, "timeout": timeout},
self.task_id,
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"exit_code": -1, "output": result}
# -------------------------------------------------------------------------
# File tools
# -------------------------------------------------------------------------
def read_file(self, path: str) -> Dict[str, Any]:
"""
Read a file from the rollout's filesystem.
Args:
path: File path to read
Returns:
Dict with file content or error
"""
result = handle_function_call(
"read_file", {"path": path}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def write_file(self, path: str, content: str) -> Dict[str, Any]:
"""
Write a TEXT file in the rollout's filesystem.
Uses a shell heredoc under the hood, so this is only safe for text content.
For binary files (images, compiled artifacts, etc.), use upload_file() instead.
Args:
path: File path to write
content: Text content to write
Returns:
Dict with success status or error
"""
result = handle_function_call(
"write_file", {"path": path, "content": content}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def upload_file(self, local_path: str, remote_path: str) -> Dict[str, Any]:
"""
Upload a local file to the rollout's sandbox (binary-safe).
Unlike write_file() which passes content through a shell heredoc (text-only),
this method base64-encodes the file and decodes it inside the sandbox.
Safe for any file type: binaries, images, archives, etc.
For large files (>1MB), the content is split into chunks to avoid
hitting shell command-length limits.
Args:
local_path: Path to a local file on the host
remote_path: Destination path inside the sandbox
Returns:
Dict with 'exit_code' and 'output'
"""
import base64
from pathlib import Path as _Path
local = _Path(local_path)
if not local.exists():
return {"exit_code": -1, "output": f"Local file not found: {local_path}"}
raw = local.read_bytes()
b64 = base64.b64encode(raw).decode("ascii")
# Ensure parent directory exists in the sandbox
parent = str(_Path(remote_path).parent)
if parent not in (".", "/"):
self.terminal(f"mkdir -p {parent}", timeout=10)
# For small files, single command is fine
chunk_size = 60_000 # ~60KB per chunk (well within shell limits)
if len(b64) <= chunk_size:
result = self.terminal(
f"printf '%s' '{b64}' | base64 -d > {remote_path}",
timeout=30,
)
else:
# For larger files, write base64 in chunks then decode
tmp_b64 = "/tmp/_hermes_upload.b64"
self.terminal(f": > {tmp_b64}", timeout=5) # truncate
for i in range(0, len(b64), chunk_size):
chunk = b64[i : i + chunk_size]
self.terminal(f"printf '%s' '{chunk}' >> {tmp_b64}", timeout=15)
result = self.terminal(
f"base64 -d {tmp_b64} > {remote_path} && rm -f {tmp_b64}",
timeout=30,
)
return result
def upload_dir(self, local_dir: str, remote_dir: str) -> List[Dict[str, Any]]:
"""
Upload an entire local directory to the rollout's sandbox (binary-safe).
Recursively uploads all files, preserving directory structure.
Args:
local_dir: Path to a local directory on the host
remote_dir: Destination directory inside the sandbox
Returns:
List of results, one per file uploaded
"""
from pathlib import Path as _Path
local = _Path(local_dir)
if not local.exists() or not local.is_dir():
return [{"exit_code": -1, "output": f"Local directory not found: {local_dir}"}]
results = []
for file_path in sorted(local.rglob("*")):
if file_path.is_file():
relative = file_path.relative_to(local)
target = f"{remote_dir}/{relative}"
results.append(self.upload_file(str(file_path), target))
return results
def download_file(self, remote_path: str, local_path: str) -> Dict[str, Any]:
"""
Download a file from the rollout's sandbox to the host (binary-safe).
The inverse of upload_file(). Base64-encodes the file inside the sandbox,
reads the encoded data through the terminal, and decodes it locally.
Safe for any file type.
Args:
remote_path: Path to the file inside the sandbox
local_path: Destination path on the host
Returns:
Dict with 'success' (bool) and 'bytes' (int) or 'error' (str)
"""
import base64
from pathlib import Path as _Path
# Base64-encode the file inside the sandbox and capture output
result = self.terminal(
f"base64 {remote_path} 2>/dev/null",
timeout=30,
)
if result.get("exit_code", -1) != 0:
return {
"success": False,
"error": f"Failed to read remote file: {result.get('output', '')}",
}
b64_data = result.get("output", "").strip()
if not b64_data:
return {"success": False, "error": f"Remote file is empty or missing: {remote_path}"}
try:
raw = base64.b64decode(b64_data)
except Exception as e:
return {"success": False, "error": f"Base64 decode failed: {e}"}
# Write to local host filesystem
local = _Path(local_path)
local.parent.mkdir(parents=True, exist_ok=True)
local.write_bytes(raw)
return {"success": True, "bytes": len(raw)}
def download_dir(self, remote_dir: str, local_dir: str) -> List[Dict[str, Any]]:
"""
Download a directory from the rollout's sandbox to the host (binary-safe).
Lists all files in the remote directory, then downloads each one.
Preserves directory structure.
Args:
remote_dir: Path to the directory inside the sandbox
local_dir: Destination directory on the host
Returns:
List of results, one per file downloaded
"""
from pathlib import Path as _Path
# List files in the remote directory
ls_result = self.terminal(
f"find {remote_dir} -type f 2>/dev/null",
timeout=15,
)
if ls_result.get("exit_code", -1) != 0:
return [{"success": False, "error": f"Failed to list remote dir: {remote_dir}"}]
file_list = ls_result.get("output", "").strip()
if not file_list:
return [{"success": False, "error": f"Remote directory is empty or missing: {remote_dir}"}]
results = []
for remote_file in file_list.splitlines():
remote_file = remote_file.strip()
if not remote_file:
continue
# Compute the relative path to preserve directory structure
if remote_file.startswith(remote_dir):
relative = remote_file[len(remote_dir):].lstrip("/")
else:
relative = _Path(remote_file).name
local_file = str(_Path(local_dir) / relative)
results.append(self.download_file(remote_file, local_file))
return results
def search(self, query: str, path: str = ".") -> Dict[str, Any]:
"""
Search for text in the rollout's filesystem.
Args:
query: Search query
path: Directory to search in
Returns:
Dict with search results
"""
result = handle_function_call(
"search", {"query": query, "path": path}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Web tools
# -------------------------------------------------------------------------
def web_search(self, query: str) -> Dict[str, Any]:
"""
Search the web.
Args:
query: Search query
Returns:
Dict with search results
"""
result = handle_function_call("web_search", {"query": query})
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def web_extract(self, urls: List[str]) -> Dict[str, Any]:
"""
Extract content from URLs.
Args:
urls: List of URLs to extract content from
Returns:
Dict with extracted content
"""
result = handle_function_call("web_extract", {"urls": urls})
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Browser tools
# -------------------------------------------------------------------------
def browser_navigate(self, url: str) -> Dict[str, Any]:
"""
Navigate the rollout's browser session to a URL.
Args:
url: URL to navigate to
Returns:
Dict with page snapshot or error
"""
result = handle_function_call(
"browser_navigate", {"url": url}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def browser_snapshot(self) -> Dict[str, Any]:
"""
Take a snapshot of the current browser page.
Returns:
Dict with page content/accessibility snapshot
"""
result = handle_function_call(
"browser_snapshot", {}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Generic tool access
# -------------------------------------------------------------------------
def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
"""
Call any hermes-agent tool by name.
This is the generic escape hatch -- if a tool doesn't have a convenience
wrapper above, you can call it directly here.
Args:
tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
arguments: Dict of arguments for the tool
Returns:
Raw JSON string result from the tool
"""
return _run_tool_in_thread(tool_name, arguments, self.task_id)
# -------------------------------------------------------------------------
# Cleanup
# -------------------------------------------------------------------------
def cleanup(self):
"""
Release all resources (terminal VMs, browser sessions, background processes)
for this rollout.
Called automatically by the base environment via try/finally after
compute_reward() completes. You generally don't need to call this yourself.
"""
# Kill any background processes from this rollout (safety net)
try:
from tools.process_registry import process_registry
killed = process_registry.kill_all(task_id=self.task_id)
if killed:
logger.debug("Process cleanup for task %s: killed %d process(es)", self.task_id, killed)
except Exception as e:
logger.debug("Process cleanup for task %s: %s", self.task_id, e)
try:
cleanup_vm(self.task_id)
except Exception as e:
logger.debug("VM cleanup for task %s: %s", self.task_id, e)
# Suppress browser_tool's noisy debug prints during cleanup.
# The cleanup still runs (safe), it just doesn't spam the console.
_prev_quiet = os.environ.get("HERMES_QUIET")
os.environ["HERMES_QUIET"] = "1"
try:
cleanup_browser(self.task_id)
except Exception as e:
logger.debug("Browser cleanup for task %s: %s", self.task_id, e)
finally:
if _prev_quiet is None:
os.environ.pop("HERMES_QUIET", None)
else:
os.environ["HERMES_QUIET"] = _prev_quiet

View File

@@ -1,35 +0,0 @@
"""
Hermes Gateway - Multi-platform messaging integration.
This module provides a unified gateway for connecting the Hermes agent
to various messaging platforms (Telegram, Discord, WhatsApp) with:
- Session management (persistent conversations with reset policies)
- Dynamic context injection (agent knows where messages come from)
- Delivery routing (cron job outputs to appropriate channels)
- Platform-specific toolsets (different capabilities per platform)
"""
from .config import GatewayConfig, PlatformConfig, HomeChannel, load_gateway_config
from .session import (
SessionContext,
SessionStore,
SessionResetPolicy,
build_session_context_prompt,
)
from .delivery import DeliveryRouter, DeliveryTarget
__all__ = [
# Config
"GatewayConfig",
"PlatformConfig",
"HomeChannel",
"load_gateway_config",
# Session
"SessionContext",
"SessionStore",
"SessionResetPolicy",
"build_session_context_prompt",
# Delivery
"DeliveryRouter",
"DeliveryTarget",
]

View File

@@ -1,350 +0,0 @@
"""
Gateway configuration management.
Handles loading and validating configuration for:
- Connected platforms (Telegram, Discord, WhatsApp)
- Home channels for each platform
- Session reset policies
- Delivery preferences
"""
import os
import json
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from enum import Enum
class Platform(Enum):
"""Supported messaging platforms."""
LOCAL = "local"
TELEGRAM = "telegram"
DISCORD = "discord"
WHATSAPP = "whatsapp"
SLACK = "slack"
@dataclass
class HomeChannel:
"""
Default destination for a platform.
When a cron job specifies deliver="telegram" without a specific chat ID,
messages are sent to this home channel.
"""
platform: Platform
chat_id: str
name: str # Human-readable name for display
def to_dict(self) -> Dict[str, Any]:
return {
"platform": self.platform.value,
"chat_id": self.chat_id,
"name": self.name,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "HomeChannel":
return cls(
platform=Platform(data["platform"]),
chat_id=str(data["chat_id"]),
name=data.get("name", "Home"),
)
@dataclass
class SessionResetPolicy:
"""
Controls when sessions reset (lose context).
Modes:
- "daily": Reset at a specific hour each day
- "idle": Reset after N minutes of inactivity
- "both": Whichever triggers first (daily boundary OR idle timeout)
"""
mode: str = "both" # "daily", "idle", or "both"
at_hour: int = 4 # Hour for daily reset (0-23, local time)
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
def to_dict(self) -> Dict[str, Any]:
return {
"mode": self.mode,
"at_hour": self.at_hour,
"idle_minutes": self.idle_minutes,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SessionResetPolicy":
return cls(
mode=data.get("mode", "both"),
at_hour=data.get("at_hour", 4),
idle_minutes=data.get("idle_minutes", 1440),
)
@dataclass
class PlatformConfig:
"""Configuration for a single messaging platform."""
enabled: bool = False
token: Optional[str] = None # Bot token (Telegram, Discord)
api_key: Optional[str] = None # API key if different from token
home_channel: Optional[HomeChannel] = None
# Platform-specific settings
extra: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
result = {
"enabled": self.enabled,
"extra": self.extra,
}
if self.token:
result["token"] = self.token
if self.api_key:
result["api_key"] = self.api_key
if self.home_channel:
result["home_channel"] = self.home_channel.to_dict()
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PlatformConfig":
home_channel = None
if "home_channel" in data:
home_channel = HomeChannel.from_dict(data["home_channel"])
return cls(
enabled=data.get("enabled", False),
token=data.get("token"),
api_key=data.get("api_key"),
home_channel=home_channel,
extra=data.get("extra", {}),
)
@dataclass
class GatewayConfig:
"""
Main gateway configuration.
Manages all platform connections, session policies, and delivery settings.
"""
# Platform configurations
platforms: Dict[Platform, PlatformConfig] = field(default_factory=dict)
# Session reset policies by type
default_reset_policy: SessionResetPolicy = field(default_factory=SessionResetPolicy)
reset_by_type: Dict[str, SessionResetPolicy] = field(default_factory=dict)
reset_by_platform: Dict[Platform, SessionResetPolicy] = field(default_factory=dict)
# Reset trigger commands
reset_triggers: List[str] = field(default_factory=lambda: ["/new", "/reset"])
# Storage paths
sessions_dir: Path = field(default_factory=lambda: Path.home() / ".hermes" / "sessions")
# Delivery settings
always_log_local: bool = True # Always save cron outputs to local files
def get_connected_platforms(self) -> List[Platform]:
"""Return list of platforms that are enabled and configured."""
connected = []
for platform, config in self.platforms.items():
if config.enabled and (config.token or config.api_key):
connected.append(platform)
return connected
def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:
"""Get the home channel for a platform."""
config = self.platforms.get(platform)
if config:
return config.home_channel
return None
def get_reset_policy(
self,
platform: Optional[Platform] = None,
session_type: Optional[str] = None
) -> SessionResetPolicy:
"""
Get the appropriate reset policy for a session.
Priority: platform override > type override > default
"""
# Platform-specific override takes precedence
if platform and platform in self.reset_by_platform:
return self.reset_by_platform[platform]
# Type-specific override (dm, group, thread)
if session_type and session_type in self.reset_by_type:
return self.reset_by_type[session_type]
return self.default_reset_policy
def to_dict(self) -> Dict[str, Any]:
return {
"platforms": {
p.value: c.to_dict() for p, c in self.platforms.items()
},
"default_reset_policy": self.default_reset_policy.to_dict(),
"reset_by_type": {
k: v.to_dict() for k, v in self.reset_by_type.items()
},
"reset_by_platform": {
p.value: v.to_dict() for p, v in self.reset_by_platform.items()
},
"reset_triggers": self.reset_triggers,
"sessions_dir": str(self.sessions_dir),
"always_log_local": self.always_log_local,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "GatewayConfig":
platforms = {}
for platform_name, platform_data in data.get("platforms", {}).items():
try:
platform = Platform(platform_name)
platforms[platform] = PlatformConfig.from_dict(platform_data)
except ValueError:
pass # Skip unknown platforms
reset_by_type = {}
for type_name, policy_data in data.get("reset_by_type", {}).items():
reset_by_type[type_name] = SessionResetPolicy.from_dict(policy_data)
reset_by_platform = {}
for platform_name, policy_data in data.get("reset_by_platform", {}).items():
try:
platform = Platform(platform_name)
reset_by_platform[platform] = SessionResetPolicy.from_dict(policy_data)
except ValueError:
pass
default_policy = SessionResetPolicy()
if "default_reset_policy" in data:
default_policy = SessionResetPolicy.from_dict(data["default_reset_policy"])
sessions_dir = Path.home() / ".hermes" / "sessions"
if "sessions_dir" in data:
sessions_dir = Path(data["sessions_dir"])
return cls(
platforms=platforms,
default_reset_policy=default_policy,
reset_by_type=reset_by_type,
reset_by_platform=reset_by_platform,
reset_triggers=data.get("reset_triggers", ["/new", "/reset"]),
sessions_dir=sessions_dir,
always_log_local=data.get("always_log_local", True),
)
def load_gateway_config() -> GatewayConfig:
"""
Load gateway configuration from multiple sources.
Priority (highest to lowest):
1. Environment variables
2. ~/.hermes/gateway.json
3. cli-config.yaml gateway section
4. Defaults
"""
config = GatewayConfig()
# Try loading from ~/.hermes/gateway.json
gateway_config_path = Path.home() / ".hermes" / "gateway.json"
if gateway_config_path.exists():
try:
with open(gateway_config_path, "r") as f:
data = json.load(f)
config = GatewayConfig.from_dict(data)
except Exception as e:
print(f"[gateway] Warning: Failed to load {gateway_config_path}: {e}")
# Override with environment variables
_apply_env_overrides(config)
return config
def _apply_env_overrides(config: GatewayConfig) -> None:
"""Apply environment variable overrides to config."""
# Telegram
telegram_token = os.getenv("TELEGRAM_BOT_TOKEN")
if telegram_token:
if Platform.TELEGRAM not in config.platforms:
config.platforms[Platform.TELEGRAM] = PlatformConfig()
config.platforms[Platform.TELEGRAM].enabled = True
config.platforms[Platform.TELEGRAM].token = telegram_token
telegram_home = os.getenv("TELEGRAM_HOME_CHANNEL")
if telegram_home and Platform.TELEGRAM in config.platforms:
config.platforms[Platform.TELEGRAM].home_channel = HomeChannel(
platform=Platform.TELEGRAM,
chat_id=telegram_home,
name=os.getenv("TELEGRAM_HOME_CHANNEL_NAME", "Home"),
)
# Discord
discord_token = os.getenv("DISCORD_BOT_TOKEN")
if discord_token:
if Platform.DISCORD not in config.platforms:
config.platforms[Platform.DISCORD] = PlatformConfig()
config.platforms[Platform.DISCORD].enabled = True
config.platforms[Platform.DISCORD].token = discord_token
discord_home = os.getenv("DISCORD_HOME_CHANNEL")
if discord_home and Platform.DISCORD in config.platforms:
config.platforms[Platform.DISCORD].home_channel = HomeChannel(
platform=Platform.DISCORD,
chat_id=discord_home,
name=os.getenv("DISCORD_HOME_CHANNEL_NAME", "Home"),
)
# WhatsApp (typically uses different auth mechanism)
whatsapp_enabled = os.getenv("WHATSAPP_ENABLED", "").lower() in ("true", "1", "yes")
if whatsapp_enabled:
if Platform.WHATSAPP not in config.platforms:
config.platforms[Platform.WHATSAPP] = PlatformConfig()
config.platforms[Platform.WHATSAPP].enabled = True
# Slack
slack_token = os.getenv("SLACK_BOT_TOKEN")
if slack_token:
if Platform.SLACK not in config.platforms:
config.platforms[Platform.SLACK] = PlatformConfig()
config.platforms[Platform.SLACK].enabled = True
config.platforms[Platform.SLACK].token = slack_token
# Home channel
slack_home = os.getenv("SLACK_HOME_CHANNEL")
if slack_home:
config.platforms[Platform.SLACK].home_channel = HomeChannel(
platform=Platform.SLACK,
chat_id=slack_home,
name=os.getenv("SLACK_HOME_CHANNEL_NAME", ""),
)
# Session settings
idle_minutes = os.getenv("SESSION_IDLE_MINUTES")
if idle_minutes:
try:
config.default_reset_policy.idle_minutes = int(idle_minutes)
except ValueError:
pass
reset_hour = os.getenv("SESSION_RESET_HOUR")
if reset_hour:
try:
config.default_reset_policy.at_hour = int(reset_hour)
except ValueError:
pass
def save_gateway_config(config: GatewayConfig) -> None:
"""Save gateway configuration to ~/.hermes/gateway.json."""
gateway_config_path = Path.home() / ".hermes" / "gateway.json"
gateway_config_path.parent.mkdir(parents=True, exist_ok=True)
with open(gateway_config_path, "w") as f:
json.dump(config.to_dict(), f, indent=2)

View File

@@ -1,318 +0,0 @@
"""
Delivery routing for cron job outputs and agent responses.
Routes messages to the appropriate destination based on:
- Explicit targets (e.g., "telegram:123456789")
- Platform home channels (e.g., "telegram" → home channel)
- Origin (back to where the job was created)
- Local (always saved to files)
"""
import json
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import Dict, List, Optional, Any, Union
from enum import Enum
from .config import Platform, GatewayConfig, HomeChannel
from .session import SessionSource
@dataclass
class DeliveryTarget:
"""
A single delivery target.
Represents where a message should be sent:
- "origin" → back to source
- "local" → save to local files
- "telegram" → Telegram home channel
- "telegram:123456" → specific Telegram chat
"""
platform: Platform
chat_id: Optional[str] = None # None means use home channel
is_origin: bool = False
is_explicit: bool = False # True if chat_id was explicitly specified
@classmethod
def parse(cls, target: str, origin: Optional[SessionSource] = None) -> "DeliveryTarget":
"""
Parse a delivery target string.
Formats:
- "origin" → back to source
- "local" → local files only
- "telegram" → Telegram home channel
- "telegram:123456" → specific Telegram chat
"""
target = target.strip().lower()
if target == "origin":
if origin:
return cls(
platform=origin.platform,
chat_id=origin.chat_id,
is_origin=True,
)
else:
# Fallback to local if no origin
return cls(platform=Platform.LOCAL, is_origin=True)
if target == "local":
return cls(platform=Platform.LOCAL)
# Check for platform:chat_id format
if ":" in target:
platform_str, chat_id = target.split(":", 1)
try:
platform = Platform(platform_str)
return cls(platform=platform, chat_id=chat_id, is_explicit=True)
except ValueError:
# Unknown platform, treat as local
return cls(platform=Platform.LOCAL)
# Just a platform name (use home channel)
try:
platform = Platform(target)
return cls(platform=platform)
except ValueError:
# Unknown platform, treat as local
return cls(platform=Platform.LOCAL)
def to_string(self) -> str:
"""Convert back to string format."""
if self.is_origin:
return "origin"
if self.platform == Platform.LOCAL:
return "local"
if self.chat_id:
return f"{self.platform.value}:{self.chat_id}"
return self.platform.value
class DeliveryRouter:
"""
Routes messages to appropriate destinations.
Handles the logic of resolving delivery targets and dispatching
messages to the right platform adapters.
"""
def __init__(self, config: GatewayConfig, adapters: Dict[Platform, Any] = None):
"""
Initialize the delivery router.
Args:
config: Gateway configuration
adapters: Dict mapping platforms to their adapter instances
"""
self.config = config
self.adapters = adapters or {}
self.output_dir = Path.home() / ".hermes" / "cron" / "output"
def resolve_targets(
self,
deliver: Union[str, List[str]],
origin: Optional[SessionSource] = None
) -> List[DeliveryTarget]:
"""
Resolve delivery specification to concrete targets.
Args:
deliver: Delivery spec - "origin", "telegram", ["local", "discord"], etc.
origin: The source where the request originated (for "origin" target)
Returns:
List of resolved delivery targets
"""
if isinstance(deliver, str):
deliver = [deliver]
targets = []
seen_platforms = set()
for target_str in deliver:
target = DeliveryTarget.parse(target_str, origin)
# Resolve home channel if needed
if target.chat_id is None and target.platform != Platform.LOCAL:
home = self.config.get_home_channel(target.platform)
if home:
target.chat_id = home.chat_id
else:
# No home channel configured, skip this platform
continue
# Deduplicate
key = (target.platform, target.chat_id)
if key not in seen_platforms:
seen_platforms.add(key)
targets.append(target)
# Always include local if configured
if self.config.always_log_local:
local_key = (Platform.LOCAL, None)
if local_key not in seen_platforms:
targets.append(DeliveryTarget(platform=Platform.LOCAL))
return targets
async def deliver(
self,
content: str,
targets: List[DeliveryTarget],
job_id: Optional[str] = None,
job_name: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Deliver content to all specified targets.
Args:
content: The message/output to deliver
targets: List of delivery targets
job_id: Optional job ID (for cron jobs)
job_name: Optional job name
metadata: Additional metadata to include
Returns:
Dict with delivery results per target
"""
results = {}
for target in targets:
try:
if target.platform == Platform.LOCAL:
result = self._deliver_local(content, job_id, job_name, metadata)
else:
result = await self._deliver_to_platform(target, content, metadata)
results[target.to_string()] = {
"success": True,
"result": result
}
except Exception as e:
results[target.to_string()] = {
"success": False,
"error": str(e)
}
return results
def _deliver_local(
self,
content: str,
job_id: Optional[str],
job_name: Optional[str],
metadata: Optional[Dict[str, Any]]
) -> Dict[str, Any]:
"""Save content to local files."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if job_id:
output_path = self.output_dir / job_id / f"{timestamp}.md"
else:
output_path = self.output_dir / "misc" / f"{timestamp}.md"
output_path.parent.mkdir(parents=True, exist_ok=True)
# Build the output document
lines = []
if job_name:
lines.append(f"# {job_name}")
else:
lines.append("# Delivery Output")
lines.append("")
lines.append(f"**Timestamp:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
if job_id:
lines.append(f"**Job ID:** {job_id}")
if metadata:
for key, value in metadata.items():
lines.append(f"**{key}:** {value}")
lines.append("")
lines.append("---")
lines.append("")
lines.append(content)
output_path.write_text("\n".join(lines))
return {
"path": str(output_path),
"timestamp": timestamp
}
async def _deliver_to_platform(
self,
target: DeliveryTarget,
content: str,
metadata: Optional[Dict[str, Any]]
) -> Dict[str, Any]:
"""Deliver content to a messaging platform."""
adapter = self.adapters.get(target.platform)
if not adapter:
raise ValueError(f"No adapter configured for {target.platform.value}")
if not target.chat_id:
raise ValueError(f"No chat ID for {target.platform.value} delivery")
# Call the adapter's send method
# Adapters should implement: async def send(chat_id: str, content: str) -> Dict
return await adapter.send(target.chat_id, content, metadata=metadata)
def parse_deliver_spec(
deliver: Optional[Union[str, List[str]]],
origin: Optional[SessionSource] = None,
default: str = "origin"
) -> Union[str, List[str]]:
"""
Normalize a delivery specification.
If None or empty, returns the default.
"""
if not deliver:
return default
return deliver
def build_delivery_context_for_tool(
config: GatewayConfig,
origin: Optional[SessionSource] = None
) -> Dict[str, Any]:
"""
Build context for the schedule_cronjob tool to understand delivery options.
This is passed to the tool so it can validate and explain delivery targets.
"""
connected = config.get_connected_platforms()
options = {
"origin": {
"description": "Back to where this job was created",
"available": origin is not None,
},
"local": {
"description": "Save to local files only",
"available": True,
}
}
for platform in connected:
home = config.get_home_channel(platform)
options[platform.value] = {
"description": f"{platform.value.title()} home channel",
"available": True,
"home_channel": home.to_dict() if home else None,
}
return {
"origin": origin.to_dict() if origin else None,
"options": options,
"always_log_local": config.always_log_local,
}

View File

@@ -1,150 +0,0 @@
"""
Event Hook System
A lightweight event-driven system that fires handlers at key lifecycle points.
Hooks are discovered from ~/.hermes/hooks/ directories, each containing:
- HOOK.yaml (metadata: name, description, events list)
- handler.py (Python handler with async def handle(event_type, context))
Events:
- gateway:startup -- Gateway process starts
- session:start -- New session created
- session:reset -- User ran /new or /reset
- agent:start -- Agent begins processing a message
- agent:step -- Each turn in the tool-calling loop
- agent:end -- Agent finishes processing
- command:* -- Any slash command executed (wildcard match)
Errors in hooks are caught and logged but never block the main pipeline.
"""
import asyncio
import importlib.util
import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
import yaml
HOOKS_DIR = Path(os.path.expanduser("~/.hermes/hooks"))
class HookRegistry:
"""
Discovers, loads, and fires event hooks.
Usage:
registry = HookRegistry()
registry.discover_and_load()
await registry.emit("agent:start", {"platform": "telegram", ...})
"""
def __init__(self):
# event_type -> [handler_fn, ...]
self._handlers: Dict[str, List[Callable]] = {}
self._loaded_hooks: List[dict] = [] # metadata for listing
@property
def loaded_hooks(self) -> List[dict]:
"""Return metadata about all loaded hooks."""
return list(self._loaded_hooks)
def discover_and_load(self) -> None:
"""
Scan the hooks directory for hook directories and load their handlers.
Each hook directory must contain:
- HOOK.yaml with at least 'name' and 'events' keys
- handler.py with a top-level 'handle' function (sync or async)
"""
if not HOOKS_DIR.exists():
return
for hook_dir in sorted(HOOKS_DIR.iterdir()):
if not hook_dir.is_dir():
continue
manifest_path = hook_dir / "HOOK.yaml"
handler_path = hook_dir / "handler.py"
if not manifest_path.exists() or not handler_path.exists():
continue
try:
manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
if not manifest or not isinstance(manifest, dict):
print(f"[hooks] Skipping {hook_dir.name}: invalid HOOK.yaml", flush=True)
continue
hook_name = manifest.get("name", hook_dir.name)
events = manifest.get("events", [])
if not events:
print(f"[hooks] Skipping {hook_name}: no events declared", flush=True)
continue
# Dynamically load the handler module
spec = importlib.util.spec_from_file_location(
f"hermes_hook_{hook_name}", handler_path
)
if spec is None or spec.loader is None:
print(f"[hooks] Skipping {hook_name}: could not load handler.py", flush=True)
continue
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
handle_fn = getattr(module, "handle", None)
if handle_fn is None:
print(f"[hooks] Skipping {hook_name}: no 'handle' function found", flush=True)
continue
# Register the handler for each declared event
for event in events:
self._handlers.setdefault(event, []).append(handle_fn)
self._loaded_hooks.append({
"name": hook_name,
"description": manifest.get("description", ""),
"events": events,
"path": str(hook_dir),
})
print(f"[hooks] Loaded hook '{hook_name}' for events: {events}", flush=True)
except Exception as e:
print(f"[hooks] Error loading hook {hook_dir.name}: {e}", flush=True)
async def emit(self, event_type: str, context: Optional[Dict[str, Any]] = None) -> None:
"""
Fire all handlers registered for an event.
Supports wildcard matching: handlers registered for "command:*" will
fire for any "command:..." event. Handlers registered for a base type
like "agent" won't fire for "agent:start" -- only exact matches and
explicit wildcards.
Args:
event_type: The event identifier (e.g. "agent:start").
context: Optional dict with event-specific data.
"""
if context is None:
context = {}
# Collect handlers: exact match + wildcard match
handlers = list(self._handlers.get(event_type, []))
# Check for wildcard patterns (e.g., "command:*" matches "command:reset")
if ":" in event_type:
base = event_type.split(":")[0]
wildcard_key = f"{base}:*"
handlers.extend(self._handlers.get(wildcard_key, []))
for fn in handlers:
try:
result = fn(event_type, context)
# Support both sync and async handlers
if asyncio.iscoroutine(result):
await result
except Exception as e:
print(f"[hooks] Error in handler for '{event_type}': {e}", flush=True)

View File

@@ -1,282 +0,0 @@
"""
DM Pairing System
Code-based approval flow for authorizing new users on messaging platforms.
Instead of static allowlists with user IDs, unknown users receive a one-time
pairing code that the bot owner approves via the CLI.
Security features (based on OWASP + NIST SP 800-63-4 guidance):
- 8-char codes from 32-char unambiguous alphabet (no 0/O/1/I)
- Cryptographic randomness via secrets.choice()
- 1-hour code expiry
- Max 3 pending codes per platform
- Rate limiting: 1 request per user per 10 minutes
- Lockout after 5 failed approval attempts (1 hour)
- File permissions: chmod 0600 on all data files
- Codes are never logged to stdout
Storage: ~/.hermes/pairing/
"""
import json
import os
import secrets
import time
from pathlib import Path
from typing import Optional
# Unambiguous alphabet -- excludes 0/O, 1/I to prevent confusion
ALPHABET = "ABCDEFGHJKLMNPQRSTUVWXYZ23456789"
CODE_LENGTH = 8
# Timing constants
CODE_TTL_SECONDS = 3600 # Codes expire after 1 hour
RATE_LIMIT_SECONDS = 600 # 1 request per user per 10 minutes
LOCKOUT_SECONDS = 3600 # Lockout duration after too many failures
# Limits
MAX_PENDING_PER_PLATFORM = 3 # Max pending codes per platform
MAX_FAILED_ATTEMPTS = 5 # Failed approvals before lockout
PAIRING_DIR = Path(os.path.expanduser("~/.hermes/pairing"))
def _secure_write(path: Path, data: str) -> None:
"""Write data to file with restrictive permissions (owner read/write only)."""
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(data, encoding="utf-8")
try:
os.chmod(path, 0o600)
except OSError:
pass # Windows doesn't support chmod the same way
class PairingStore:
"""
Manages pairing codes and approved user lists.
Data files per platform:
- {platform}-pending.json : pending pairing requests
- {platform}-approved.json : approved (paired) users
- _rate_limits.json : rate limit tracking
"""
def __init__(self):
PAIRING_DIR.mkdir(parents=True, exist_ok=True)
def _pending_path(self, platform: str) -> Path:
return PAIRING_DIR / f"{platform}-pending.json"
def _approved_path(self, platform: str) -> Path:
return PAIRING_DIR / f"{platform}-approved.json"
def _rate_limit_path(self) -> Path:
return PAIRING_DIR / "_rate_limits.json"
def _load_json(self, path: Path) -> dict:
if path.exists():
try:
return json.loads(path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
return {}
return {}
def _save_json(self, path: Path, data: dict) -> None:
_secure_write(path, json.dumps(data, indent=2, ensure_ascii=False))
# ----- Approved users -----
def is_approved(self, platform: str, user_id: str) -> bool:
"""Check if a user is approved (paired) on a platform."""
approved = self._load_json(self._approved_path(platform))
return user_id in approved
def list_approved(self, platform: str = None) -> list:
"""List approved users, optionally filtered by platform."""
results = []
platforms = [platform] if platform else self._all_platforms("approved")
for p in platforms:
approved = self._load_json(self._approved_path(p))
for uid, info in approved.items():
results.append({"platform": p, "user_id": uid, **info})
return results
def _approve_user(self, platform: str, user_id: str, user_name: str = "") -> None:
"""Add a user to the approved list."""
approved = self._load_json(self._approved_path(platform))
approved[user_id] = {
"user_name": user_name,
"approved_at": time.time(),
}
self._save_json(self._approved_path(platform), approved)
def revoke(self, platform: str, user_id: str) -> bool:
"""Remove a user from the approved list. Returns True if found."""
path = self._approved_path(platform)
approved = self._load_json(path)
if user_id in approved:
del approved[user_id]
self._save_json(path, approved)
return True
return False
# ----- Pending codes -----
def generate_code(
self, platform: str, user_id: str, user_name: str = ""
) -> Optional[str]:
"""
Generate a pairing code for a new user.
Returns the code string, or None if:
- User is rate-limited (too recent request)
- Max pending codes reached for this platform
- User/platform is in lockout due to failed attempts
"""
self._cleanup_expired(platform)
# Check lockout
if self._is_locked_out(platform):
return None
# Check rate limit for this specific user
if self._is_rate_limited(platform, user_id):
return None
# Check max pending
pending = self._load_json(self._pending_path(platform))
if len(pending) >= MAX_PENDING_PER_PLATFORM:
return None
# Generate cryptographically random code
code = "".join(secrets.choice(ALPHABET) for _ in range(CODE_LENGTH))
# Store pending request
pending[code] = {
"user_id": user_id,
"user_name": user_name,
"created_at": time.time(),
}
self._save_json(self._pending_path(platform), pending)
# Record rate limit
self._record_rate_limit(platform, user_id)
return code
def approve_code(self, platform: str, code: str) -> Optional[dict]:
"""
Approve a pairing code. Adds the user to the approved list.
Returns {user_id, user_name} on success, None if code is invalid/expired.
"""
self._cleanup_expired(platform)
code = code.upper().strip()
pending = self._load_json(self._pending_path(platform))
if code not in pending:
self._record_failed_attempt(platform)
return None
entry = pending.pop(code)
self._save_json(self._pending_path(platform), pending)
# Add to approved list
self._approve_user(platform, entry["user_id"], entry.get("user_name", ""))
return {
"user_id": entry["user_id"],
"user_name": entry.get("user_name", ""),
}
def list_pending(self, platform: str = None) -> list:
"""List pending pairing requests, optionally filtered by platform."""
results = []
platforms = [platform] if platform else self._all_platforms("pending")
for p in platforms:
self._cleanup_expired(p)
pending = self._load_json(self._pending_path(p))
for code, info in pending.items():
age_min = int((time.time() - info["created_at"]) / 60)
results.append({
"platform": p,
"code": code,
"user_id": info["user_id"],
"user_name": info.get("user_name", ""),
"age_minutes": age_min,
})
return results
def clear_pending(self, platform: str = None) -> int:
"""Clear all pending requests. Returns count removed."""
count = 0
platforms = [platform] if platform else self._all_platforms("pending")
for p in platforms:
pending = self._load_json(self._pending_path(p))
count += len(pending)
self._save_json(self._pending_path(p), {})
return count
# ----- Rate limiting and lockout -----
def _is_rate_limited(self, platform: str, user_id: str) -> bool:
"""Check if a user has requested a code too recently."""
limits = self._load_json(self._rate_limit_path())
key = f"{platform}:{user_id}"
last_request = limits.get(key, 0)
return (time.time() - last_request) < RATE_LIMIT_SECONDS
def _record_rate_limit(self, platform: str, user_id: str) -> None:
"""Record the time of a pairing request for rate limiting."""
limits = self._load_json(self._rate_limit_path())
key = f"{platform}:{user_id}"
limits[key] = time.time()
self._save_json(self._rate_limit_path(), limits)
def _is_locked_out(self, platform: str) -> bool:
"""Check if a platform is in lockout due to failed approval attempts."""
limits = self._load_json(self._rate_limit_path())
lockout_key = f"_lockout:{platform}"
lockout_until = limits.get(lockout_key, 0)
return time.time() < lockout_until
def _record_failed_attempt(self, platform: str) -> None:
"""Record a failed approval attempt. Triggers lockout after MAX_FAILED_ATTEMPTS."""
limits = self._load_json(self._rate_limit_path())
fail_key = f"_failures:{platform}"
fails = limits.get(fail_key, 0) + 1
limits[fail_key] = fails
if fails >= MAX_FAILED_ATTEMPTS:
lockout_key = f"_lockout:{platform}"
limits[lockout_key] = time.time() + LOCKOUT_SECONDS
limits[fail_key] = 0 # Reset counter
print(f"[pairing] Platform {platform} locked out for {LOCKOUT_SECONDS}s "
f"after {MAX_FAILED_ATTEMPTS} failed attempts", flush=True)
self._save_json(self._rate_limit_path(), limits)
# ----- Cleanup -----
def _cleanup_expired(self, platform: str) -> None:
"""Remove expired pending codes."""
path = self._pending_path(platform)
pending = self._load_json(path)
now = time.time()
expired = [
code for code, info in pending.items()
if (now - info["created_at"]) > CODE_TTL_SECONDS
]
if expired:
for code in expired:
del pending[code]
self._save_json(path, pending)
def _all_platforms(self, suffix: str) -> list:
"""List all platforms that have data files of a given suffix."""
platforms = []
for f in PAIRING_DIR.iterdir():
if f.name.endswith(f"-{suffix}.json"):
platform = f.name.replace(f"-{suffix}.json", "")
if not platform.startswith("_"):
platforms.append(platform)
return platforms

View File

@@ -1,17 +0,0 @@
"""
Platform adapters for messaging integrations.
Each adapter handles:
- Receiving messages from a platform
- Sending messages/responses back
- Platform-specific authentication
- Message formatting and media handling
"""
from .base import BasePlatformAdapter, MessageEvent, SendResult
__all__ = [
"BasePlatformAdapter",
"MessageEvent",
"SendResult",
]

View File

@@ -1,691 +0,0 @@
"""
Base platform adapter interface.
All platform adapters (Telegram, Discord, WhatsApp) inherit from this
and implement the required methods.
"""
import asyncio
import os
import re
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Callable, Awaitable, Tuple
from enum import Enum
import sys
sys.path.insert(0, str(__file__).rsplit("/", 3)[0])
from gateway.config import Platform, PlatformConfig
from gateway.session import SessionSource
# ---------------------------------------------------------------------------
# Image cache utilities
#
# When users send images on messaging platforms, we download them to a local
# cache directory so they can be analyzed by the vision tool (which accepts
# local file paths). This avoids issues with ephemeral platform URLs
# (e.g. Telegram file URLs expire after ~1 hour).
# ---------------------------------------------------------------------------
# Default location: ~/.hermes/image_cache/
IMAGE_CACHE_DIR = Path(os.path.expanduser("~/.hermes/image_cache"))
def get_image_cache_dir() -> Path:
"""Return the image cache directory, creating it if it doesn't exist."""
IMAGE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
return IMAGE_CACHE_DIR
def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str:
"""
Save raw image bytes to the cache and return the absolute file path.
Args:
data: Raw image bytes.
ext: File extension including the dot (e.g. ".jpg", ".png").
Returns:
Absolute path to the cached image file as a string.
"""
cache_dir = get_image_cache_dir()
filename = f"img_{uuid.uuid4().hex[:12]}{ext}"
filepath = cache_dir / filename
filepath.write_bytes(data)
return str(filepath)
async def cache_image_from_url(url: str, ext: str = ".jpg") -> str:
"""
Download an image from a URL and save it to the local cache.
Uses httpx for async download with a reasonable timeout.
Args:
url: The HTTP/HTTPS URL to download from.
ext: File extension including the dot (e.g. ".jpg", ".png").
Returns:
Absolute path to the cached image file as a string.
"""
import httpx
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (compatible; HermesAgent/1.0)",
"Accept": "image/*,*/*;q=0.8",
},
)
response.raise_for_status()
return cache_image_from_bytes(response.content, ext)
def cleanup_image_cache(max_age_hours: int = 24) -> int:
"""
Delete cached images older than *max_age_hours*.
Returns the number of files removed.
"""
import time
cache_dir = get_image_cache_dir()
cutoff = time.time() - (max_age_hours * 3600)
removed = 0
for f in cache_dir.iterdir():
if f.is_file() and f.stat().st_mtime < cutoff:
try:
f.unlink()
removed += 1
except OSError:
pass
return removed
# ---------------------------------------------------------------------------
# Audio cache utilities
#
# Same pattern as image cache -- voice messages from platforms are downloaded
# here so the STT tool (OpenAI Whisper) can transcribe them from local files.
# ---------------------------------------------------------------------------
AUDIO_CACHE_DIR = Path(os.path.expanduser("~/.hermes/audio_cache"))
def get_audio_cache_dir() -> Path:
"""Return the audio cache directory, creating it if it doesn't exist."""
AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)
return AUDIO_CACHE_DIR
def cache_audio_from_bytes(data: bytes, ext: str = ".ogg") -> str:
"""
Save raw audio bytes to the cache and return the absolute file path.
Args:
data: Raw audio bytes.
ext: File extension including the dot (e.g. ".ogg", ".mp3").
Returns:
Absolute path to the cached audio file as a string.
"""
cache_dir = get_audio_cache_dir()
filename = f"audio_{uuid.uuid4().hex[:12]}{ext}"
filepath = cache_dir / filename
filepath.write_bytes(data)
return str(filepath)
async def cache_audio_from_url(url: str, ext: str = ".ogg") -> str:
"""
Download an audio file from a URL and save it to the local cache.
Args:
url: The HTTP/HTTPS URL to download from.
ext: File extension including the dot (e.g. ".ogg", ".mp3").
Returns:
Absolute path to the cached audio file as a string.
"""
import httpx
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (compatible; HermesAgent/1.0)",
"Accept": "audio/*,*/*;q=0.8",
},
)
response.raise_for_status()
return cache_audio_from_bytes(response.content, ext)
class MessageType(Enum):
"""Types of incoming messages."""
TEXT = "text"
PHOTO = "photo"
VIDEO = "video"
AUDIO = "audio"
VOICE = "voice"
DOCUMENT = "document"
STICKER = "sticker"
COMMAND = "command" # /command style
@dataclass
class MessageEvent:
"""
Incoming message from a platform.
Normalized representation that all adapters produce.
"""
# Message content
text: str
message_type: MessageType = MessageType.TEXT
# Source information
source: SessionSource = None
# Original platform data
raw_message: Any = None
message_id: Optional[str] = None
# Media attachments
media_urls: List[str] = field(default_factory=list)
media_types: List[str] = field(default_factory=list)
# Reply context
reply_to_message_id: Optional[str] = None
# Timestamps
timestamp: datetime = field(default_factory=datetime.now)
def is_command(self) -> bool:
"""Check if this is a command message (e.g., /new, /reset)."""
return self.text.startswith("/")
def get_command(self) -> Optional[str]:
"""Extract command name if this is a command message."""
if not self.is_command():
return None
# Split on space and get first word, strip the /
parts = self.text.split(maxsplit=1)
return parts[0][1:].lower() if parts else None
def get_command_args(self) -> str:
"""Get the arguments after a command."""
if not self.is_command():
return self.text
parts = self.text.split(maxsplit=1)
return parts[1] if len(parts) > 1 else ""
@dataclass
class SendResult:
"""Result of sending a message."""
success: bool
message_id: Optional[str] = None
error: Optional[str] = None
raw_response: Any = None
# Type for message handlers
MessageHandler = Callable[[MessageEvent], Awaitable[Optional[str]]]
class BasePlatformAdapter(ABC):
"""
Base class for platform adapters.
Subclasses implement platform-specific logic for:
- Connecting and authenticating
- Receiving messages
- Sending messages/responses
- Handling media
"""
def __init__(self, config: PlatformConfig, platform: Platform):
self.config = config
self.platform = platform
self._message_handler: Optional[MessageHandler] = None
self._running = False
# Track active message handlers per session for interrupt support
# Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
self._active_sessions: Dict[str, asyncio.Event] = {}
self._pending_messages: Dict[str, MessageEvent] = {}
@property
def name(self) -> str:
"""Human-readable name for this adapter."""
return self.platform.value.title()
@property
def is_connected(self) -> bool:
"""Check if adapter is currently connected."""
return self._running
def set_message_handler(self, handler: MessageHandler) -> None:
"""
Set the handler for incoming messages.
The handler receives a MessageEvent and should return
an optional response string.
"""
self._message_handler = handler
@abstractmethod
async def connect(self) -> bool:
"""
Connect to the platform and start receiving messages.
Returns True if connection was successful.
"""
pass
@abstractmethod
async def disconnect(self) -> None:
"""Disconnect from the platform."""
pass
@abstractmethod
async def send(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> SendResult:
"""
Send a message to a chat.
Args:
chat_id: The chat/channel ID to send to
content: Message content (may be markdown)
reply_to: Optional message ID to reply to
metadata: Additional platform-specific options
Returns:
SendResult with success status and message ID
"""
pass
async def send_typing(self, chat_id: str) -> None:
"""
Send a typing indicator.
Override in subclasses if the platform supports it.
"""
pass
async def send_image(
self,
chat_id: str,
image_url: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""
Send an image natively via the platform API.
Override in subclasses to send images as proper attachments
instead of plain-text URLs. Default falls back to sending the
URL as a text message.
"""
# Fallback: send URL as text (subclasses override for native images)
text = f"{caption}\n{image_url}" if caption else image_url
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
@staticmethod
def extract_images(content: str) -> Tuple[List[Tuple[str, str]], str]:
"""
Extract image URLs from markdown and HTML image tags in a response.
Finds patterns like:
- ![alt text](https://example.com/image.png)
- <img src="https://example.com/image.png">
- <img src="https://example.com/image.png"></img>
Args:
content: The response text to scan.
Returns:
Tuple of (list of (url, alt_text) pairs, cleaned content with image tags removed).
"""
images = []
cleaned = content
# Match markdown images: ![alt](url)
md_pattern = r'!\[([^\]]*)\]\((https?://[^\s\)]+)\)'
for match in re.finditer(md_pattern, content):
alt_text = match.group(1)
url = match.group(2)
# Only extract URLs that look like actual images
if any(url.lower().endswith(ext) or ext in url.lower() for ext in
['.png', '.jpg', '.jpeg', '.gif', '.webp', 'fal.media', 'fal-cdn', 'replicate.delivery']):
images.append((url, alt_text))
# Match HTML img tags: <img src="url"> or <img src="url"></img> or <img src="url"/>
html_pattern = r'<img\s+src=["\']?(https?://[^\s"\'<>]+)["\']?\s*/?>\s*(?:</img>)?'
for match in re.finditer(html_pattern, content):
url = match.group(1)
images.append((url, ""))
# Remove matched image tags from content if we found images
if images:
cleaned = re.sub(md_pattern, '', cleaned)
cleaned = re.sub(html_pattern, '', cleaned)
# Clean up leftover blank lines
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
return images, cleaned
async def send_voice(
self,
chat_id: str,
audio_path: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""
Send an audio file as a native voice message via the platform API.
Override in subclasses to send audio as voice bubbles (Telegram)
or file attachments (Discord). Default falls back to sending the
file path as text.
"""
text = f"🔊 Audio: {audio_path}"
if caption:
text = f"{caption}\n{text}"
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
@staticmethod
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
"""
Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
The TTS tool returns responses like:
[[audio_as_voice]]
MEDIA:/path/to/audio.ogg
Args:
content: The response text to scan.
Returns:
Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
"""
media = []
cleaned = content
# Check for [[audio_as_voice]] directive
has_voice_tag = "[[audio_as_voice]]" in content
cleaned = cleaned.replace("[[audio_as_voice]]", "")
# Extract MEDIA:<path> tags (path may contain spaces)
media_pattern = r'MEDIA:(\S+)'
for match in re.finditer(media_pattern, content):
path = match.group(1).strip()
if path:
media.append((path, has_voice_tag))
# Remove MEDIA tags from content
if media:
cleaned = re.sub(media_pattern, '', cleaned)
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
return media, cleaned
async def _keep_typing(self, chat_id: str, interval: float = 2.0) -> None:
"""
Continuously send typing indicator until cancelled.
Telegram/Discord typing status expires after ~5 seconds, so we refresh every 2
to recover quickly after progress messages interrupt it.
"""
try:
while True:
await self.send_typing(chat_id)
await asyncio.sleep(interval)
except asyncio.CancelledError:
pass # Normal cancellation when handler completes
async def handle_message(self, event: MessageEvent) -> None:
"""
Process an incoming message.
This method returns quickly by spawning background tasks.
This allows new messages to be processed even while an agent is running,
enabling interruption support.
"""
if not self._message_handler:
return
session_key = event.source.chat_id
# Check if there's already an active handler for this session
if session_key in self._active_sessions:
# Store this as a pending message - it will interrupt the running agent
print(f"[{self.name}] ⚡ New message while session {session_key} is active - triggering interrupt")
self._pending_messages[session_key] = event
# Signal the interrupt (the processing task checks this)
self._active_sessions[session_key].set()
return # Don't process now - will be handled after current task finishes
# Spawn background task to process this message
asyncio.create_task(self._process_message_background(event, session_key))
@staticmethod
def _get_human_delay() -> float:
"""
Return a random delay in seconds for human-like response pacing.
Reads from env vars:
HERMES_HUMAN_DELAY_MODE: "off" (default) | "natural" | "custom"
HERMES_HUMAN_DELAY_MIN_MS: minimum delay in ms (default 800, custom mode)
HERMES_HUMAN_DELAY_MAX_MS: maximum delay in ms (default 2500, custom mode)
"""
import random
mode = os.getenv("HERMES_HUMAN_DELAY_MODE", "off").lower()
if mode == "off":
return 0.0
min_ms = int(os.getenv("HERMES_HUMAN_DELAY_MIN_MS", "800"))
max_ms = int(os.getenv("HERMES_HUMAN_DELAY_MAX_MS", "2500"))
if mode == "natural":
min_ms, max_ms = 800, 2500
return random.uniform(min_ms / 1000.0, max_ms / 1000.0)
async def _process_message_background(self, event: MessageEvent, session_key: str) -> None:
"""Background task that actually processes the message."""
# Create interrupt event for this session
interrupt_event = asyncio.Event()
self._active_sessions[session_key] = interrupt_event
# Start continuous typing indicator (refreshes every 2 seconds)
typing_task = asyncio.create_task(self._keep_typing(event.source.chat_id))
try:
# Call the handler (this can take a while with tool calls)
response = await self._message_handler(event)
# Send response if any
if response:
# Extract MEDIA:<path> tags (from TTS tool) before other processing
media_files, response = self.extract_media(response)
# Extract image URLs and send them as native platform attachments
images, text_content = self.extract_images(response)
# Send the text portion first (if any remains after extractions)
if text_content:
result = await self.send(
chat_id=event.source.chat_id,
content=text_content,
reply_to=event.message_id
)
# Log send failures (don't raise - user already saw tool progress)
if not result.success:
print(f"[{self.name}] Failed to send response: {result.error}")
# Try sending without markdown as fallback
fallback_result = await self.send(
chat_id=event.source.chat_id,
content=f"(Response formatting failed, plain text:)\n\n{text_content[:3500]}",
reply_to=event.message_id
)
if not fallback_result.success:
print(f"[{self.name}] Fallback send also failed: {fallback_result.error}")
# Human-like pacing delay between text and media
human_delay = self._get_human_delay()
# Send extracted images as native attachments
for image_url, alt_text in images:
if human_delay > 0:
await asyncio.sleep(human_delay)
try:
img_result = await self.send_image(
chat_id=event.source.chat_id,
image_url=image_url,
caption=alt_text if alt_text else None,
)
if not img_result.success:
print(f"[{self.name}] Failed to send image: {img_result.error}")
except Exception as img_err:
print(f"[{self.name}] Error sending image: {img_err}")
# Send extracted audio/voice files as native attachments
for audio_path, is_voice in media_files:
if human_delay > 0:
await asyncio.sleep(human_delay)
try:
voice_result = await self.send_voice(
chat_id=event.source.chat_id,
audio_path=audio_path,
)
if not voice_result.success:
print(f"[{self.name}] Failed to send voice: {voice_result.error}")
except Exception as voice_err:
print(f"[{self.name}] Error sending voice: {voice_err}")
# Check if there's a pending message that was queued during our processing
if session_key in self._pending_messages:
pending_event = self._pending_messages.pop(session_key)
print(f"[{self.name}] 📨 Processing queued message from interrupt")
# Clean up current session before processing pending
if session_key in self._active_sessions:
del self._active_sessions[session_key]
typing_task.cancel()
try:
await typing_task
except asyncio.CancelledError:
pass
# Process pending message in new background task
await self._process_message_background(pending_event, session_key)
return # Already cleaned up
except Exception as e:
print(f"[{self.name}] Error handling message: {e}")
import traceback
traceback.print_exc()
finally:
# Stop typing indicator
typing_task.cancel()
try:
await typing_task
except asyncio.CancelledError:
pass
# Clean up session tracking
if session_key in self._active_sessions:
del self._active_sessions[session_key]
def has_pending_interrupt(self, session_key: str) -> bool:
"""Check if there's a pending interrupt for a session."""
return session_key in self._active_sessions and self._active_sessions[session_key].is_set()
def get_pending_message(self, session_key: str) -> Optional[MessageEvent]:
"""Get and clear any pending message for a session."""
return self._pending_messages.pop(session_key, None)
def build_source(
self,
chat_id: str,
chat_name: Optional[str] = None,
chat_type: str = "dm",
user_id: Optional[str] = None,
user_name: Optional[str] = None,
thread_id: Optional[str] = None
) -> SessionSource:
"""Helper to build a SessionSource for this platform."""
return SessionSource(
platform=self.platform,
chat_id=str(chat_id),
chat_name=chat_name,
chat_type=chat_type,
user_id=str(user_id) if user_id else None,
user_name=user_name,
thread_id=str(thread_id) if thread_id else None,
)
@abstractmethod
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""
Get information about a chat/channel.
Returns dict with at least:
- name: Chat name
- type: "dm", "group", "channel"
"""
pass
def format_message(self, content: str) -> str:
"""
Format a message for this platform.
Override in subclasses to handle platform-specific formatting
(e.g., Telegram MarkdownV2, Discord markdown).
Default implementation returns content as-is.
"""
return content
def truncate_message(self, content: str, max_length: int = 4096) -> List[str]:
"""
Split a long message into chunks.
Args:
content: The full message content
max_length: Maximum length per chunk (platform-specific)
Returns:
List of message chunks
"""
if len(content) <= max_length:
return [content]
chunks = []
while content:
if len(content) <= max_length:
chunks.append(content)
break
# Try to split at a newline
split_idx = content.rfind("\n", 0, max_length)
if split_idx == -1:
# No newline, split at space
split_idx = content.rfind(" ", 0, max_length)
if split_idx == -1:
# No space either, hard split
split_idx = max_length
chunks.append(content[:split_idx])
content = content[split_idx:].lstrip()
return chunks

View File

@@ -1,679 +0,0 @@
"""
Discord platform adapter.
Uses discord.py library for:
- Receiving messages from servers and DMs
- Sending responses back
- Handling threads and channels
"""
import asyncio
import os
from typing import Dict, List, Optional, Any
try:
import discord
from discord import Message as DiscordMessage, Intents
from discord.ext import commands
DISCORD_AVAILABLE = True
except ImportError:
DISCORD_AVAILABLE = False
discord = None
DiscordMessage = Any
Intents = Any
commands = None
import sys
sys.path.insert(0, str(__file__).rsplit("/", 3)[0])
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
BasePlatformAdapter,
MessageEvent,
MessageType,
SendResult,
cache_image_from_url,
cache_audio_from_url,
)
def check_discord_requirements() -> bool:
"""Check if Discord dependencies are available."""
return DISCORD_AVAILABLE
class DiscordAdapter(BasePlatformAdapter):
"""
Discord bot adapter.
Handles:
- Receiving messages from servers and DMs
- Sending responses with Discord markdown
- Thread support
- Native slash commands (/ask, /reset, /status, /stop)
- Button-based exec approvals
- Auto-threading for long conversations
- Reaction-based feedback
"""
# Discord message limits
MAX_MESSAGE_LENGTH = 2000
def __init__(self, config: PlatformConfig):
super().__init__(config, Platform.DISCORD)
self._client: Optional[commands.Bot] = None
self._ready_event = asyncio.Event()
self._allowed_user_ids: set = set() # For button approval authorization
async def connect(self) -> bool:
"""Connect to Discord and start receiving events."""
if not DISCORD_AVAILABLE:
print(f"[{self.name}] discord.py not installed. Run: pip install discord.py")
return False
if not self.config.token:
print(f"[{self.name}] No bot token configured")
return False
try:
# Set up intents
intents = Intents.default()
intents.message_content = True
intents.dm_messages = True
intents.guild_messages = True
# Create bot
self._client = commands.Bot(
command_prefix="!", # Not really used, we handle raw messages
intents=intents,
)
# Parse allowed user IDs for button authorization
allowed_env = os.getenv("DISCORD_ALLOWED_USERS", "")
if allowed_env:
self._allowed_user_ids = {
uid.strip() for uid in allowed_env.split(",") if uid.strip()
}
# Register event handlers
@self._client.event
async def on_ready():
print(f"[{self.name}] Connected as {self._client.user}")
# Sync slash commands with Discord
try:
synced = await self._client.tree.sync()
print(f"[{self.name}] Synced {len(synced)} slash command(s)")
except Exception as e:
print(f"[{self.name}] Slash command sync failed: {e}")
self._ready_event.set()
@self._client.event
async def on_message(message: DiscordMessage):
# Ignore bot's own messages
if message.author == self._client.user:
return
await self._handle_message(message)
# Register slash commands
self._register_slash_commands()
# Start the bot in background
asyncio.create_task(self._client.start(self.config.token))
# Wait for ready
await asyncio.wait_for(self._ready_event.wait(), timeout=30)
self._running = True
return True
except asyncio.TimeoutError:
print(f"[{self.name}] Timeout waiting for connection")
return False
except Exception as e:
print(f"[{self.name}] Failed to connect: {e}")
return False
async def disconnect(self) -> None:
"""Disconnect from Discord."""
if self._client:
try:
await self._client.close()
except Exception as e:
print(f"[{self.name}] Error during disconnect: {e}")
self._running = False
self._client = None
self._ready_event.clear()
print(f"[{self.name}] Disconnected")
async def send(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> SendResult:
"""Send a message to a Discord channel."""
if not self._client:
return SendResult(success=False, error="Not connected")
try:
# Get the channel
channel = self._client.get_channel(int(chat_id))
if not channel:
channel = await self._client.fetch_channel(int(chat_id))
if not channel:
return SendResult(success=False, error=f"Channel {chat_id} not found")
# Format and split message if needed
formatted = self.format_message(content)
chunks = self.truncate_message(formatted, self.MAX_MESSAGE_LENGTH)
message_ids = []
reference = None
if reply_to:
try:
ref_msg = await channel.fetch_message(int(reply_to))
reference = ref_msg
except Exception:
pass # Ignore if we can't find the referenced message
for i, chunk in enumerate(chunks):
msg = await channel.send(
content=chunk,
reference=reference if i == 0 else None,
)
message_ids.append(str(msg.id))
return SendResult(
success=True,
message_id=message_ids[0] if message_ids else None,
raw_response={"message_ids": message_ids}
)
except Exception as e:
return SendResult(success=False, error=str(e))
async def send_voice(
self,
chat_id: str,
audio_path: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send audio as a Discord file attachment."""
if not self._client:
return SendResult(success=False, error="Not connected")
try:
import io
channel = self._client.get_channel(int(chat_id))
if not channel:
channel = await self._client.fetch_channel(int(chat_id))
if not channel:
return SendResult(success=False, error=f"Channel {chat_id} not found")
if not os.path.exists(audio_path):
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
# Determine filename from path
filename = os.path.basename(audio_path)
with open(audio_path, "rb") as f:
file = discord.File(io.BytesIO(f.read()), filename=filename)
msg = await channel.send(
content=caption if caption else None,
file=file,
)
return SendResult(success=True, message_id=str(msg.id))
except Exception as e:
print(f"[{self.name}] Failed to send audio: {e}")
return await super().send_voice(chat_id, audio_path, caption, reply_to)
async def send_image(
self,
chat_id: str,
image_url: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send an image natively as a Discord file attachment."""
if not self._client:
return SendResult(success=False, error="Not connected")
try:
import aiohttp
channel = self._client.get_channel(int(chat_id))
if not channel:
channel = await self._client.fetch_channel(int(chat_id))
if not channel:
return SendResult(success=False, error=f"Channel {chat_id} not found")
# Download the image and send as a Discord file attachment
# (Discord renders attachments inline, unlike plain URLs)
async with aiohttp.ClientSession() as session:
async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
if resp.status != 200:
raise Exception(f"Failed to download image: HTTP {resp.status}")
image_data = await resp.read()
# Determine filename from URL or content type
content_type = resp.headers.get("content-type", "image/png")
ext = "png"
if "jpeg" in content_type or "jpg" in content_type:
ext = "jpg"
elif "gif" in content_type:
ext = "gif"
elif "webp" in content_type:
ext = "webp"
import io
file = discord.File(io.BytesIO(image_data), filename=f"image.{ext}")
msg = await channel.send(
content=caption if caption else None,
file=file,
)
return SendResult(success=True, message_id=str(msg.id))
except ImportError:
print(f"[{self.name}] aiohttp not installed, falling back to URL. Run: pip install aiohttp")
return await super().send_image(chat_id, image_url, caption, reply_to)
except Exception as e:
print(f"[{self.name}] Failed to send image attachment, falling back to URL: {e}")
return await super().send_image(chat_id, image_url, caption, reply_to)
async def send_typing(self, chat_id: str) -> None:
"""Send typing indicator."""
if self._client:
try:
channel = self._client.get_channel(int(chat_id))
if channel:
await channel.typing()
except Exception:
pass # Ignore typing indicator failures
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""Get information about a Discord channel."""
if not self._client:
return {"name": "Unknown", "type": "dm"}
try:
channel = self._client.get_channel(int(chat_id))
if not channel:
channel = await self._client.fetch_channel(int(chat_id))
if not channel:
return {"name": str(chat_id), "type": "dm"}
# Determine channel type
if isinstance(channel, discord.DMChannel):
chat_type = "dm"
name = channel.recipient.name if channel.recipient else str(chat_id)
elif isinstance(channel, discord.Thread):
chat_type = "thread"
name = channel.name
elif isinstance(channel, discord.TextChannel):
chat_type = "channel"
name = f"#{channel.name}"
if channel.guild:
name = f"{channel.guild.name} / {name}"
else:
chat_type = "channel"
name = getattr(channel, "name", str(chat_id))
return {
"name": name,
"type": chat_type,
"guild_id": str(channel.guild.id) if hasattr(channel, "guild") and channel.guild else None,
"guild_name": channel.guild.name if hasattr(channel, "guild") and channel.guild else None,
}
except Exception as e:
return {"name": str(chat_id), "type": "dm", "error": str(e)}
def format_message(self, content: str) -> str:
"""
Format message for Discord.
Discord uses its own markdown variant.
"""
# Discord markdown is fairly standard, no special escaping needed
return content
def _register_slash_commands(self) -> None:
"""Register Discord slash commands on the command tree."""
if not self._client:
return
tree = self._client.tree
@tree.command(name="ask", description="Ask Hermes a question")
@discord.app_commands.describe(question="Your question for Hermes")
async def slash_ask(interaction: discord.Interaction, question: str):
await interaction.response.defer()
event = self._build_slash_event(interaction, question)
await self.handle_message(event)
# The response is sent via the normal send() flow
# Send a followup to close the interaction if needed
try:
await interaction.followup.send("Processing complete~", ephemeral=True)
except Exception:
pass
@tree.command(name="reset", description="Reset your Hermes session")
async def slash_reset(interaction: discord.Interaction):
await interaction.response.defer(ephemeral=True)
event = self._build_slash_event(interaction, "/reset")
await self.handle_message(event)
try:
await interaction.followup.send("Session reset~", ephemeral=True)
except Exception:
pass
@tree.command(name="status", description="Show Hermes session status")
async def slash_status(interaction: discord.Interaction):
await interaction.response.defer(ephemeral=True)
event = self._build_slash_event(interaction, "/status")
await self.handle_message(event)
try:
await interaction.followup.send("Status sent~", ephemeral=True)
except Exception:
pass
@tree.command(name="stop", description="Stop the running Hermes agent")
async def slash_stop(interaction: discord.Interaction):
await interaction.response.defer(ephemeral=True)
event = self._build_slash_event(interaction, "/stop")
await self.handle_message(event)
try:
await interaction.followup.send("Stop requested~", ephemeral=True)
except Exception:
pass
def _build_slash_event(self, interaction: discord.Interaction, text: str) -> MessageEvent:
"""Build a MessageEvent from a Discord slash command interaction."""
is_dm = isinstance(interaction.channel, discord.DMChannel)
chat_type = "dm" if is_dm else "group"
chat_name = ""
if not is_dm and hasattr(interaction.channel, "name"):
chat_name = interaction.channel.name
if hasattr(interaction.channel, "guild") and interaction.channel.guild:
chat_name = f"{interaction.channel.guild.name} / #{chat_name}"
source = self.build_source(
chat_id=str(interaction.channel_id),
chat_name=chat_name,
chat_type=chat_type,
user_id=str(interaction.user.id),
user_name=interaction.user.display_name,
)
msg_type = MessageType.COMMAND if text.startswith("/") else MessageType.TEXT
return MessageEvent(
text=text,
message_type=msg_type,
source=source,
raw_message=interaction,
)
async def send_exec_approval(
self, chat_id: str, command: str, approval_id: str
) -> SendResult:
"""
Send a button-based exec approval prompt for a dangerous command.
Returns SendResult. The approval is resolved when a user clicks a button.
"""
if not self._client or not DISCORD_AVAILABLE:
return SendResult(success=False, error="Not connected")
try:
channel = self._client.get_channel(int(chat_id))
if not channel:
channel = await self._client.fetch_channel(int(chat_id))
embed = discord.Embed(
title="Command Approval Required",
description=f"```\n{command[:500]}\n```",
color=discord.Color.orange(),
)
embed.set_footer(text=f"Approval ID: {approval_id}")
view = ExecApprovalView(
approval_id=approval_id,
allowed_user_ids=self._allowed_user_ids,
)
msg = await channel.send(embed=embed, view=view)
return SendResult(success=True, message_id=str(msg.id))
except Exception as e:
return SendResult(success=False, error=str(e))
async def _handle_message(self, message: DiscordMessage) -> None:
"""Handle incoming Discord messages."""
# In server channels (not DMs), require the bot to be @mentioned
# UNLESS the channel is in the free-response list.
#
# Config:
# DISCORD_FREE_RESPONSE_CHANNELS: Comma-separated channel IDs where the
# bot responds to every message without needing a mention.
# DISCORD_REQUIRE_MENTION: Set to "false" to disable mention requirement
# globally (all channels become free-response). Default: "true".
if not isinstance(message.channel, discord.DMChannel):
# Check if this channel is in the free-response list
free_channels_raw = os.getenv("DISCORD_FREE_RESPONSE_CHANNELS", "")
free_channels = {ch.strip() for ch in free_channels_raw.split(",") if ch.strip()}
channel_id = str(message.channel.id)
# Global override: if DISCORD_REQUIRE_MENTION=false, all channels are free
require_mention = os.getenv("DISCORD_REQUIRE_MENTION", "true").lower() not in ("false", "0", "no")
is_free_channel = channel_id in free_channels
if require_mention and not is_free_channel:
# Must be @mentioned to respond
if self._client.user not in message.mentions:
return # Silently ignore messages that don't mention the bot
# Strip the bot mention from the message text so the agent sees clean input
if self._client.user and self._client.user in message.mentions:
message.content = message.content.replace(f"<@{self._client.user.id}>", "").strip()
message.content = message.content.replace(f"<@!{self._client.user.id}>", "").strip()
# Determine message type
msg_type = MessageType.TEXT
if message.content.startswith("/"):
msg_type = MessageType.COMMAND
elif message.attachments:
# Check attachment types
for att in message.attachments:
if att.content_type:
if att.content_type.startswith("image/"):
msg_type = MessageType.PHOTO
elif att.content_type.startswith("video/"):
msg_type = MessageType.VIDEO
elif att.content_type.startswith("audio/"):
msg_type = MessageType.AUDIO
else:
msg_type = MessageType.DOCUMENT
break
# Determine chat type
if isinstance(message.channel, discord.DMChannel):
chat_type = "dm"
chat_name = message.author.name
elif isinstance(message.channel, discord.Thread):
chat_type = "thread"
chat_name = message.channel.name
else:
chat_type = "group" # Treat server channels as groups
chat_name = getattr(message.channel, "name", str(message.channel.id))
if hasattr(message.channel, "guild") and message.channel.guild:
chat_name = f"{message.channel.guild.name} / #{chat_name}"
# Get thread ID if in a thread
thread_id = None
if isinstance(message.channel, discord.Thread):
thread_id = str(message.channel.id)
# Build source
source = self.build_source(
chat_id=str(message.channel.id),
chat_name=chat_name,
chat_type=chat_type,
user_id=str(message.author.id),
user_name=message.author.display_name,
thread_id=thread_id,
)
# Build media URLs -- download image attachments to local cache so the
# vision tool can access them reliably (Discord CDN URLs can expire).
media_urls = []
media_types = []
for att in message.attachments:
content_type = att.content_type or "unknown"
if content_type.startswith("image/"):
try:
# Determine extension from content type (image/png -> .png)
ext = "." + content_type.split("/")[-1].split(";")[0]
if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp"):
ext = ".jpg"
cached_path = await cache_image_from_url(att.url, ext=ext)
media_urls.append(cached_path)
media_types.append(content_type)
print(f"[Discord] Cached user image: {cached_path}", flush=True)
except Exception as e:
print(f"[Discord] Failed to cache image attachment: {e}", flush=True)
# Fall back to the CDN URL if caching fails
media_urls.append(att.url)
media_types.append(content_type)
elif content_type.startswith("audio/"):
try:
ext = "." + content_type.split("/")[-1].split(";")[0]
if ext not in (".ogg", ".mp3", ".wav", ".webm", ".m4a"):
ext = ".ogg"
cached_path = await cache_audio_from_url(att.url, ext=ext)
media_urls.append(cached_path)
media_types.append(content_type)
print(f"[Discord] Cached user audio: {cached_path}", flush=True)
except Exception as e:
print(f"[Discord] Failed to cache audio attachment: {e}", flush=True)
media_urls.append(att.url)
media_types.append(content_type)
else:
# Other attachments: keep the original URL
media_urls.append(att.url)
media_types.append(content_type)
event = MessageEvent(
text=message.content,
message_type=msg_type,
source=source,
raw_message=message,
message_id=str(message.id),
media_urls=media_urls,
media_types=media_types,
reply_to_message_id=str(message.reference.message_id) if message.reference else None,
timestamp=message.created_at,
)
await self.handle_message(event)
# ---------------------------------------------------------------------------
# Discord UI Components (outside the adapter class)
# ---------------------------------------------------------------------------
if DISCORD_AVAILABLE:
class ExecApprovalView(discord.ui.View):
"""
Interactive button view for exec approval of dangerous commands.
Shows three buttons: Allow Once (green), Always Allow (blue), Deny (red).
Only users in the allowed list can click. The view times out after 5 minutes.
"""
def __init__(self, approval_id: str, allowed_user_ids: set):
super().__init__(timeout=300) # 5-minute timeout
self.approval_id = approval_id
self.allowed_user_ids = allowed_user_ids
self.resolved = False
def _check_auth(self, interaction: discord.Interaction) -> bool:
"""Verify the user clicking is authorized."""
if not self.allowed_user_ids:
return True # No allowlist = anyone can approve
return str(interaction.user.id) in self.allowed_user_ids
async def _resolve(
self, interaction: discord.Interaction, action: str, color: discord.Color
):
"""Resolve the approval and update the message."""
if self.resolved:
await interaction.response.send_message(
"This approval has already been resolved~", ephemeral=True
)
return
if not self._check_auth(interaction):
await interaction.response.send_message(
"You're not authorized to approve commands~", ephemeral=True
)
return
self.resolved = True
# Update the embed with the decision
embed = interaction.message.embeds[0] if interaction.message.embeds else None
if embed:
embed.color = color
embed.set_footer(text=f"{action} by {interaction.user.display_name}")
# Disable all buttons
for child in self.children:
child.disabled = True
await interaction.response.edit_message(embed=embed, view=self)
# Store the approval decision for the gateway to pick up
try:
from tools.terminal_tool import _session_approved_patterns
if action == "allow_once":
pass # One-time approval handled by gateway
elif action == "allow_always":
_session_approved_patterns.add(self.approval_id)
except ImportError:
pass
@discord.ui.button(label="Allow Once", style=discord.ButtonStyle.green)
async def allow_once(
self, interaction: discord.Interaction, button: discord.ui.Button
):
await self._resolve(interaction, "allow_once", discord.Color.green())
@discord.ui.button(label="Always Allow", style=discord.ButtonStyle.blurple)
async def allow_always(
self, interaction: discord.Interaction, button: discord.ui.Button
):
await self._resolve(interaction, "allow_always", discord.Color.blue())
@discord.ui.button(label="Deny", style=discord.ButtonStyle.red)
async def deny(
self, interaction: discord.Interaction, button: discord.ui.Button
):
await self._resolve(interaction, "deny", discord.Color.red())
async def on_timeout(self):
"""Handle view timeout -- disable buttons and mark as expired."""
self.resolved = True
for child in self.children:
child.disabled = True

View File

@@ -1,374 +0,0 @@
"""
Slack platform adapter.
Uses slack-bolt (Python) with Socket Mode for:
- Receiving messages from channels and DMs
- Sending responses back
- Handling slash commands
- Thread support
"""
import asyncio
import os
from typing import Dict, List, Optional, Any
try:
from slack_bolt.async_app import AsyncApp
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
from slack_sdk.web.async_client import AsyncWebClient
SLACK_AVAILABLE = True
except ImportError:
SLACK_AVAILABLE = False
AsyncApp = Any
AsyncSocketModeHandler = Any
AsyncWebClient = Any
import sys
sys.path.insert(0, str(__file__).rsplit("/", 3)[0])
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
BasePlatformAdapter,
MessageEvent,
MessageType,
SendResult,
cache_image_from_url,
cache_audio_from_url,
)
def check_slack_requirements() -> bool:
"""Check if Slack dependencies are available."""
return SLACK_AVAILABLE
class SlackAdapter(BasePlatformAdapter):
"""
Slack bot adapter using Socket Mode.
Requires two tokens:
- SLACK_BOT_TOKEN (xoxb-...) for API calls
- SLACK_APP_TOKEN (xapp-...) for Socket Mode connection
Features:
- DMs and channel messages (mention-gated in channels)
- Thread support
- File/image/audio attachments
- Slash commands (/hermes)
- Typing indicators (not natively supported by Slack bots)
"""
MAX_MESSAGE_LENGTH = 4000 # Slack's limit is higher but mrkdwn can inflate
def __init__(self, config: PlatformConfig):
super().__init__(config, Platform.SLACK)
self._app: Optional[AsyncApp] = None
self._handler: Optional[AsyncSocketModeHandler] = None
self._bot_user_id: Optional[str] = None
async def connect(self) -> bool:
"""Connect to Slack via Socket Mode."""
if not SLACK_AVAILABLE:
print("[Slack] slack-bolt not installed. Run: pip install slack-bolt")
return False
bot_token = self.config.token
app_token = os.getenv("SLACK_APP_TOKEN")
if not bot_token:
print("[Slack] SLACK_BOT_TOKEN not set")
return False
if not app_token:
print("[Slack] SLACK_APP_TOKEN not set")
return False
try:
self._app = AsyncApp(token=bot_token)
# Get our own bot user ID for mention detection
auth_response = await self._app.client.auth_test()
self._bot_user_id = auth_response.get("user_id")
bot_name = auth_response.get("user", "unknown")
# Register message event handler
@self._app.event("message")
async def handle_message_event(event, say):
await self._handle_slack_message(event)
# Register slash command handler
@self._app.command("/hermes")
async def handle_hermes_command(ack, command):
await ack()
await self._handle_slash_command(command)
# Start Socket Mode handler in background
self._handler = AsyncSocketModeHandler(self._app, app_token)
asyncio.create_task(self._handler.start_async())
self._running = True
print(f"[Slack] Connected as @{bot_name} (Socket Mode)")
return True
except Exception as e:
print(f"[Slack] Connection failed: {e}")
return False
async def disconnect(self) -> None:
"""Disconnect from Slack."""
if self._handler:
await self._handler.close_async()
self._running = False
print("[Slack] Disconnected")
async def send(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> SendResult:
"""Send a message to a Slack channel or DM."""
if not self._app:
return SendResult(success=False, error="Not connected")
try:
kwargs = {
"channel": chat_id,
"text": content,
}
# Reply in thread if thread_ts is available
if reply_to:
kwargs["thread_ts"] = reply_to
elif metadata and metadata.get("thread_ts"):
kwargs["thread_ts"] = metadata["thread_ts"]
result = await self._app.client.chat_postMessage(**kwargs)
return SendResult(
success=True,
message_id=result.get("ts"),
raw_response=result,
)
except Exception as e:
print(f"[Slack] Send error: {e}")
return SendResult(success=False, error=str(e))
async def send_typing(self, chat_id: str) -> None:
"""Slack doesn't have a direct typing indicator API for bots."""
pass
async def send_image(
self,
chat_id: str,
image_url: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send an image to Slack by uploading the URL as a file."""
if not self._app:
return SendResult(success=False, error="Not connected")
try:
import httpx
# Download the image first
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(image_url)
response.raise_for_status()
result = await self._app.client.files_upload_v2(
channel=chat_id,
content=response.content,
filename="image.png",
initial_comment=caption or "",
thread_ts=reply_to,
)
return SendResult(success=True, raw_response=result)
except Exception as e:
# Fall back to sending the URL as text
text = f"{caption}\n{image_url}" if caption else image_url
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
async def send_voice(
self,
chat_id: str,
audio_path: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send an audio file to Slack."""
if not self._app:
return SendResult(success=False, error="Not connected")
try:
result = await self._app.client.files_upload_v2(
channel=chat_id,
file=audio_path,
filename=os.path.basename(audio_path),
initial_comment=caption or "",
thread_ts=reply_to,
)
return SendResult(success=True, raw_response=result)
except Exception as e:
return SendResult(success=False, error=str(e))
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""Get information about a Slack channel."""
if not self._app:
return {"name": chat_id, "type": "unknown"}
try:
result = await self._app.client.conversations_info(channel=chat_id)
channel = result.get("channel", {})
is_dm = channel.get("is_im", False)
return {
"name": channel.get("name", chat_id),
"type": "dm" if is_dm else "group",
}
except Exception:
return {"name": chat_id, "type": "unknown"}
# ----- Internal handlers -----
async def _handle_slack_message(self, event: dict) -> None:
"""Handle an incoming Slack message event."""
# Ignore bot messages (including our own)
if event.get("bot_id") or event.get("subtype") == "bot_message":
return
# Ignore message edits and deletions
subtype = event.get("subtype")
if subtype in ("message_changed", "message_deleted"):
return
text = event.get("text", "")
user_id = event.get("user", "")
channel_id = event.get("channel", "")
thread_ts = event.get("thread_ts") or event.get("ts")
ts = event.get("ts", "")
# Determine if this is a DM or channel message
channel_type = event.get("channel_type", "")
is_dm = channel_type == "im"
# In channels, only respond if bot is mentioned
if not is_dm and self._bot_user_id:
if f"<@{self._bot_user_id}>" not in text:
return
# Strip the bot mention from the text
text = text.replace(f"<@{self._bot_user_id}>", "").strip()
# Determine message type
msg_type = MessageType.TEXT
if text.startswith("/"):
msg_type = MessageType.COMMAND
# Handle file attachments
media_urls = []
media_types = []
files = event.get("files", [])
for f in files:
mimetype = f.get("mimetype", "unknown")
url = f.get("url_private_download") or f.get("url_private", "")
if mimetype.startswith("image/") and url:
try:
ext = "." + mimetype.split("/")[-1].split(";")[0]
if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp"):
ext = ".jpg"
# Slack private URLs require the bot token as auth header
cached = await self._download_slack_file(url, ext)
media_urls.append(cached)
media_types.append(mimetype)
msg_type = MessageType.PHOTO
except Exception as e:
print(f"[Slack] Failed to cache image: {e}", flush=True)
elif mimetype.startswith("audio/") and url:
try:
ext = "." + mimetype.split("/")[-1].split(";")[0]
if ext not in (".ogg", ".mp3", ".wav", ".webm", ".m4a"):
ext = ".ogg"
cached = await self._download_slack_file(url, ext, audio=True)
media_urls.append(cached)
media_types.append(mimetype)
msg_type = MessageType.VOICE
except Exception as e:
print(f"[Slack] Failed to cache audio: {e}", flush=True)
# Build source
source = self.build_source(
chat_id=channel_id,
chat_name=channel_id, # Will be resolved later if needed
chat_type="dm" if is_dm else "group",
user_id=user_id,
thread_id=thread_ts,
)
msg_event = MessageEvent(
text=text,
message_type=msg_type,
source=source,
raw_message=event,
message_id=ts,
media_urls=media_urls,
media_types=media_types,
reply_to_message_id=thread_ts if thread_ts != ts else None,
)
await self.handle_message(msg_event)
async def _handle_slash_command(self, command: dict) -> None:
"""Handle /hermes slash command."""
text = command.get("text", "").strip()
user_id = command.get("user_id", "")
channel_id = command.get("channel_id", "")
# Map common slash subcommands to gateway commands
if text in ("new", "reset"):
text = "/reset"
elif text == "status":
text = "/status"
elif text == "stop":
text = "/stop"
elif text:
pass # Treat as a regular question
else:
text = "/help"
source = self.build_source(
chat_id=channel_id,
chat_type="dm", # Slash commands are always in DM-like context
user_id=user_id,
)
event = MessageEvent(
text=text,
message_type=MessageType.COMMAND if text.startswith("/") else MessageType.TEXT,
source=source,
raw_message=command,
)
await self.handle_message(event)
async def _download_slack_file(self, url: str, ext: str, audio: bool = False) -> str:
"""Download a Slack file using the bot token for auth."""
import httpx
bot_token = self.config.token
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(
url,
headers={"Authorization": f"Bearer {bot_token}"},
)
response.raise_for_status()
if audio:
from gateway.platforms.base import cache_audio_from_bytes
return cache_audio_from_bytes(response.content, ext)
else:
from gateway.platforms.base import cache_image_from_bytes
return cache_image_from_bytes(response.content, ext)

View File

@@ -1,484 +0,0 @@
"""
Telegram platform adapter.
Uses python-telegram-bot library for:
- Receiving messages from users/groups
- Sending responses back
- Handling media and commands
"""
import asyncio
from typing import Dict, List, Optional, Any
try:
from telegram import Update, Bot, Message
from telegram.ext import (
Application,
CommandHandler,
MessageHandler as TelegramMessageHandler,
ContextTypes,
filters,
)
from telegram.constants import ParseMode, ChatType
TELEGRAM_AVAILABLE = True
except ImportError:
TELEGRAM_AVAILABLE = False
Update = Any
Bot = Any
Message = Any
Application = Any
ContextTypes = Any
import sys
sys.path.insert(0, str(__file__).rsplit("/", 3)[0])
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
BasePlatformAdapter,
MessageEvent,
MessageType,
SendResult,
cache_image_from_bytes,
cache_audio_from_bytes,
)
def check_telegram_requirements() -> bool:
"""Check if Telegram dependencies are available."""
return TELEGRAM_AVAILABLE
class TelegramAdapter(BasePlatformAdapter):
"""
Telegram bot adapter.
Handles:
- Receiving messages from users and groups
- Sending responses with Telegram markdown
- Forum topics (thread_id support)
- Media messages
"""
# Telegram message limits
MAX_MESSAGE_LENGTH = 4096
def __init__(self, config: PlatformConfig):
super().__init__(config, Platform.TELEGRAM)
self._app: Optional[Application] = None
self._bot: Optional[Bot] = None
async def connect(self) -> bool:
"""Connect to Telegram and start polling for updates."""
if not TELEGRAM_AVAILABLE:
print(f"[{self.name}] python-telegram-bot not installed. Run: pip install python-telegram-bot")
return False
if not self.config.token:
print(f"[{self.name}] No bot token configured")
return False
try:
# Build the application
self._app = Application.builder().token(self.config.token).build()
self._bot = self._app.bot
# Register handlers
self._app.add_handler(TelegramMessageHandler(
filters.TEXT & ~filters.COMMAND,
self._handle_text_message
))
self._app.add_handler(TelegramMessageHandler(
filters.COMMAND,
self._handle_command
))
self._app.add_handler(TelegramMessageHandler(
filters.PHOTO | filters.VIDEO | filters.AUDIO | filters.VOICE | filters.Document.ALL | filters.Sticker.ALL,
self._handle_media_message
))
# Start polling in background
await self._app.initialize()
await self._app.start()
await self._app.updater.start_polling(allowed_updates=Update.ALL_TYPES)
self._running = True
print(f"[{self.name}] Connected and polling for updates")
return True
except Exception as e:
print(f"[{self.name}] Failed to connect: {e}")
return False
async def disconnect(self) -> None:
"""Stop polling and disconnect."""
if self._app:
try:
await self._app.updater.stop()
await self._app.stop()
await self._app.shutdown()
except Exception as e:
print(f"[{self.name}] Error during disconnect: {e}")
self._running = False
self._app = None
self._bot = None
print(f"[{self.name}] Disconnected")
async def send(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> SendResult:
"""Send a message to a Telegram chat."""
if not self._bot:
return SendResult(success=False, error="Not connected")
try:
# Format and split message if needed
formatted = self.format_message(content)
chunks = self.truncate_message(formatted, self.MAX_MESSAGE_LENGTH)
message_ids = []
thread_id = metadata.get("thread_id") if metadata else None
for i, chunk in enumerate(chunks):
# Try Markdown first, fall back to plain text if it fails
try:
msg = await self._bot.send_message(
chat_id=int(chat_id),
text=chunk,
parse_mode=ParseMode.MARKDOWN,
reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
message_thread_id=int(thread_id) if thread_id else None,
)
except Exception as md_error:
# Markdown parsing failed, try plain text
if "parse" in str(md_error).lower() or "markdown" in str(md_error).lower():
msg = await self._bot.send_message(
chat_id=int(chat_id),
text=chunk,
parse_mode=None, # Plain text
reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
message_thread_id=int(thread_id) if thread_id else None,
)
else:
raise # Re-raise if not a parse error
message_ids.append(str(msg.message_id))
return SendResult(
success=True,
message_id=message_ids[0] if message_ids else None,
raw_response={"message_ids": message_ids}
)
except Exception as e:
return SendResult(success=False, error=str(e))
async def send_voice(
self,
chat_id: str,
audio_path: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send audio as a native Telegram voice message or audio file."""
if not self._bot:
return SendResult(success=False, error="Not connected")
try:
import os
if not os.path.exists(audio_path):
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
with open(audio_path, "rb") as audio_file:
# .ogg files -> send as voice (round playable bubble)
if audio_path.endswith(".ogg") or audio_path.endswith(".opus"):
msg = await self._bot.send_voice(
chat_id=int(chat_id),
voice=audio_file,
caption=caption[:1024] if caption else None,
reply_to_message_id=int(reply_to) if reply_to else None,
)
else:
# .mp3 and others -> send as audio file
msg = await self._bot.send_audio(
chat_id=int(chat_id),
audio=audio_file,
caption=caption[:1024] if caption else None,
reply_to_message_id=int(reply_to) if reply_to else None,
)
return SendResult(success=True, message_id=str(msg.message_id))
except Exception as e:
print(f"[{self.name}] Failed to send voice/audio: {e}")
return await super().send_voice(chat_id, audio_path, caption, reply_to)
async def send_image(
self,
chat_id: str,
image_url: str,
caption: Optional[str] = None,
reply_to: Optional[str] = None,
) -> SendResult:
"""Send an image natively as a Telegram photo."""
if not self._bot:
return SendResult(success=False, error="Not connected")
try:
# Telegram can send photos directly from URLs
msg = await self._bot.send_photo(
chat_id=int(chat_id),
photo=image_url,
caption=caption[:1024] if caption else None, # Telegram caption limit
reply_to_message_id=int(reply_to) if reply_to else None,
)
return SendResult(success=True, message_id=str(msg.message_id))
except Exception as e:
print(f"[{self.name}] Failed to send photo, falling back to URL: {e}")
# Fallback: send as text link
return await super().send_image(chat_id, image_url, caption, reply_to)
async def send_typing(self, chat_id: str) -> None:
"""Send typing indicator."""
if self._bot:
try:
await self._bot.send_chat_action(
chat_id=int(chat_id),
action="typing"
)
except Exception:
pass # Ignore typing indicator failures
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""Get information about a Telegram chat."""
if not self._bot:
return {"name": "Unknown", "type": "dm"}
try:
chat = await self._bot.get_chat(int(chat_id))
chat_type = "dm"
if chat.type == ChatType.GROUP:
chat_type = "group"
elif chat.type == ChatType.SUPERGROUP:
chat_type = "group"
if chat.is_forum:
chat_type = "forum"
elif chat.type == ChatType.CHANNEL:
chat_type = "channel"
return {
"name": chat.title or chat.full_name or str(chat_id),
"type": chat_type,
"username": chat.username,
"is_forum": getattr(chat, "is_forum", False),
}
except Exception as e:
return {"name": str(chat_id), "type": "dm", "error": str(e)}
def format_message(self, content: str) -> str:
"""
Format message for Telegram.
Telegram uses a subset of markdown. We'll use the simpler
Markdown mode (not MarkdownV2) for compatibility.
"""
# Basic escaping for Telegram Markdown
# In Markdown mode (not V2), only certain characters need escaping
return content
async def _handle_text_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle incoming text messages."""
if not update.message or not update.message.text:
return
event = self._build_message_event(update.message, MessageType.TEXT)
await self.handle_message(event)
async def _handle_command(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle incoming command messages."""
if not update.message or not update.message.text:
return
event = self._build_message_event(update.message, MessageType.COMMAND)
await self.handle_message(event)
async def _handle_media_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle incoming media messages, downloading images to local cache."""
if not update.message:
return
msg = update.message
# Determine media type
if msg.sticker:
msg_type = MessageType.STICKER
elif msg.photo:
msg_type = MessageType.PHOTO
elif msg.video:
msg_type = MessageType.VIDEO
elif msg.audio:
msg_type = MessageType.AUDIO
elif msg.voice:
msg_type = MessageType.VOICE
else:
msg_type = MessageType.DOCUMENT
event = self._build_message_event(msg, msg_type)
# Add caption as text
if msg.caption:
event.text = msg.caption
# Handle stickers: describe via vision tool with caching
if msg.sticker:
await self._handle_sticker(msg, event)
await self.handle_message(event)
return
# Download photo to local image cache so the vision tool can access it
# even after Telegram's ephemeral file URLs expire (~1 hour).
if msg.photo:
try:
# msg.photo is a list of PhotoSize sorted by size; take the largest
photo = msg.photo[-1]
file_obj = await photo.get_file()
# Download the image bytes directly into memory
image_bytes = await file_obj.download_as_bytearray()
# Determine extension from the file path if available
ext = ".jpg"
if file_obj.file_path:
for candidate in [".png", ".webp", ".gif", ".jpeg", ".jpg"]:
if file_obj.file_path.lower().endswith(candidate):
ext = candidate
break
# Save to cache and populate media_urls with the local path
cached_path = cache_image_from_bytes(bytes(image_bytes), ext=ext)
event.media_urls = [cached_path]
event.media_types = [f"image/{ext.lstrip('.')}"]
print(f"[Telegram] Cached user photo: {cached_path}", flush=True)
except Exception as e:
print(f"[Telegram] Failed to cache photo: {e}", flush=True)
# Download voice/audio messages to cache for STT transcription
if msg.voice:
try:
file_obj = await msg.voice.get_file()
audio_bytes = await file_obj.download_as_bytearray()
cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".ogg")
event.media_urls = [cached_path]
event.media_types = ["audio/ogg"]
print(f"[Telegram] Cached user voice: {cached_path}", flush=True)
except Exception as e:
print(f"[Telegram] Failed to cache voice: {e}", flush=True)
elif msg.audio:
try:
file_obj = await msg.audio.get_file()
audio_bytes = await file_obj.download_as_bytearray()
cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".mp3")
event.media_urls = [cached_path]
event.media_types = ["audio/mp3"]
print(f"[Telegram] Cached user audio: {cached_path}", flush=True)
except Exception as e:
print(f"[Telegram] Failed to cache audio: {e}", flush=True)
await self.handle_message(event)
async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None:
"""
Describe a Telegram sticker via vision analysis, with caching.
For static stickers (WEBP), we download, analyze with vision, and cache
the description by file_unique_id. For animated/video stickers, we inject
a placeholder noting the emoji.
"""
from gateway.sticker_cache import (
get_cached_description,
cache_sticker_description,
build_sticker_injection,
build_animated_sticker_injection,
STICKER_VISION_PROMPT,
)
sticker = msg.sticker
emoji = sticker.emoji or ""
set_name = sticker.set_name or ""
# Animated and video stickers can't be analyzed as static images
if sticker.is_animated or sticker.is_video:
event.text = build_animated_sticker_injection(emoji)
return
# Check the cache first
cached = get_cached_description(sticker.file_unique_id)
if cached:
event.text = build_sticker_injection(
cached["description"], cached.get("emoji", emoji), cached.get("set_name", set_name)
)
print(f"[Telegram] Sticker cache hit: {sticker.file_unique_id}", flush=True)
return
# Cache miss -- download and analyze
try:
file_obj = await sticker.get_file()
image_bytes = await file_obj.download_as_bytearray()
cached_path = cache_image_from_bytes(bytes(image_bytes), ext=".webp")
print(f"[Telegram] Analyzing sticker: {cached_path}", flush=True)
from tools.vision_tools import vision_analyze_tool
import json as _json
result_json = await vision_analyze_tool(
image_url=cached_path,
user_prompt=STICKER_VISION_PROMPT,
)
result = _json.loads(result_json)
if result.get("success"):
description = result.get("analysis", "a sticker")
cache_sticker_description(sticker.file_unique_id, description, emoji, set_name)
event.text = build_sticker_injection(description, emoji, set_name)
else:
# Vision failed -- use emoji as fallback
event.text = build_sticker_injection(
f"a sticker with emoji {emoji}" if emoji else "a sticker",
emoji, set_name,
)
except Exception as e:
print(f"[Telegram] Sticker analysis error: {e}", flush=True)
event.text = build_sticker_injection(
f"a sticker with emoji {emoji}" if emoji else "a sticker",
emoji, set_name,
)
def _build_message_event(self, message: Message, msg_type: MessageType) -> MessageEvent:
"""Build a MessageEvent from a Telegram message."""
chat = message.chat
user = message.from_user
# Determine chat type
chat_type = "dm"
if chat.type in (ChatType.GROUP, ChatType.SUPERGROUP):
chat_type = "group"
elif chat.type == ChatType.CHANNEL:
chat_type = "channel"
# Build source
source = self.build_source(
chat_id=str(chat.id),
chat_name=chat.title or (chat.full_name if hasattr(chat, "full_name") else None),
chat_type=chat_type,
user_id=str(user.id) if user else None,
user_name=user.full_name if user else None,
thread_id=str(message.message_thread_id) if message.message_thread_id else None,
)
return MessageEvent(
text=message.text or "",
message_type=msg_type,
source=source,
raw_message=message,
message_id=str(message.message_id),
timestamp=message.date,
)

View File

@@ -1,360 +0,0 @@
"""
WhatsApp platform adapter.
WhatsApp integration is more complex than Telegram/Discord because:
- No official bot API for personal accounts
- Business API requires Meta Business verification
- Most solutions use web-based automation
This adapter supports multiple backends:
1. WhatsApp Business API (requires Meta verification)
2. whatsapp-web.js (via Node.js subprocess) - for personal accounts
3. Baileys (via Node.js subprocess) - alternative for personal accounts
For simplicity, we'll implement a generic interface that can work
with different backends via a bridge pattern.
"""
import asyncio
import json
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any
import sys
sys.path.insert(0, str(__file__).rsplit("/", 3)[0])
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
BasePlatformAdapter,
MessageEvent,
MessageType,
SendResult,
cache_image_from_url,
cache_audio_from_url,
)
def check_whatsapp_requirements() -> bool:
"""
Check if WhatsApp dependencies are available.
WhatsApp requires a Node.js bridge for most implementations.
"""
# Check for Node.js
try:
result = subprocess.run(
["node", "--version"],
capture_output=True,
text=True,
timeout=5
)
return result.returncode == 0
except Exception:
return False
class WhatsAppAdapter(BasePlatformAdapter):
"""
WhatsApp adapter.
This implementation uses a simple HTTP bridge pattern where:
1. A Node.js process runs the WhatsApp Web client
2. Messages are forwarded via HTTP/IPC to this Python adapter
3. Responses are sent back through the bridge
The actual Node.js bridge implementation can vary:
- whatsapp-web.js based
- Baileys based
- Business API based
Configuration:
- bridge_script: Path to the Node.js bridge script
- bridge_port: Port for HTTP communication (default: 3000)
- session_path: Path to store WhatsApp session data
"""
# WhatsApp message limits
MAX_MESSAGE_LENGTH = 65536 # WhatsApp allows longer messages
def __init__(self, config: PlatformConfig):
super().__init__(config, Platform.WHATSAPP)
self._bridge_process: Optional[subprocess.Popen] = None
self._bridge_port: int = config.extra.get("bridge_port", 3000)
self._bridge_script: Optional[str] = config.extra.get("bridge_script")
self._session_path: Path = Path(config.extra.get(
"session_path",
Path.home() / ".hermes" / "whatsapp" / "session"
))
self._message_queue: asyncio.Queue = asyncio.Queue()
async def connect(self) -> bool:
"""
Start the WhatsApp bridge.
This launches the Node.js bridge process and waits for it to be ready.
"""
if not check_whatsapp_requirements():
print(f"[{self.name}] Node.js not found. WhatsApp requires Node.js.")
return False
if not self._bridge_script:
print(f"[{self.name}] No bridge script configured.")
print(f"[{self.name}] Set 'bridge_script' in whatsapp.extra config.")
print(f"[{self.name}] See docs/messaging.md for WhatsApp setup instructions.")
return False
bridge_path = Path(self._bridge_script)
if not bridge_path.exists():
print(f"[{self.name}] Bridge script not found: {bridge_path}")
return False
try:
# Ensure session directory exists
self._session_path.mkdir(parents=True, exist_ok=True)
# Start the bridge process
self._bridge_process = subprocess.Popen(
[
"node",
str(bridge_path),
"--port", str(self._bridge_port),
"--session", str(self._session_path),
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
# Wait for bridge to be ready (look for ready signal)
# This is a simplified version - real implementation would
# wait for an HTTP health check or specific stdout message
await asyncio.sleep(5)
if self._bridge_process.poll() is not None:
stderr = self._bridge_process.stderr.read() if self._bridge_process.stderr else ""
print(f"[{self.name}] Bridge process died: {stderr}")
return False
# Start message polling task
asyncio.create_task(self._poll_messages())
self._running = True
print(f"[{self.name}] Bridge started on port {self._bridge_port}")
print(f"[{self.name}] Scan QR code if prompted (check bridge output)")
return True
except Exception as e:
print(f"[{self.name}] Failed to start bridge: {e}")
return False
async def disconnect(self) -> None:
"""Stop the WhatsApp bridge."""
if self._bridge_process:
try:
self._bridge_process.terminate()
await asyncio.sleep(1)
if self._bridge_process.poll() is None:
self._bridge_process.kill()
except Exception as e:
print(f"[{self.name}] Error stopping bridge: {e}")
self._running = False
self._bridge_process = None
print(f"[{self.name}] Disconnected")
async def send(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> SendResult:
"""Send a message via the WhatsApp bridge."""
if not self._running:
return SendResult(success=False, error="Not connected")
try:
import aiohttp
async with aiohttp.ClientSession() as session:
payload = {
"chatId": chat_id,
"message": content,
}
if reply_to:
payload["replyTo"] = reply_to
async with session.post(
f"http://localhost:{self._bridge_port}/send",
json=payload,
timeout=aiohttp.ClientTimeout(total=30)
) as resp:
if resp.status == 200:
data = await resp.json()
return SendResult(
success=True,
message_id=data.get("messageId"),
raw_response=data
)
else:
error = await resp.text()
return SendResult(success=False, error=error)
except ImportError:
return SendResult(
success=False,
error="aiohttp not installed. Run: pip install aiohttp"
)
except Exception as e:
return SendResult(success=False, error=str(e))
async def send_typing(self, chat_id: str) -> None:
"""Send typing indicator via bridge."""
if not self._running:
return
try:
import aiohttp
async with aiohttp.ClientSession() as session:
await session.post(
f"http://localhost:{self._bridge_port}/typing",
json={"chatId": chat_id},
timeout=aiohttp.ClientTimeout(total=5)
)
except Exception:
pass # Ignore typing indicator failures
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
"""Get information about a WhatsApp chat."""
if not self._running:
return {"name": "Unknown", "type": "dm"}
try:
import aiohttp
async with aiohttp.ClientSession() as session:
async with session.get(
f"http://localhost:{self._bridge_port}/chat/{chat_id}",
timeout=aiohttp.ClientTimeout(total=10)
) as resp:
if resp.status == 200:
data = await resp.json()
return {
"name": data.get("name", chat_id),
"type": "group" if data.get("isGroup") else "dm",
"participants": data.get("participants", []),
}
except Exception:
pass
return {"name": chat_id, "type": "dm"}
async def _poll_messages(self) -> None:
"""Poll the bridge for incoming messages."""
try:
import aiohttp
except ImportError:
print(f"[{self.name}] aiohttp not installed, message polling disabled")
return
while self._running:
try:
async with aiohttp.ClientSession() as session:
async with session.get(
f"http://localhost:{self._bridge_port}/messages",
timeout=aiohttp.ClientTimeout(total=30)
) as resp:
if resp.status == 200:
messages = await resp.json()
for msg_data in messages:
event = await self._build_message_event(msg_data)
if event:
await self.handle_message(event)
except asyncio.CancelledError:
break
except Exception as e:
print(f"[{self.name}] Poll error: {e}")
await asyncio.sleep(5)
await asyncio.sleep(1) # Poll interval
async def _build_message_event(self, data: Dict[str, Any]) -> Optional[MessageEvent]:
"""Build a MessageEvent from bridge message data, downloading images to cache."""
try:
# Determine message type
msg_type = MessageType.TEXT
if data.get("hasMedia"):
media_type = data.get("mediaType", "")
if "image" in media_type:
msg_type = MessageType.PHOTO
elif "video" in media_type:
msg_type = MessageType.VIDEO
elif "audio" in media_type or "ptt" in media_type: # ptt = voice note
msg_type = MessageType.VOICE
else:
msg_type = MessageType.DOCUMENT
# Determine chat type
is_group = data.get("isGroup", False)
chat_type = "group" if is_group else "dm"
# Build source
source = self.build_source(
chat_id=data.get("chatId", ""),
chat_name=data.get("chatName"),
chat_type=chat_type,
user_id=data.get("senderId"),
user_name=data.get("senderName"),
)
# Download image media URLs to the local cache so the vision tool
# can access them reliably regardless of URL expiration.
raw_urls = data.get("mediaUrls", [])
cached_urls = []
media_types = []
for url in raw_urls:
if msg_type == MessageType.PHOTO and url.startswith(("http://", "https://")):
try:
cached_path = await cache_image_from_url(url, ext=".jpg")
cached_urls.append(cached_path)
media_types.append("image/jpeg")
print(f"[{self.name}] Cached user image: {cached_path}", flush=True)
except Exception as e:
print(f"[{self.name}] Failed to cache image: {e}", flush=True)
cached_urls.append(url)
media_types.append("image/jpeg")
elif msg_type == MessageType.VOICE and url.startswith(("http://", "https://")):
try:
cached_path = await cache_audio_from_url(url, ext=".ogg")
cached_urls.append(cached_path)
media_types.append("audio/ogg")
print(f"[{self.name}] Cached user voice: {cached_path}", flush=True)
except Exception as e:
print(f"[{self.name}] Failed to cache voice: {e}", flush=True)
cached_urls.append(url)
media_types.append("audio/ogg")
else:
cached_urls.append(url)
media_types.append("unknown")
return MessageEvent(
text=data.get("body", ""),
message_type=msg_type,
source=source,
raw_message=data,
message_id=data.get("messageId"),
media_urls=cached_urls,
media_types=media_types,
)
except Exception as e:
print(f"[{self.name}] Error building event: {e}")
return None
# Note: A reference Node.js bridge script would be provided in scripts/whatsapp-bridge/
# It would use whatsapp-web.js or Baileys to:
# 1. Handle WhatsApp Web authentication (QR code)
# 2. Listen for incoming messages
# 3. Expose HTTP endpoints for send/receive/status

File diff suppressed because it is too large Load Diff

View File

@@ -1,533 +0,0 @@
"""
Session management for the gateway.
Handles:
- Session context tracking (where messages come from)
- Session storage (conversations persisted to disk)
- Reset policy evaluation (when to start fresh)
- Dynamic system prompt injection (agent knows its context)
"""
import os
import json
import uuid
from pathlib import Path
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from .config import (
Platform,
GatewayConfig,
SessionResetPolicy,
HomeChannel,
)
@dataclass
class SessionSource:
"""
Describes where a message originated from.
This information is used to:
1. Route responses back to the right place
2. Inject context into the system prompt
3. Track origin for cron job delivery
"""
platform: Platform
chat_id: str
chat_name: Optional[str] = None
chat_type: str = "dm" # "dm", "group", "channel", "thread"
user_id: Optional[str] = None
user_name: Optional[str] = None
thread_id: Optional[str] = None # For forum topics, Discord threads, etc.
@property
def description(self) -> str:
"""Human-readable description of the source."""
if self.platform == Platform.LOCAL:
return "CLI terminal"
parts = []
if self.chat_type == "dm":
parts.append(f"DM with {self.user_name or self.user_id or 'user'}")
elif self.chat_type == "group":
parts.append(f"group: {self.chat_name or self.chat_id}")
elif self.chat_type == "channel":
parts.append(f"channel: {self.chat_name or self.chat_id}")
else:
parts.append(self.chat_name or self.chat_id)
if self.thread_id:
parts.append(f"thread: {self.thread_id}")
return ", ".join(parts)
def to_dict(self) -> Dict[str, Any]:
return {
"platform": self.platform.value,
"chat_id": self.chat_id,
"chat_name": self.chat_name,
"chat_type": self.chat_type,
"user_id": self.user_id,
"user_name": self.user_name,
"thread_id": self.thread_id,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SessionSource":
return cls(
platform=Platform(data["platform"]),
chat_id=str(data["chat_id"]),
chat_name=data.get("chat_name"),
chat_type=data.get("chat_type", "dm"),
user_id=data.get("user_id"),
user_name=data.get("user_name"),
thread_id=data.get("thread_id"),
)
@classmethod
def local_cli(cls) -> "SessionSource":
"""Create a source representing the local CLI."""
return cls(
platform=Platform.LOCAL,
chat_id="cli",
chat_name="CLI terminal",
chat_type="dm",
)
@dataclass
class SessionContext:
"""
Full context for a session, used for dynamic system prompt injection.
The agent receives this information to understand:
- Where messages are coming from
- What platforms are available
- Where it can deliver scheduled task outputs
"""
source: SessionSource
connected_platforms: List[Platform]
home_channels: Dict[Platform, HomeChannel]
# Session metadata
session_key: str = ""
session_id: str = ""
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
def to_dict(self) -> Dict[str, Any]:
return {
"source": self.source.to_dict(),
"connected_platforms": [p.value for p in self.connected_platforms],
"home_channels": {
p.value: hc.to_dict() for p, hc in self.home_channels.items()
},
"session_key": self.session_key,
"session_id": self.session_id,
"created_at": self.created_at.isoformat() if self.created_at else None,
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
}
def build_session_context_prompt(context: SessionContext) -> str:
"""
Build the dynamic system prompt section that tells the agent about its context.
This is injected into the system prompt so the agent knows:
- Where messages are coming from
- What platforms are connected
- Where it can deliver scheduled task outputs
"""
lines = [
"## Current Session Context",
"",
]
# Source info
platform_name = context.source.platform.value.title()
if context.source.platform == Platform.LOCAL:
lines.append(f"**Source:** {platform_name} (the machine running this agent)")
else:
lines.append(f"**Source:** {platform_name} ({context.source.description})")
# Connected platforms
platforms_list = ["local (files on this machine)"]
for p in context.connected_platforms:
if p != Platform.LOCAL:
platforms_list.append(f"{p.value}: Connected ✓")
lines.append(f"**Connected Platforms:** {', '.join(platforms_list)}")
# Home channels
if context.home_channels:
lines.append("")
lines.append("**Home Channels (default destinations):**")
for platform, home in context.home_channels.items():
lines.append(f" - {platform.value}: {home.name} (ID: {home.chat_id})")
# Delivery options for scheduled tasks
lines.append("")
lines.append("**Delivery options for scheduled tasks:**")
# Origin delivery
if context.source.platform == Platform.LOCAL:
lines.append("- `\"origin\"` → Local output (saved to files)")
else:
lines.append(f"- `\"origin\"` → Back to this chat ({context.source.chat_name or context.source.chat_id})")
# Local always available
lines.append("- `\"local\"` → Save to local files only (~/.hermes/cron/output/)")
# Platform home channels
for platform, home in context.home_channels.items():
lines.append(f"- `\"{platform.value}\"` → Home channel ({home.name})")
# Note about explicit targeting
lines.append("")
lines.append("*For explicit targeting, use `\"platform:chat_id\"` format if the user provides a specific chat ID.*")
return "\n".join(lines)
@dataclass
class SessionEntry:
"""
Entry in the session store.
Maps a session key to its current session ID and metadata.
"""
session_key: str
session_id: str
created_at: datetime
updated_at: datetime
# Origin metadata for delivery routing
origin: Optional[SessionSource] = None
# Display metadata
display_name: Optional[str] = None
platform: Optional[Platform] = None
chat_type: str = "dm"
# Token tracking
input_tokens: int = 0
output_tokens: int = 0
total_tokens: int = 0
def to_dict(self) -> Dict[str, Any]:
result = {
"session_key": self.session_key,
"session_id": self.session_id,
"created_at": self.created_at.isoformat(),
"updated_at": self.updated_at.isoformat(),
"display_name": self.display_name,
"platform": self.platform.value if self.platform else None,
"chat_type": self.chat_type,
"input_tokens": self.input_tokens,
"output_tokens": self.output_tokens,
"total_tokens": self.total_tokens,
}
if self.origin:
result["origin"] = self.origin.to_dict()
return result
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SessionEntry":
origin = None
if "origin" in data and data["origin"]:
origin = SessionSource.from_dict(data["origin"])
platform = None
if data.get("platform"):
try:
platform = Platform(data["platform"])
except ValueError:
pass
return cls(
session_key=data["session_key"],
session_id=data["session_id"],
created_at=datetime.fromisoformat(data["created_at"]),
updated_at=datetime.fromisoformat(data["updated_at"]),
origin=origin,
display_name=data.get("display_name"),
platform=platform,
chat_type=data.get("chat_type", "dm"),
input_tokens=data.get("input_tokens", 0),
output_tokens=data.get("output_tokens", 0),
total_tokens=data.get("total_tokens", 0),
)
class SessionStore:
"""
Manages session storage and retrieval.
Sessions are stored in:
- sessions.json: Index mapping session keys to session IDs
- {session_id}.jsonl: Conversation transcripts
"""
def __init__(self, sessions_dir: Path, config: GatewayConfig,
has_active_processes_fn=None):
self.sessions_dir = sessions_dir
self.config = config
self._entries: Dict[str, SessionEntry] = {}
self._loaded = False
# Optional callback to check if a session has active background processes.
# When set, sessions with running processes are exempt from reset.
self._has_active_processes_fn = has_active_processes_fn
def _ensure_loaded(self) -> None:
"""Load sessions from disk if not already loaded."""
if self._loaded:
return
self.sessions_dir.mkdir(parents=True, exist_ok=True)
sessions_file = self.sessions_dir / "sessions.json"
if sessions_file.exists():
try:
with open(sessions_file, "r") as f:
data = json.load(f)
for key, entry_data in data.items():
self._entries[key] = SessionEntry.from_dict(entry_data)
except Exception as e:
print(f"[gateway] Warning: Failed to load sessions: {e}")
self._loaded = True
def _save(self) -> None:
"""Save sessions index to disk."""
self.sessions_dir.mkdir(parents=True, exist_ok=True)
sessions_file = self.sessions_dir / "sessions.json"
data = {key: entry.to_dict() for key, entry in self._entries.items()}
with open(sessions_file, "w") as f:
json.dump(data, f, indent=2)
def _generate_session_key(self, source: SessionSource) -> str:
"""Generate a session key from a source."""
platform = source.platform.value
if source.chat_type == "dm":
# DMs share the main session per platform
return f"agent:main:{platform}:dm"
else:
# Groups/channels get their own keys
return f"agent:main:{platform}:{source.chat_type}:{source.chat_id}"
def _should_reset(self, entry: SessionEntry, source: SessionSource) -> bool:
"""
Check if a session should be reset based on policy.
Returns True if the session is stale and should start fresh.
Sessions with active background processes are never reset.
"""
# Don't reset sessions that have active background processes
if self._has_active_processes_fn:
session_key = self._generate_session_key(source)
if self._has_active_processes_fn(session_key):
return False
policy = self.config.get_reset_policy(
platform=source.platform,
session_type=source.chat_type
)
now = datetime.now()
# Check idle timeout
if policy.mode in ("idle", "both"):
idle_deadline = entry.updated_at + timedelta(minutes=policy.idle_minutes)
if now > idle_deadline:
return True
# Check daily reset
if policy.mode in ("daily", "both"):
# Find the most recent reset boundary
today_reset = now.replace(
hour=policy.at_hour,
minute=0,
second=0,
microsecond=0
)
if now.hour < policy.at_hour:
# Reset boundary was yesterday
today_reset -= timedelta(days=1)
if entry.updated_at < today_reset:
return True
return False
def get_or_create_session(
self,
source: SessionSource,
force_new: bool = False
) -> SessionEntry:
"""
Get an existing session or create a new one.
Evaluates reset policy to determine if the existing session is stale.
"""
self._ensure_loaded()
session_key = self._generate_session_key(source)
now = datetime.now()
# Check for existing session
if session_key in self._entries and not force_new:
entry = self._entries[session_key]
# Check if session should be reset
if not self._should_reset(entry, source):
# Update timestamp and return existing
entry.updated_at = now
self._save()
return entry
# Create new session
session_id = f"{now.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
entry = SessionEntry(
session_key=session_key,
session_id=session_id,
created_at=now,
updated_at=now,
origin=source,
display_name=source.chat_name,
platform=source.platform,
chat_type=source.chat_type,
)
self._entries[session_key] = entry
self._save()
return entry
def update_session(
self,
session_key: str,
input_tokens: int = 0,
output_tokens: int = 0
) -> None:
"""Update a session's metadata after an interaction."""
self._ensure_loaded()
if session_key in self._entries:
entry = self._entries[session_key]
entry.updated_at = datetime.now()
entry.input_tokens += input_tokens
entry.output_tokens += output_tokens
entry.total_tokens = entry.input_tokens + entry.output_tokens
self._save()
def reset_session(self, session_key: str) -> Optional[SessionEntry]:
"""Force reset a session, creating a new session ID."""
self._ensure_loaded()
if session_key not in self._entries:
return None
old_entry = self._entries[session_key]
now = datetime.now()
session_id = f"{now.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
new_entry = SessionEntry(
session_key=session_key,
session_id=session_id,
created_at=now,
updated_at=now,
origin=old_entry.origin,
display_name=old_entry.display_name,
platform=old_entry.platform,
chat_type=old_entry.chat_type,
)
self._entries[session_key] = new_entry
self._save()
return new_entry
def list_sessions(self, active_minutes: Optional[int] = None) -> List[SessionEntry]:
"""
List all sessions, optionally filtered by activity.
Args:
active_minutes: If provided, only return sessions updated within this many minutes
"""
self._ensure_loaded()
entries = list(self._entries.values())
if active_minutes is not None:
cutoff = datetime.now() - timedelta(minutes=active_minutes)
entries = [e for e in entries if e.updated_at >= cutoff]
# Sort by most recently updated
entries.sort(key=lambda e: e.updated_at, reverse=True)
return entries
def get_transcript_path(self, session_id: str) -> Path:
"""Get the path to a session's transcript file."""
return self.sessions_dir / f"{session_id}.jsonl"
def append_to_transcript(self, session_id: str, message: Dict[str, Any]) -> None:
"""Append a message to a session's transcript."""
transcript_path = self.get_transcript_path(session_id)
with open(transcript_path, "a") as f:
f.write(json.dumps(message, ensure_ascii=False) + "\n")
def load_transcript(self, session_id: str) -> List[Dict[str, Any]]:
"""Load all messages from a session's transcript."""
transcript_path = self.get_transcript_path(session_id)
if not transcript_path.exists():
return []
messages = []
with open(transcript_path, "r") as f:
for line in f:
line = line.strip()
if line:
messages.append(json.loads(line))
return messages
def build_session_context(
source: SessionSource,
config: GatewayConfig,
session_entry: Optional[SessionEntry] = None
) -> SessionContext:
"""
Build a full session context from a source and config.
This is used to inject context into the agent's system prompt.
"""
connected = config.get_connected_platforms()
home_channels = {}
for platform in connected:
home = config.get_home_channel(platform)
if home:
home_channels[platform] = home
context = SessionContext(
source=source,
connected_platforms=connected,
home_channels=home_channels,
)
if session_entry:
context.session_key = session_entry.session_key
context.session_id = session_entry.session_id
context.created_at = session_entry.created_at
context.updated_at = session_entry.updated_at
return context

View File

@@ -1,111 +0,0 @@
"""
Sticker description cache for Telegram.
When users send stickers, we describe them via the vision tool and cache
the descriptions keyed by file_unique_id so we don't re-analyze the same
sticker image on every send. Descriptions are concise (1-2 sentences).
Cache location: ~/.hermes/sticker_cache.json
"""
import json
import os
import time
from pathlib import Path
from typing import Optional
CACHE_PATH = Path(os.path.expanduser("~/.hermes/sticker_cache.json"))
# Vision prompt for describing stickers -- kept concise to save tokens
STICKER_VISION_PROMPT = (
"Describe this sticker in 1-2 sentences. Focus on what it depicts -- "
"character, action, emotion. Be concise and objective."
)
def _load_cache() -> dict:
"""Load the sticker cache from disk."""
if CACHE_PATH.exists():
try:
return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
return {}
return {}
def _save_cache(cache: dict) -> None:
"""Save the sticker cache to disk."""
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
CACHE_PATH.write_text(
json.dumps(cache, indent=2, ensure_ascii=False),
encoding="utf-8",
)
def get_cached_description(file_unique_id: str) -> Optional[dict]:
"""
Look up a cached sticker description.
Returns:
dict with keys {description, emoji, set_name, cached_at} or None.
"""
cache = _load_cache()
return cache.get(file_unique_id)
def cache_sticker_description(
file_unique_id: str,
description: str,
emoji: str = "",
set_name: str = "",
) -> None:
"""
Store a sticker description in the cache.
Args:
file_unique_id: Telegram's stable sticker identifier.
description: Vision-generated description text.
emoji: Associated emoji (e.g. "😀").
set_name: Sticker set name if available.
"""
cache = _load_cache()
cache[file_unique_id] = {
"description": description,
"emoji": emoji,
"set_name": set_name,
"cached_at": time.time(),
}
_save_cache(cache)
def build_sticker_injection(
description: str,
emoji: str = "",
set_name: str = "",
) -> str:
"""
Build the warm-style injection text for a sticker description.
Returns a string like:
[The user sent a sticker 😀 from "MyPack"~ It shows: "A cat waving" (=^.w.^=)]
"""
context = ""
if set_name and emoji:
context = f" {emoji} from \"{set_name}\""
elif emoji:
context = f" {emoji}"
return f"[The user sent a sticker{context}~ It shows: \"{description}\" (=^.w.^=)]"
def build_animated_sticker_injection(emoji: str = "") -> str:
"""
Build injection text for animated/video stickers we can't analyze.
"""
if emoji:
return (
f"[The user sent an animated sticker {emoji}~ "
f"I can't see animated ones yet, but the emoji suggests: {emoji}]"
)
return "[The user sent an animated sticker~ I can't see animated ones yet]"

12
hermes
View File

@@ -1,12 +0,0 @@
#!/usr/bin/env python3
"""
Hermes Agent CLI Launcher
This is a convenience wrapper to launch the Hermes CLI.
Usage: ./hermes [options]
"""
if __name__ == "__main__":
from cli import main
import fire
fire.Fire(main)

View File

@@ -1,14 +0,0 @@
"""
Hermes CLI - Unified command-line interface for Hermes Agent.
Provides subcommands for:
- hermes chat - Interactive chat (same as ./hermes)
- hermes gateway - Run gateway in foreground
- hermes gateway start - Start gateway service
- hermes gateway stop - Stop gateway service
- hermes setup - Interactive setup wizard
- hermes status - Show status of all components
- hermes cron - Manage cron jobs
"""
__version__ = "0.1.0"

View File

@@ -1,897 +0,0 @@
"""
Configuration management for Hermes Agent.
Config files are stored in ~/.hermes/ for easy access:
- ~/.hermes/config.yaml - All settings (model, toolsets, terminal, etc.)
- ~/.hermes/.env - API keys and secrets
This module provides:
- hermes config - Show current configuration
- hermes config edit - Open config in editor
- hermes config set - Set a specific value
- hermes config wizard - Re-run setup wizard
"""
import os
import sys
import subprocess
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
import yaml
# ANSI colors
class Colors:
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
BLUE = "\033[34m"
MAGENTA = "\033[35m"
CYAN = "\033[36m"
def color(text: str, *codes) -> str:
if not sys.stdout.isatty():
return text
return "".join(codes) + text + Colors.RESET
# =============================================================================
# Config paths
# =============================================================================
def get_hermes_home() -> Path:
"""Get the Hermes home directory (~/.hermes)."""
return Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
def get_config_path() -> Path:
"""Get the main config file path."""
return get_hermes_home() / "config.yaml"
def get_env_path() -> Path:
"""Get the .env file path (for API keys)."""
return get_hermes_home() / ".env"
def get_project_root() -> Path:
"""Get the project installation directory."""
return Path(__file__).parent.parent.resolve()
def ensure_hermes_home():
"""Ensure ~/.hermes directory structure exists."""
home = get_hermes_home()
(home / "cron").mkdir(parents=True, exist_ok=True)
(home / "sessions").mkdir(parents=True, exist_ok=True)
(home / "logs").mkdir(parents=True, exist_ok=True)
# =============================================================================
# Config loading/saving
# =============================================================================
DEFAULT_CONFIG = {
"model": "anthropic/claude-opus-4.6",
"toolsets": ["hermes-cli"],
"max_turns": 100,
"terminal": {
"backend": "local",
"cwd": ".", # Use current directory
"timeout": 180,
"docker_image": "nikolaik/python-nodejs:python3.11-nodejs20",
"singularity_image": "docker://nikolaik/python-nodejs:python3.11-nodejs20",
"modal_image": "nikolaik/python-nodejs:python3.11-nodejs20",
},
"browser": {
"inactivity_timeout": 120,
},
"compression": {
"enabled": True,
"threshold": 0.85,
"summary_model": "google/gemini-3-flash-preview",
},
"display": {
"compact": False,
"personality": "kawaii",
},
# Text-to-speech configuration
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai"
"edge": {
"voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
},
"elevenlabs": {
"voice_id": "pNInz6obpgDQGcFmaJgB", # Adam
"model_id": "eleven_multilingual_v2",
},
"openai": {
"model": "gpt-4o-mini-tts",
"voice": "alloy",
# Voices: alloy, echo, fable, onyx, nova, shimmer
},
},
"stt": {
"enabled": True,
"model": "whisper-1",
},
"human_delay": {
"mode": "off",
"min_ms": 800,
"max_ms": 2500,
},
# Permanently allowed dangerous command patterns (added via "always" approval)
"command_allowlist": [],
# Config schema version - bump this when adding new required fields
"_config_version": 2,
}
# =============================================================================
# Config Migration System
# =============================================================================
# Required environment variables with metadata for migration prompts
REQUIRED_ENV_VARS = {
"OPENROUTER_API_KEY": {
"description": "OpenRouter API key (required for vision, web scraping, and tools)",
"prompt": "OpenRouter API key",
"url": "https://openrouter.ai/keys",
"required": True,
"password": True,
},
}
# Optional environment variables that enhance functionality
OPTIONAL_ENV_VARS = {
"FIRECRAWL_API_KEY": {
"description": "Firecrawl API key for web search and scraping",
"prompt": "Firecrawl API key",
"url": "https://firecrawl.dev/",
"tools": ["web_search", "web_extract"],
"password": True,
},
"BROWSERBASE_API_KEY": {
"description": "Browserbase API key for browser automation",
"prompt": "Browserbase API key",
"url": "https://browserbase.com/",
"tools": ["browser_navigate", "browser_click", "etc."],
"password": True,
},
"BROWSERBASE_PROJECT_ID": {
"description": "Browserbase project ID",
"prompt": "Browserbase project ID",
"url": "https://browserbase.com/",
"tools": ["browser_navigate", "browser_click", "etc."],
"password": False,
},
"FAL_KEY": {
"description": "FAL API key for image generation",
"prompt": "FAL API key",
"url": "https://fal.ai/",
"tools": ["image_generate"],
"password": True,
},
"TINKER_API_KEY": {
"description": "Tinker API key for RL training",
"prompt": "Tinker API key",
"url": "https://tinker-console.thinkingmachines.ai/keys",
"tools": ["rl_start_training", "rl_check_status", "rl_stop_training"],
"password": True,
},
"WANDB_API_KEY": {
"description": "Weights & Biases API key for experiment tracking",
"prompt": "WandB API key",
"url": "https://wandb.ai/authorize",
"tools": ["rl_get_results", "rl_check_status"],
"password": True,
},
"OPENAI_BASE_URL": {
"description": "Custom OpenAI-compatible API endpoint (for VLLM/SGLang/etc.)",
"prompt": "OpenAI-compatible base URL (only if running your own endpoint)",
"url": None,
"password": False,
"advanced": True, # Hide from standard migrate flow
},
"HERMES_OPENAI_API_KEY": {
"description": "OpenAI API key for voice transcription (Whisper) and OpenAI TTS",
"prompt": "OpenAI API Key (for Whisper STT + TTS)",
"url": "https://platform.openai.com/api-keys",
"tools": ["voice_transcription", "openai_tts"],
"password": True,
},
"SLACK_BOT_TOKEN": {
"description": "Slack bot integration",
"prompt": "Slack Bot Token (xoxb-...)",
"url": "https://api.slack.com/apps",
"tools": ["slack"],
"password": True,
},
"SLACK_APP_TOKEN": {
"description": "Slack Socket Mode connection",
"prompt": "Slack App Token (xapp-...)",
"url": "https://api.slack.com/apps",
"tools": ["slack"],
"password": True,
},
# Messaging platform tokens
"TELEGRAM_BOT_TOKEN": {
"description": "Telegram bot token from @BotFather",
"prompt": "Telegram bot token",
"url": "https://t.me/BotFather",
"password": True,
},
"TELEGRAM_ALLOWED_USERS": {
"description": "Comma-separated Telegram user IDs allowed to use the bot (get ID from @userinfobot)",
"prompt": "Allowed Telegram user IDs (comma-separated)",
"url": "https://t.me/userinfobot",
"password": False,
},
"DISCORD_BOT_TOKEN": {
"description": "Discord bot token from Developer Portal",
"prompt": "Discord bot token",
"url": "https://discord.com/developers/applications",
"password": True,
},
"DISCORD_ALLOWED_USERS": {
"description": "Comma-separated Discord user IDs allowed to use the bot",
"prompt": "Allowed Discord user IDs (comma-separated)",
"url": None,
"password": False,
},
# Text-to-speech (premium providers)
"ELEVENLABS_API_KEY": {
"description": "ElevenLabs API key for premium text-to-speech voices",
"prompt": "ElevenLabs API key",
"url": "https://elevenlabs.io/",
"password": True,
},
# Terminal configuration
"MESSAGING_CWD": {
"description": "Working directory for terminal commands via messaging (Telegram/Discord/etc). CLI always uses current directory.",
"prompt": "Messaging working directory (default: home)",
"url": None,
"password": False,
},
"SUDO_PASSWORD": {
"description": "Sudo password for terminal commands requiring root access",
"prompt": "Sudo password",
"url": None,
"password": True,
},
# Agent configuration
"HERMES_MAX_ITERATIONS": {
"description": "Maximum tool-calling iterations per conversation (default: 60)",
"prompt": "Max iterations",
"url": None,
"password": False,
},
"HERMES_TOOL_PROGRESS": {
"description": "Send tool progress messages in messaging channels (true/false)",
"prompt": "Enable tool progress messages",
"url": None,
"password": False,
},
"HERMES_TOOL_PROGRESS_MODE": {
"description": "Progress mode: 'all' (every tool) or 'new' (only when tool changes)",
"prompt": "Progress mode (all/new)",
"url": None,
"password": False,
},
}
def get_missing_env_vars(required_only: bool = False) -> List[Dict[str, Any]]:
"""
Check which environment variables are missing.
Returns list of dicts with var info for missing variables.
"""
missing = []
# Check required vars
for var_name, info in REQUIRED_ENV_VARS.items():
if not get_env_value(var_name):
missing.append({"name": var_name, **info, "is_required": True})
# Check optional vars (if not required_only)
if not required_only:
for var_name, info in OPTIONAL_ENV_VARS.items():
if not get_env_value(var_name):
missing.append({"name": var_name, **info, "is_required": False})
return missing
def get_missing_config_fields() -> List[Dict[str, Any]]:
"""
Check which config fields are missing or outdated.
Returns list of missing/outdated fields.
"""
config = load_config()
missing = []
# Check for new top-level keys in DEFAULT_CONFIG
for key, default_value in DEFAULT_CONFIG.items():
if key.startswith('_'):
continue # Skip internal keys
if key not in config:
missing.append({
"key": key,
"default": default_value,
"description": f"New config section: {key}",
})
elif isinstance(default_value, dict):
# Check nested keys
for subkey, subvalue in default_value.items():
if subkey not in config.get(key, {}):
missing.append({
"key": f"{key}.{subkey}",
"default": subvalue,
"description": f"New config option: {key}.{subkey}",
})
return missing
def check_config_version() -> Tuple[int, int]:
"""
Check config version.
Returns (current_version, latest_version).
"""
config = load_config()
current = config.get("_config_version", 0)
latest = DEFAULT_CONFIG.get("_config_version", 1)
return current, latest
def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, Any]:
"""
Migrate config to latest version, prompting for new required fields.
Args:
interactive: If True, prompt user for missing values
quiet: If True, suppress output
Returns:
Dict with migration results: {"env_added": [...], "config_added": [...], "warnings": [...]}
"""
results = {"env_added": [], "config_added": [], "warnings": []}
# Check config version
current_ver, latest_ver = check_config_version()
if current_ver < latest_ver and not quiet:
print(f"Config version: {current_ver}{latest_ver}")
# Check for missing required env vars
missing_env = get_missing_env_vars(required_only=True)
if missing_env and not quiet:
print("\n⚠️ Missing required environment variables:")
for var in missing_env:
print(f"{var['name']}: {var['description']}")
if interactive and missing_env:
print("\nLet's configure them now:\n")
for var in missing_env:
if var.get("url"):
print(f" Get your key at: {var['url']}")
if var.get("password"):
import getpass
value = getpass.getpass(f" {var['prompt']}: ")
else:
value = input(f" {var['prompt']}: ").strip()
if value:
save_env_value(var["name"], value)
results["env_added"].append(var["name"])
print(f" ✓ Saved {var['name']}")
else:
results["warnings"].append(f"Skipped {var['name']} - some features may not work")
print()
# Check for missing optional env vars and offer to configure interactively
# Skip "advanced" vars (like OPENAI_BASE_URL) -- those are for power users
missing_optional = get_missing_env_vars(required_only=False)
required_names = {v["name"] for v in missing_env} if missing_env else set()
missing_optional = [
v for v in missing_optional
if v["name"] not in required_names and not v.get("advanced")
]
if interactive and missing_optional:
print(" Would you like to configure any optional keys now?")
try:
answer = input(" Configure optional keys? [y/N]: ").strip().lower()
except (EOFError, KeyboardInterrupt):
answer = "n"
if answer in ("y", "yes"):
print()
for var in missing_optional:
desc = var.get("description", "")
if var.get("url"):
print(f" {desc}")
print(f" Get your key at: {var['url']}")
else:
print(f" {desc}")
if var.get("password"):
import getpass
value = getpass.getpass(f" {var['prompt']} (Enter to skip): ")
else:
value = input(f" {var['prompt']} (Enter to skip): ").strip()
if value:
save_env_value(var["name"], value)
results["env_added"].append(var["name"])
print(f" ✓ Saved {var['name']}")
print()
# Check for missing config fields
missing_config = get_missing_config_fields()
if missing_config:
config = load_config()
for field in missing_config:
key = field["key"]
default = field["default"]
# Add with default value
if "." in key:
# Nested key
parent, child = key.split(".", 1)
if parent not in config:
config[parent] = {}
config[parent][child] = default
else:
config[key] = default
results["config_added"].append(key)
if not quiet:
print(f" ✓ Added {key} = {default}")
# Update version and save
config["_config_version"] = latest_ver
save_config(config)
elif current_ver < latest_ver:
# Just update version
config = load_config()
config["_config_version"] = latest_ver
save_config(config)
return results
def load_config() -> Dict[str, Any]:
"""Load configuration from ~/.hermes/config.yaml."""
import copy
config_path = get_config_path()
# Deep copy to avoid mutating DEFAULT_CONFIG
config = copy.deepcopy(DEFAULT_CONFIG)
if config_path.exists():
try:
with open(config_path) as f:
user_config = yaml.safe_load(f) or {}
# Deep merge user values over defaults
for key, value in user_config.items():
if isinstance(value, dict) and key in config and isinstance(config[key], dict):
config[key].update(value)
else:
config[key] = value
except Exception as e:
print(f"Warning: Failed to load config: {e}")
return config
def save_config(config: Dict[str, Any]):
"""Save configuration to ~/.hermes/config.yaml."""
ensure_hermes_home()
config_path = get_config_path()
with open(config_path, 'w') as f:
yaml.dump(config, f, default_flow_style=False, sort_keys=False)
def load_env() -> Dict[str, str]:
"""Load environment variables from ~/.hermes/.env."""
env_path = get_env_path()
env_vars = {}
if env_path.exists():
with open(env_path) as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, _, value = line.partition('=')
env_vars[key.strip()] = value.strip().strip('"\'')
return env_vars
def save_env_value(key: str, value: str):
"""Save or update a value in ~/.hermes/.env."""
ensure_hermes_home()
env_path = get_env_path()
# Load existing
lines = []
if env_path.exists():
with open(env_path) as f:
lines = f.readlines()
# Find and update or append
found = False
for i, line in enumerate(lines):
if line.strip().startswith(f"{key}="):
lines[i] = f"{key}={value}\n"
found = True
break
if not found:
# Ensure there's a newline at the end of the file before appending
if lines and not lines[-1].endswith("\n"):
lines[-1] += "\n"
lines.append(f"{key}={value}\n")
with open(env_path, 'w') as f:
f.writelines(lines)
def get_env_value(key: str) -> Optional[str]:
"""Get a value from ~/.hermes/.env or environment."""
# Check environment first
if key in os.environ:
return os.environ[key]
# Then check .env file
env_vars = load_env()
return env_vars.get(key)
# =============================================================================
# Config display
# =============================================================================
def redact_key(key: str) -> str:
"""Redact an API key for display."""
if not key:
return color("(not set)", Colors.DIM)
if len(key) < 12:
return "***"
return key[:4] + "..." + key[-4:]
def show_config():
"""Display current configuration."""
config = load_config()
env_vars = load_env()
print()
print(color("┌─────────────────────────────────────────────────────────┐", Colors.CYAN))
print(color("│ 🦋 Hermes Configuration │", Colors.CYAN))
print(color("└─────────────────────────────────────────────────────────┘", Colors.CYAN))
# Paths
print()
print(color("◆ Paths", Colors.CYAN, Colors.BOLD))
print(f" Config: {get_config_path()}")
print(f" Secrets: {get_env_path()}")
print(f" Install: {get_project_root()}")
# API Keys
print()
print(color("◆ API Keys", Colors.CYAN, Colors.BOLD))
keys = [
("OPENROUTER_API_KEY", "OpenRouter"),
("ANTHROPIC_API_KEY", "Anthropic"),
("HERMES_OPENAI_API_KEY", "OpenAI (STT/TTS)"),
("FIRECRAWL_API_KEY", "Firecrawl"),
("BROWSERBASE_API_KEY", "Browserbase"),
("FAL_KEY", "FAL"),
]
for env_key, name in keys:
value = get_env_value(env_key)
print(f" {name:<14} {redact_key(value)}")
# Model settings
print()
print(color("◆ Model", Colors.CYAN, Colors.BOLD))
print(f" Model: {config.get('model', 'not set')}")
print(f" Max turns: {config.get('max_turns', 100)}")
print(f" Toolsets: {', '.join(config.get('toolsets', ['all']))}")
# Terminal
print()
print(color("◆ Terminal", Colors.CYAN, Colors.BOLD))
terminal = config.get('terminal', {})
print(f" Backend: {terminal.get('backend', 'local')}")
print(f" Working dir: {terminal.get('cwd', '.')}")
print(f" Timeout: {terminal.get('timeout', 60)}s")
if terminal.get('backend') == 'docker':
print(f" Docker image: {terminal.get('docker_image', 'python:3.11-slim')}")
elif terminal.get('backend') == 'singularity':
print(f" Image: {terminal.get('singularity_image', 'docker://python:3.11')}")
elif terminal.get('backend') == 'modal':
print(f" Modal image: {terminal.get('modal_image', 'python:3.11')}")
modal_token = get_env_value('MODAL_TOKEN_ID')
print(f" Modal token: {'configured' if modal_token else '(not set)'}")
elif terminal.get('backend') == 'ssh':
ssh_host = get_env_value('TERMINAL_SSH_HOST')
ssh_user = get_env_value('TERMINAL_SSH_USER')
print(f" SSH host: {ssh_host or '(not set)'}")
print(f" SSH user: {ssh_user or '(not set)'}")
# Compression
print()
print(color("◆ Context Compression", Colors.CYAN, Colors.BOLD))
compression = config.get('compression', {})
enabled = compression.get('enabled', True)
print(f" Enabled: {'yes' if enabled else 'no'}")
if enabled:
print(f" Threshold: {compression.get('threshold', 0.85) * 100:.0f}%")
print(f" Model: {compression.get('summary_model', 'google/gemini-3-flash-preview')}")
# Messaging
print()
print(color("◆ Messaging Platforms", Colors.CYAN, Colors.BOLD))
telegram_token = get_env_value('TELEGRAM_BOT_TOKEN')
discord_token = get_env_value('DISCORD_BOT_TOKEN')
print(f" Telegram: {'configured' if telegram_token else color('not configured', Colors.DIM)}")
print(f" Discord: {'configured' if discord_token else color('not configured', Colors.DIM)}")
print()
print(color("" * 60, Colors.DIM))
print(color(" hermes config edit # Edit config file", Colors.DIM))
print(color(" hermes config set KEY VALUE", Colors.DIM))
print(color(" hermes setup # Run setup wizard", Colors.DIM))
print()
def edit_config():
"""Open config file in user's editor."""
config_path = get_config_path()
# Ensure config exists
if not config_path.exists():
save_config(DEFAULT_CONFIG)
print(f"Created {config_path}")
# Find editor
editor = os.getenv('EDITOR') or os.getenv('VISUAL')
if not editor:
# Try common editors
for cmd in ['nano', 'vim', 'vi', 'code', 'notepad']:
import shutil
if shutil.which(cmd):
editor = cmd
break
if not editor:
print(f"No editor found. Config file is at:")
print(f" {config_path}")
return
print(f"Opening {config_path} in {editor}...")
subprocess.run([editor, str(config_path)])
def set_config_value(key: str, value: str):
"""Set a configuration value."""
# Check if it's an API key (goes to .env)
api_keys = [
'OPENROUTER_API_KEY', 'ANTHROPIC_API_KEY', 'HERMES_OPENAI_API_KEY',
'FIRECRAWL_API_KEY', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID',
'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',
]
if key.upper() in api_keys or key.upper().startswith('TERMINAL_SSH'):
save_env_value(key.upper(), value)
print(f"✓ Set {key} in {get_env_path()}")
return
# Otherwise it goes to config.yaml
# Read the raw user config (not merged with defaults) to avoid
# dumping all default values back to the file
config_path = get_config_path()
user_config = {}
if config_path.exists():
try:
with open(config_path) as f:
user_config = yaml.safe_load(f) or {}
except Exception:
user_config = {}
# Handle nested keys (e.g., "tts.provider")
parts = key.split('.')
current = user_config
for part in parts[:-1]:
if part not in current or not isinstance(current.get(part), dict):
current[part] = {}
current = current[part]
# Convert value to appropriate type
if value.lower() in ('true', 'yes', 'on'):
value = True
elif value.lower() in ('false', 'no', 'off'):
value = False
elif value.isdigit():
value = int(value)
elif value.replace('.', '', 1).isdigit():
value = float(value)
current[parts[-1]] = value
# Write only user config back (not the full merged defaults)
ensure_hermes_home()
with open(config_path, 'w') as f:
yaml.dump(user_config, f, default_flow_style=False, sort_keys=False)
print(f"✓ Set {key} = {value} in {config_path}")
# =============================================================================
# Command handler
# =============================================================================
def config_command(args):
"""Handle config subcommands."""
subcmd = getattr(args, 'config_command', None)
if subcmd is None or subcmd == "show":
show_config()
elif subcmd == "edit":
edit_config()
elif subcmd == "set":
key = getattr(args, 'key', None)
value = getattr(args, 'value', None)
if not key or not value:
print("Usage: hermes config set KEY VALUE")
print()
print("Examples:")
print(" hermes config set model anthropic/claude-sonnet-4")
print(" hermes config set terminal.backend docker")
print(" hermes config set OPENROUTER_API_KEY sk-or-...")
sys.exit(1)
set_config_value(key, value)
elif subcmd == "path":
print(get_config_path())
elif subcmd == "env-path":
print(get_env_path())
elif subcmd == "migrate":
print()
print(color("🔄 Checking configuration for updates...", Colors.CYAN, Colors.BOLD))
print()
# Check what's missing
missing_env = get_missing_env_vars(required_only=False)
missing_config = get_missing_config_fields()
current_ver, latest_ver = check_config_version()
if not missing_env and not missing_config and current_ver >= latest_ver:
print(color("✓ Configuration is up to date!", Colors.GREEN))
print()
return
# Show what needs to be updated
if current_ver < latest_ver:
print(f" Config version: {current_ver}{latest_ver}")
if missing_config:
print(f"\n {len(missing_config)} new config option(s) will be added with defaults")
required_missing = [v for v in missing_env if v.get("is_required")]
optional_missing = [
v for v in missing_env
if not v.get("is_required") and not v.get("advanced")
]
if required_missing:
print(f"\n ⚠️ {len(required_missing)} required API key(s) missing:")
for var in required_missing:
print(f"{var['name']}")
if optional_missing:
print(f"\n {len(optional_missing)} optional API key(s) not configured:")
for var in optional_missing:
tools = var.get("tools", [])
tools_str = f" (enables: {', '.join(tools[:2])})" if tools else ""
print(f"{var['name']}{tools_str}")
print()
# Run migration
results = migrate_config(interactive=True, quiet=False)
print()
if results["env_added"] or results["config_added"]:
print(color("✓ Configuration updated!", Colors.GREEN))
if results["warnings"]:
print()
for warning in results["warnings"]:
print(color(f" ⚠️ {warning}", Colors.YELLOW))
print()
elif subcmd == "check":
# Non-interactive check for what's missing
print()
print(color("📋 Configuration Status", Colors.CYAN, Colors.BOLD))
print()
current_ver, latest_ver = check_config_version()
if current_ver >= latest_ver:
print(f" Config version: {current_ver}")
else:
print(color(f" Config version: {current_ver}{latest_ver} (update available)", Colors.YELLOW))
print()
print(color(" Required:", Colors.BOLD))
for var_name in REQUIRED_ENV_VARS:
if get_env_value(var_name):
print(f"{var_name}")
else:
print(color(f"{var_name} (missing)", Colors.RED))
print()
print(color(" Optional:", Colors.BOLD))
for var_name, info in OPTIONAL_ENV_VARS.items():
if get_env_value(var_name):
print(f"{var_name}")
else:
tools = info.get("tools", [])
tools_str = f"{', '.join(tools[:2])}" if tools else ""
print(color(f"{var_name}{tools_str}", Colors.DIM))
missing_config = get_missing_config_fields()
if missing_config:
print()
print(color(f" {len(missing_config)} new config option(s) available", Colors.YELLOW))
print(f" Run 'hermes config migrate' to add them")
print()
else:
print(f"Unknown config command: {subcmd}")
print()
print("Available commands:")
print(" hermes config Show current configuration")
print(" hermes config edit Open config in editor")
print(" hermes config set K V Set a config value")
print(" hermes config check Check for missing/outdated config")
print(" hermes config migrate Update config with new options")
print(" hermes config path Show config file path")
print(" hermes config env-path Show .env file path")
sys.exit(1)

View File

@@ -1,131 +0,0 @@
"""
Cron subcommand for hermes CLI.
Handles: hermes cron [list|daemon|tick]
"""
import json
import sys
import time
from pathlib import Path
from datetime import datetime
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(PROJECT_ROOT))
# ANSI colors
class Colors:
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
def color(text: str, *codes) -> str:
if not sys.stdout.isatty():
return text
return "".join(codes) + text + Colors.RESET
def cron_list(show_all: bool = False):
"""List all scheduled jobs."""
from cron.jobs import list_jobs
jobs = list_jobs(include_disabled=show_all)
if not jobs:
print(color("No scheduled jobs.", Colors.DIM))
print(color("Create one with: hermes cron add <schedule> <prompt>", Colors.DIM))
return
print()
print(color("┌─────────────────────────────────────────────────────────────────────────┐", Colors.CYAN))
print(color("│ Scheduled Jobs │", Colors.CYAN))
print(color("└─────────────────────────────────────────────────────────────────────────┘", Colors.CYAN))
print()
for job in jobs:
job_id = job.get("id", "?")[:8]
name = job.get("name", "(unnamed)")
schedule = job.get("schedule_display", job.get("schedule", {}).get("value", "?"))
enabled = job.get("enabled", True)
next_run = job.get("next_run_at", "?")
# Repeat info
repeat_info = job.get("repeat", {})
repeat_times = repeat_info.get("times")
repeat_completed = repeat_info.get("completed", 0)
if repeat_times:
repeat_str = f"{repeat_completed}/{repeat_times}"
else:
repeat_str = ""
# Delivery targets
deliver = job.get("deliver", ["local"])
if isinstance(deliver, str):
deliver = [deliver]
deliver_str = ", ".join(deliver)
# Status indicator
if not enabled:
status = color("[disabled]", Colors.RED)
else:
status = color("[active]", Colors.GREEN)
print(f" {color(job_id, Colors.YELLOW)} {status}")
print(f" Name: {name}")
print(f" Schedule: {schedule}")
print(f" Repeat: {repeat_str}")
print(f" Next run: {next_run}")
print(f" Deliver: {deliver_str}")
print()
def cron_daemon(interval: int = 60):
"""Run the cron daemon."""
from cron.scheduler import start_daemon
print(color("┌─────────────────────────────────────────────────────────┐", Colors.CYAN))
print(color("│ 🦋 Hermes Cron Daemon │", Colors.CYAN))
print(color("├─────────────────────────────────────────────────────────┤", Colors.CYAN))
print(color("│ Press Ctrl+C to stop │", Colors.CYAN))
print(color("└─────────────────────────────────────────────────────────┘", Colors.CYAN))
print()
try:
start_daemon(interval=interval)
except KeyboardInterrupt:
print()
print(color("Cron daemon stopped.", Colors.YELLOW))
def cron_tick():
"""Run due jobs once (for system cron integration)."""
from cron.scheduler import tick
print(f"[{datetime.now().isoformat()}] Running cron tick...")
tick()
def cron_command(args):
"""Handle cron subcommands."""
subcmd = getattr(args, 'cron_command', None)
if subcmd is None or subcmd == "list":
show_all = getattr(args, 'all', False)
cron_list(show_all)
elif subcmd == "daemon":
interval = getattr(args, 'interval', 60)
cron_daemon(interval)
elif subcmd == "tick":
cron_tick()
else:
print(f"Unknown cron command: {subcmd}")
print("Usage: hermes cron [list|daemon|tick]")
sys.exit(1)

View File

@@ -1,402 +0,0 @@
"""
Doctor command for hermes CLI.
Diagnoses issues with Hermes Agent setup.
"""
import os
import sys
import subprocess
import shutil
from pathlib import Path
from hermes_cli.config import get_project_root, get_hermes_home, get_env_path
PROJECT_ROOT = get_project_root()
HERMES_HOME = get_hermes_home()
# Load environment variables from ~/.hermes/.env so API key checks work
from dotenv import load_dotenv
_env_path = get_env_path()
if _env_path.exists():
load_dotenv(_env_path)
# Also try project .env as fallback
load_dotenv(PROJECT_ROOT / ".env", override=False)
# ANSI colors
class Colors:
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
def color(text: str, *codes) -> str:
if not sys.stdout.isatty():
return text
return "".join(codes) + text + Colors.RESET
def check_ok(text: str, detail: str = ""):
print(f" {color('', Colors.GREEN)} {text}" + (f" {color(detail, Colors.DIM)}" if detail else ""))
def check_warn(text: str, detail: str = ""):
print(f" {color('', Colors.YELLOW)} {text}" + (f" {color(detail, Colors.DIM)}" if detail else ""))
def check_fail(text: str, detail: str = ""):
print(f" {color('', Colors.RED)} {text}" + (f" {color(detail, Colors.DIM)}" if detail else ""))
def check_info(text: str):
print(f" {color('', Colors.CYAN)} {text}")
def run_doctor(args):
"""Run diagnostic checks."""
should_fix = getattr(args, 'fix', False)
issues = []
print()
print(color("┌─────────────────────────────────────────────────────────┐", Colors.CYAN))
print(color("│ 🩺 Hermes Doctor │", Colors.CYAN))
print(color("└─────────────────────────────────────────────────────────┘", Colors.CYAN))
# =========================================================================
# Check: Python version
# =========================================================================
print()
print(color("◆ Python Environment", Colors.CYAN, Colors.BOLD))
py_version = sys.version_info
if py_version >= (3, 11):
check_ok(f"Python {py_version.major}.{py_version.minor}.{py_version.micro}")
elif py_version >= (3, 10):
check_ok(f"Python {py_version.major}.{py_version.minor}.{py_version.micro}")
check_warn("Python 3.11+ recommended for RL Training tools (tinker requires >= 3.11)")
elif py_version >= (3, 8):
check_warn(f"Python {py_version.major}.{py_version.minor}.{py_version.micro}", "(3.10+ recommended)")
else:
check_fail(f"Python {py_version.major}.{py_version.minor}.{py_version.micro}", "(3.10+ required)")
issues.append("Upgrade Python to 3.10+")
# Check if in virtual environment
in_venv = sys.prefix != sys.base_prefix
if in_venv:
check_ok("Virtual environment active")
else:
check_warn("Not in virtual environment", "(recommended)")
# =========================================================================
# Check: Required packages
# =========================================================================
print()
print(color("◆ Required Packages", Colors.CYAN, Colors.BOLD))
required_packages = [
("openai", "OpenAI SDK"),
("rich", "Rich (terminal UI)"),
("dotenv", "python-dotenv"),
("yaml", "PyYAML"),
("httpx", "HTTPX"),
]
optional_packages = [
("croniter", "Croniter (cron expressions)"),
("telegram", "python-telegram-bot"),
("discord", "discord.py"),
]
for module, name in required_packages:
try:
__import__(module)
check_ok(name)
except ImportError:
check_fail(name, "(missing)")
issues.append(f"Install {name}: uv pip install {module}")
for module, name in optional_packages:
try:
__import__(module)
check_ok(name, "(optional)")
except ImportError:
check_warn(name, "(optional, not installed)")
# =========================================================================
# Check: Configuration files
# =========================================================================
print()
print(color("◆ Configuration Files", Colors.CYAN, Colors.BOLD))
# Check ~/.hermes/.env (primary location for user config)
env_path = HERMES_HOME / '.env'
if env_path.exists():
check_ok("~/.hermes/.env file exists")
# Check for common issues
content = env_path.read_text()
if "OPENROUTER_API_KEY" in content or "ANTHROPIC_API_KEY" in content:
check_ok("API key configured")
else:
check_warn("No API key found in ~/.hermes/.env")
issues.append("Run 'hermes setup' to configure API keys")
else:
# Also check project root as fallback
fallback_env = PROJECT_ROOT / '.env'
if fallback_env.exists():
check_ok(".env file exists (in project directory)")
else:
check_fail("~/.hermes/.env file missing")
check_info("Run 'hermes setup' to create one")
issues.append("Run 'hermes setup' to create .env")
# Check ~/.hermes/config.yaml (primary) or project cli-config.yaml (fallback)
config_path = HERMES_HOME / 'config.yaml'
if config_path.exists():
check_ok("~/.hermes/config.yaml exists")
else:
fallback_config = PROJECT_ROOT / 'cli-config.yaml'
if fallback_config.exists():
check_ok("cli-config.yaml exists (in project directory)")
else:
check_warn("config.yaml not found", "(using defaults)")
# =========================================================================
# Check: Directory structure
# =========================================================================
print()
print(color("◆ Directory Structure", Colors.CYAN, Colors.BOLD))
hermes_home = Path.home() / ".hermes"
if hermes_home.exists():
check_ok("~/.hermes directory exists")
else:
check_warn("~/.hermes not found", "(will be created on first use)")
# Check for SOUL.md persona file
soul_path = hermes_home / "SOUL.md"
if soul_path.exists():
content = soul_path.read_text(encoding="utf-8").strip()
# Check if it's just the template comments (no real content)
lines = [l for l in content.splitlines() if l.strip() and not l.strip().startswith(("<!--", "-->", "#"))]
if lines:
check_ok("~/.hermes/SOUL.md exists (persona configured)")
else:
check_info("~/.hermes/SOUL.md exists but is empty — edit it to customize personality")
else:
check_warn("~/.hermes/SOUL.md not found", "(create it to give Hermes a custom personality)")
if should_fix:
soul_path.parent.mkdir(parents=True, exist_ok=True)
soul_path.write_text("# Hermes Agent Persona\n\n<!-- Edit this file to customize how Hermes communicates. -->\n", encoding="utf-8")
check_ok("Created ~/.hermes/SOUL.md")
logs_dir = PROJECT_ROOT / "logs"
if logs_dir.exists():
check_ok("logs/ directory exists")
else:
check_warn("logs/ not found", "(will be created on first use)")
# =========================================================================
# Check: External tools
# =========================================================================
print()
print(color("◆ External Tools", Colors.CYAN, Colors.BOLD))
# Git
if shutil.which("git"):
check_ok("git")
else:
check_warn("git not found", "(optional)")
# ripgrep (optional, for faster file search)
if shutil.which("rg"):
check_ok("ripgrep (rg)", "(faster file search)")
else:
check_warn("ripgrep (rg) not found", "(file search uses grep fallback)")
check_info("Install for faster search: sudo apt install ripgrep")
# Docker (optional)
terminal_env = os.getenv("TERMINAL_ENV", "local")
if terminal_env == "docker":
if shutil.which("docker"):
# Check if docker daemon is running
result = subprocess.run(["docker", "info"], capture_output=True)
if result.returncode == 0:
check_ok("docker", "(daemon running)")
else:
check_fail("docker daemon not running")
issues.append("Start Docker daemon")
else:
check_fail("docker not found", "(required for TERMINAL_ENV=docker)")
issues.append("Install Docker or change TERMINAL_ENV")
else:
if shutil.which("docker"):
check_ok("docker", "(optional)")
else:
check_warn("docker not found", "(optional)")
# SSH (if using ssh backend)
if terminal_env == "ssh":
ssh_host = os.getenv("TERMINAL_SSH_HOST")
if ssh_host:
# Try to connect
result = subprocess.run(
["ssh", "-o", "ConnectTimeout=5", "-o", "BatchMode=yes", ssh_host, "echo ok"],
capture_output=True,
text=True
)
if result.returncode == 0:
check_ok(f"SSH connection to {ssh_host}")
else:
check_fail(f"SSH connection to {ssh_host}")
issues.append(f"Check SSH configuration for {ssh_host}")
else:
check_fail("TERMINAL_SSH_HOST not set", "(required for TERMINAL_ENV=ssh)")
issues.append("Set TERMINAL_SSH_HOST in .env")
# Node.js + agent-browser (for browser automation tools)
if shutil.which("node"):
check_ok("Node.js")
# Check if agent-browser is installed
agent_browser_path = PROJECT_ROOT / "node_modules" / "agent-browser"
if agent_browser_path.exists():
check_ok("agent-browser (Node.js)", "(browser automation)")
else:
check_warn("agent-browser not installed", "(run: npm install)")
else:
check_warn("Node.js not found", "(optional, needed for browser tools)")
# =========================================================================
# Check: API connectivity
# =========================================================================
print()
print(color("◆ API Connectivity", Colors.CYAN, Colors.BOLD))
openrouter_key = os.getenv("OPENROUTER_API_KEY")
if openrouter_key:
try:
import httpx
response = httpx.get(
"https://openrouter.ai/api/v1/models",
headers={"Authorization": f"Bearer {openrouter_key}"},
timeout=10
)
if response.status_code == 200:
check_ok("OpenRouter API")
elif response.status_code == 401:
check_fail("OpenRouter API", "(invalid API key)")
issues.append("Check OPENROUTER_API_KEY in .env")
else:
check_fail("OpenRouter API", f"(HTTP {response.status_code})")
except Exception as e:
check_fail("OpenRouter API", f"({e})")
issues.append("Check network connectivity")
else:
check_warn("OpenRouter API", "(not configured)")
anthropic_key = os.getenv("ANTHROPIC_API_KEY")
if anthropic_key:
try:
import httpx
response = httpx.get(
"https://api.anthropic.com/v1/models",
headers={
"x-api-key": anthropic_key,
"anthropic-version": "2023-06-01"
},
timeout=10
)
if response.status_code == 200:
check_ok("Anthropic API")
elif response.status_code == 401:
check_fail("Anthropic API", "(invalid API key)")
else:
# Note: Anthropic may not have /models endpoint
check_warn("Anthropic API", "(couldn't verify)")
except Exception as e:
check_warn("Anthropic API", f"({e})")
# =========================================================================
# Check: Submodules
# =========================================================================
print()
print(color("◆ Submodules", Colors.CYAN, Colors.BOLD))
# mini-swe-agent (terminal tool backend)
mini_swe_dir = PROJECT_ROOT / "mini-swe-agent"
if mini_swe_dir.exists() and (mini_swe_dir / "pyproject.toml").exists():
try:
__import__("minisweagent")
check_ok("mini-swe-agent", "(terminal backend)")
except ImportError:
check_warn("mini-swe-agent found but not installed", "(run: uv pip install -e ./mini-swe-agent)")
issues.append("Install mini-swe-agent: uv pip install -e ./mini-swe-agent")
else:
check_warn("mini-swe-agent not found", "(run: git submodule update --init --recursive)")
# tinker-atropos (RL training backend)
tinker_dir = PROJECT_ROOT / "tinker-atropos"
if tinker_dir.exists() and (tinker_dir / "pyproject.toml").exists():
if py_version >= (3, 11):
try:
__import__("tinker_atropos")
check_ok("tinker-atropos", "(RL training backend)")
except ImportError:
check_warn("tinker-atropos found but not installed", "(run: uv pip install -e ./tinker-atropos)")
issues.append("Install tinker-atropos: uv pip install -e ./tinker-atropos")
else:
check_warn("tinker-atropos requires Python 3.11+", f"(current: {py_version.major}.{py_version.minor})")
else:
check_warn("tinker-atropos not found", "(run: git submodule update --init --recursive)")
# =========================================================================
# Check: Tool Availability
# =========================================================================
print()
print(color("◆ Tool Availability", Colors.CYAN, Colors.BOLD))
try:
# Add project root to path for imports
sys.path.insert(0, str(PROJECT_ROOT))
from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS
available, unavailable = check_tool_availability()
for tid in available:
info = TOOLSET_REQUIREMENTS.get(tid, {})
check_ok(info.get("name", tid))
for item in unavailable:
if item["missing_vars"]:
vars_str = ", ".join(item["missing_vars"])
check_warn(item["name"], f"(missing {vars_str})")
else:
check_warn(item["name"], "(system dependency not met)")
# Count disabled tools with API key requirements
api_disabled = [u for u in unavailable if u["missing_vars"]]
if api_disabled:
issues.append("Run 'hermes setup' to configure missing API keys for full tool access")
except Exception as e:
check_warn("Could not check tool availability", f"({e})")
# =========================================================================
# Summary
# =========================================================================
print()
if issues:
print(color("" * 60, Colors.YELLOW))
print(color(f" Found {len(issues)} issue(s) to address:", Colors.YELLOW, Colors.BOLD))
print()
for i, issue in enumerate(issues, 1):
print(f" {i}. {issue}")
print()
if should_fix:
print(color(" Attempting auto-fix is not yet implemented.", Colors.DIM))
print(color(" Please resolve issues manually.", Colors.DIM))
else:
print(color("" * 60, Colors.GREEN))
print(color(" All checks passed! 🎉", Colors.GREEN, Colors.BOLD))
print()

View File

@@ -1,491 +0,0 @@
"""
Gateway subcommand for hermes CLI.
Handles: hermes gateway [run|start|stop|restart|status|install|uninstall]
"""
import asyncio
import os
import signal
import subprocess
import sys
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
# =============================================================================
# Process Management (for manual gateway runs)
# =============================================================================
def find_gateway_pids() -> list:
"""Find PIDs of running gateway processes."""
pids = []
try:
# Look for gateway processes with multiple patterns
patterns = [
"hermes_cli.main gateway",
"hermes gateway",
"gateway/run.py",
]
result = subprocess.run(
["ps", "aux"],
capture_output=True,
text=True
)
for line in result.stdout.split('\n'):
# Skip grep and current process
if 'grep' in line or str(os.getpid()) in line:
continue
for pattern in patterns:
if pattern in line:
parts = line.split()
if len(parts) > 1:
try:
pid = int(parts[1])
if pid not in pids:
pids.append(pid)
except ValueError:
continue
break
except Exception:
pass
return pids
def kill_gateway_processes(force: bool = False) -> int:
"""Kill any running gateway processes. Returns count killed."""
pids = find_gateway_pids()
killed = 0
for pid in pids:
try:
if force:
os.kill(pid, signal.SIGKILL)
else:
os.kill(pid, signal.SIGTERM)
killed += 1
except ProcessLookupError:
# Process already gone
pass
except PermissionError:
print(f"⚠ Permission denied to kill PID {pid}")
return killed
def is_linux() -> bool:
return sys.platform.startswith('linux')
def is_macos() -> bool:
return sys.platform == 'darwin'
def is_windows() -> bool:
return sys.platform == 'win32'
# =============================================================================
# Service Configuration
# =============================================================================
SERVICE_NAME = "hermes-gateway"
SERVICE_DESCRIPTION = "Hermes Agent Gateway - Messaging Platform Integration"
def get_systemd_unit_path() -> Path:
return Path.home() / ".config" / "systemd" / "user" / f"{SERVICE_NAME}.service"
def get_launchd_plist_path() -> Path:
return Path.home() / "Library" / "LaunchAgents" / "ai.hermes.gateway.plist"
def get_python_path() -> str:
venv_python = PROJECT_ROOT / "venv" / "bin" / "python"
if venv_python.exists():
return str(venv_python)
return sys.executable
def get_hermes_cli_path() -> str:
"""Get the path to the hermes CLI."""
# Check if installed via pip
import shutil
hermes_bin = shutil.which("hermes")
if hermes_bin:
return hermes_bin
# Fallback to direct module execution
return f"{get_python_path()} -m hermes_cli.main"
# =============================================================================
# Systemd (Linux)
# =============================================================================
def generate_systemd_unit() -> str:
python_path = get_python_path()
working_dir = str(PROJECT_ROOT)
return f"""[Unit]
Description={SERVICE_DESCRIPTION}
After=network.target
[Service]
Type=simple
ExecStart={python_path} -m hermes_cli.main gateway run
WorkingDirectory={working_dir}
Restart=on-failure
RestartSec=10
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=default.target
"""
def systemd_install(force: bool = False):
unit_path = get_systemd_unit_path()
if unit_path.exists() and not force:
print(f"Service already installed at: {unit_path}")
print("Use --force to reinstall")
return
unit_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Installing systemd service to: {unit_path}")
unit_path.write_text(generate_systemd_unit())
subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
subprocess.run(["systemctl", "--user", "enable", SERVICE_NAME], check=True)
print()
print("✓ Service installed and enabled!")
print()
print("Next steps:")
print(f" hermes gateway start # Start the service")
print(f" hermes gateway status # Check status")
print(f" journalctl --user -u {SERVICE_NAME} -f # View logs")
print()
print("To enable lingering (keeps running after logout):")
print(" sudo loginctl enable-linger $USER")
def systemd_uninstall():
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=False)
subprocess.run(["systemctl", "--user", "disable", SERVICE_NAME], check=False)
unit_path = get_systemd_unit_path()
if unit_path.exists():
unit_path.unlink()
print(f"✓ Removed {unit_path}")
subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
print("✓ Service uninstalled")
def systemd_start():
subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True)
print("✓ Service started")
def systemd_stop():
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True)
print("✓ Service stopped")
def systemd_restart():
subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True)
print("✓ Service restarted")
def systemd_status(deep: bool = False):
# Check if service unit file exists
unit_path = get_systemd_unit_path()
if not unit_path.exists():
print("✗ Gateway service is not installed")
print(" Run: hermes gateway install")
return
# Show detailed status first
subprocess.run(
["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"],
capture_output=False
)
# Check if service is active
result = subprocess.run(
["systemctl", "--user", "is-active", SERVICE_NAME],
capture_output=True,
text=True
)
status = result.stdout.strip()
if status == "active":
print("✓ Gateway service is running")
else:
print("✗ Gateway service is stopped")
print(" Run: hermes gateway start")
if deep:
print()
print("Recent logs:")
subprocess.run([
"journalctl", "--user", "-u", SERVICE_NAME,
"-n", "20", "--no-pager"
])
# =============================================================================
# Launchd (macOS)
# =============================================================================
def generate_launchd_plist() -> str:
python_path = get_python_path()
working_dir = str(PROJECT_ROOT)
log_dir = Path.home() / ".hermes" / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
return f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>ai.hermes.gateway</string>
<key>ProgramArguments</key>
<array>
<string>{python_path}</string>
<string>-m</string>
<string>hermes_cli.main</string>
<string>gateway</string>
<string>run</string>
</array>
<key>WorkingDirectory</key>
<string>{working_dir}</string>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<dict>
<key>SuccessfulExit</key>
<false/>
</dict>
<key>StandardOutPath</key>
<string>{log_dir}/gateway.log</string>
<key>StandardErrorPath</key>
<string>{log_dir}/gateway.error.log</string>
</dict>
</plist>
"""
def launchd_install(force: bool = False):
plist_path = get_launchd_plist_path()
if plist_path.exists() and not force:
print(f"Service already installed at: {plist_path}")
print("Use --force to reinstall")
return
plist_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Installing launchd service to: {plist_path}")
plist_path.write_text(generate_launchd_plist())
subprocess.run(["launchctl", "load", str(plist_path)], check=True)
print()
print("✓ Service installed and loaded!")
print()
print("Next steps:")
print(" hermes gateway status # Check status")
print(" tail -f ~/.hermes/logs/gateway.log # View logs")
def launchd_uninstall():
plist_path = get_launchd_plist_path()
subprocess.run(["launchctl", "unload", str(plist_path)], check=False)
if plist_path.exists():
plist_path.unlink()
print(f"✓ Removed {plist_path}")
print("✓ Service uninstalled")
def launchd_start():
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
print("✓ Service started")
def launchd_stop():
subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True)
print("✓ Service stopped")
def launchd_restart():
launchd_stop()
launchd_start()
def launchd_status(deep: bool = False):
result = subprocess.run(
["launchctl", "list", "ai.hermes.gateway"],
capture_output=True,
text=True
)
if result.returncode == 0:
print("✓ Gateway service is loaded")
print(result.stdout)
else:
print("✗ Gateway service is not loaded")
if deep:
log_file = Path.home() / ".hermes" / "logs" / "gateway.log"
if log_file.exists():
print()
print("Recent logs:")
subprocess.run(["tail", "-20", str(log_file)])
# =============================================================================
# Gateway Runner
# =============================================================================
def run_gateway(verbose: bool = False):
"""Run the gateway in foreground."""
sys.path.insert(0, str(PROJECT_ROOT))
from gateway.run import start_gateway
print("┌─────────────────────────────────────────────────────────┐")
print("│ 🦋 Hermes Gateway Starting... │")
print("├─────────────────────────────────────────────────────────┤")
print("│ Press Ctrl+C to stop │")
print("└─────────────────────────────────────────────────────────┘")
print()
# Exit with code 1 if gateway fails to connect any platform,
# so systemd Restart=on-failure will retry on transient errors
success = asyncio.run(start_gateway())
if not success:
sys.exit(1)
# =============================================================================
# Main Command Handler
# =============================================================================
def gateway_command(args):
"""Handle gateway subcommands."""
subcmd = getattr(args, 'gateway_command', None)
# Default to run if no subcommand
if subcmd is None or subcmd == "run":
verbose = getattr(args, 'verbose', False)
run_gateway(verbose)
return
# Service management commands
if subcmd == "install":
force = getattr(args, 'force', False)
if is_linux():
systemd_install(force)
elif is_macos():
launchd_install(force)
else:
print("Service installation not supported on this platform.")
print("Run manually: hermes gateway run")
sys.exit(1)
elif subcmd == "uninstall":
if is_linux():
systemd_uninstall()
elif is_macos():
launchd_uninstall()
else:
print("Not supported on this platform.")
sys.exit(1)
elif subcmd == "start":
if is_linux():
systemd_start()
elif is_macos():
launchd_start()
else:
print("Not supported on this platform.")
sys.exit(1)
elif subcmd == "stop":
# Try service first, fall back to killing processes directly
service_available = False
if is_linux() and get_systemd_unit_path().exists():
try:
systemd_stop()
service_available = True
except subprocess.CalledProcessError:
pass # Fall through to process kill
elif is_macos() and get_launchd_plist_path().exists():
try:
launchd_stop()
service_available = True
except subprocess.CalledProcessError:
pass
if not service_available:
# Kill gateway processes directly
killed = kill_gateway_processes()
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
else:
print("✗ No gateway processes found")
elif subcmd == "restart":
# Try service first, fall back to killing and restarting
service_available = False
if is_linux() and get_systemd_unit_path().exists():
try:
systemd_restart()
service_available = True
except subprocess.CalledProcessError:
pass
elif is_macos() and get_launchd_plist_path().exists():
try:
launchd_restart()
service_available = True
except subprocess.CalledProcessError:
pass
if not service_available:
# Manual restart: kill existing processes
killed = kill_gateway_processes()
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
import time
time.sleep(2)
# Start fresh
print("Starting gateway...")
run_gateway(verbose=False)
elif subcmd == "status":
deep = getattr(args, 'deep', False)
# Check for service first
if is_linux() and get_systemd_unit_path().exists():
systemd_status(deep)
elif is_macos() and get_launchd_plist_path().exists():
launchd_status(deep)
else:
# Check for manually running processes
pids = find_gateway_pids()
if pids:
print(f"✓ Gateway is running (PID: {', '.join(map(str, pids))})")
print(" (Running manually, not as a system service)")
print()
print("To install as a service:")
print(" hermes gateway install")
else:
print("✗ Gateway is not running")
print()
print("To start:")
print(" hermes gateway # Run in foreground")
print(" hermes gateway install # Install as service")

View File

@@ -1,544 +0,0 @@
#!/usr/bin/env python3
"""
Hermes CLI - Main entry point.
Usage:
hermes # Interactive chat (default)
hermes chat # Interactive chat
hermes gateway # Run gateway in foreground
hermes gateway start # Start gateway as service
hermes gateway stop # Stop gateway service
hermes gateway status # Show gateway status
hermes gateway install # Install gateway service
hermes gateway uninstall # Uninstall gateway service
hermes setup # Interactive setup wizard
hermes status # Show status of all components
hermes cron # Manage cron jobs
hermes cron list # List cron jobs
hermes cron daemon # Run cron daemon
hermes doctor # Check configuration and dependencies
hermes version # Show version
hermes update # Update to latest version
hermes uninstall # Uninstall Hermes Agent
"""
import argparse
import os
import sys
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(PROJECT_ROOT))
# Load .env file
from dotenv import load_dotenv
env_path = PROJECT_ROOT / '.env'
if env_path.exists():
load_dotenv(dotenv_path=env_path)
from hermes_cli import __version__
def cmd_chat(args):
"""Run interactive chat CLI."""
# Import and run the CLI
from cli import main as cli_main
# Build kwargs from args
kwargs = {
"model": args.model,
"toolsets": args.toolsets,
"verbose": args.verbose,
"query": args.query,
}
# Filter out None values
kwargs = {k: v for k, v in kwargs.items() if v is not None}
cli_main(**kwargs)
def cmd_gateway(args):
"""Gateway management commands."""
from hermes_cli.gateway import gateway_command
gateway_command(args)
def cmd_setup(args):
"""Interactive setup wizard."""
from hermes_cli.setup import run_setup_wizard
run_setup_wizard(args)
def cmd_status(args):
"""Show status of all components."""
from hermes_cli.status import show_status
show_status(args)
def cmd_cron(args):
"""Cron job management."""
from hermes_cli.cron import cron_command
cron_command(args)
def cmd_doctor(args):
"""Check configuration and dependencies."""
from hermes_cli.doctor import run_doctor
run_doctor(args)
def cmd_config(args):
"""Configuration management."""
from hermes_cli.config import config_command
config_command(args)
def cmd_version(args):
"""Show version."""
print(f"Hermes Agent v{__version__}")
print(f"Project: {PROJECT_ROOT}")
# Show Python version
print(f"Python: {sys.version.split()[0]}")
# Check for key dependencies
try:
import openai
print(f"OpenAI SDK: {openai.__version__}")
except ImportError:
print("OpenAI SDK: Not installed")
def cmd_uninstall(args):
"""Uninstall Hermes Agent."""
from hermes_cli.uninstall import run_uninstall
run_uninstall(args)
def cmd_update(args):
"""Update Hermes Agent to the latest version."""
import subprocess
import shutil
print("🦋 Updating Hermes Agent...")
print()
# Check if we're in a git repo
git_dir = PROJECT_ROOT / '.git'
if not git_dir.exists():
print("✗ Not a git repository. Please reinstall:")
print(" curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash")
sys.exit(1)
# Fetch and pull
try:
print("→ Fetching updates...")
subprocess.run(["git", "fetch", "origin"], cwd=PROJECT_ROOT, check=True)
# Get current branch
result = subprocess.run(
["git", "rev-parse", "--abbrev-ref", "HEAD"],
cwd=PROJECT_ROOT,
capture_output=True,
text=True,
check=True
)
branch = result.stdout.strip()
# Check if there are updates
result = subprocess.run(
["git", "rev-list", f"HEAD..origin/{branch}", "--count"],
cwd=PROJECT_ROOT,
capture_output=True,
text=True,
check=True
)
commit_count = int(result.stdout.strip())
if commit_count == 0:
print("✓ Already up to date!")
return
print(f"→ Found {commit_count} new commit(s)")
print("→ Pulling updates...")
subprocess.run(["git", "pull", "origin", branch], cwd=PROJECT_ROOT, check=True)
# Reinstall Python dependencies (prefer uv for speed, fall back to pip)
print("→ Updating Python dependencies...")
uv_bin = shutil.which("uv")
if uv_bin:
subprocess.run(
[uv_bin, "pip", "install", "-e", ".", "--quiet"],
cwd=PROJECT_ROOT, check=True,
env={**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
)
else:
venv_pip = PROJECT_ROOT / "venv" / "bin" / "pip"
if venv_pip.exists():
subprocess.run([str(venv_pip), "install", "-e", ".", "--quiet"], cwd=PROJECT_ROOT, check=True)
else:
subprocess.run(["pip", "install", "-e", ".", "--quiet"], cwd=PROJECT_ROOT, check=True)
# Check for Node.js deps
if (PROJECT_ROOT / "package.json").exists():
import shutil
if shutil.which("npm"):
print("→ Updating Node.js dependencies...")
subprocess.run(["npm", "install", "--silent"], cwd=PROJECT_ROOT, check=False)
print()
print("✓ Code updated!")
# Check for config migrations
print()
print("→ Checking configuration for new options...")
from hermes_cli.config import (
get_missing_env_vars, get_missing_config_fields,
check_config_version, migrate_config
)
missing_env = get_missing_env_vars(required_only=True)
missing_config = get_missing_config_fields()
current_ver, latest_ver = check_config_version()
needs_migration = missing_env or missing_config or current_ver < latest_ver
if needs_migration:
print()
if missing_env:
print(f" ⚠️ {len(missing_env)} new required setting(s) need configuration")
if missing_config:
print(f" {len(missing_config)} new config option(s) available")
print()
response = input("Would you like to configure them now? [Y/n]: ").strip().lower()
if response in ('', 'y', 'yes'):
print()
results = migrate_config(interactive=True, quiet=False)
if results["env_added"] or results["config_added"]:
print()
print("✓ Configuration updated!")
else:
print()
print("Skipped. Run 'hermes config migrate' later to configure.")
else:
print(" ✓ Configuration is up to date")
print()
print("✓ Update complete!")
print()
print("Note: If you have the gateway service running, restart it:")
print(" hermes gateway restart")
except subprocess.CalledProcessError as e:
print(f"✗ Update failed: {e}")
sys.exit(1)
def main():
"""Main entry point for hermes CLI."""
parser = argparse.ArgumentParser(
prog="hermes",
description="Hermes Agent - AI assistant with tool-calling capabilities",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
hermes Start interactive chat
hermes chat -q "Hello" Single query mode
hermes setup Run setup wizard
hermes config View configuration
hermes config edit Edit config in $EDITOR
hermes config set model gpt-4 Set a config value
hermes gateway Run messaging gateway
hermes gateway install Install as system service
hermes update Update to latest version
For more help on a command:
hermes <command> --help
"""
)
parser.add_argument(
"--version", "-V",
action="store_true",
help="Show version and exit"
)
subparsers = parser.add_subparsers(dest="command", help="Command to run")
# =========================================================================
# chat command
# =========================================================================
chat_parser = subparsers.add_parser(
"chat",
help="Interactive chat with the agent",
description="Start an interactive chat session with Hermes Agent"
)
chat_parser.add_argument(
"-q", "--query",
help="Single query (non-interactive mode)"
)
chat_parser.add_argument(
"-m", "--model",
help="Model to use (e.g., anthropic/claude-sonnet-4)"
)
chat_parser.add_argument(
"-t", "--toolsets",
help="Comma-separated toolsets to enable"
)
chat_parser.add_argument(
"-v", "--verbose",
action="store_true",
help="Verbose output"
)
chat_parser.set_defaults(func=cmd_chat)
# =========================================================================
# gateway command
# =========================================================================
gateway_parser = subparsers.add_parser(
"gateway",
help="Messaging gateway management",
description="Manage the messaging gateway (Telegram, Discord, WhatsApp)"
)
gateway_subparsers = gateway_parser.add_subparsers(dest="gateway_command")
# gateway run (default)
gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground")
gateway_run.add_argument("-v", "--verbose", action="store_true")
# gateway start
gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")
# gateway stop
gateway_stop = gateway_subparsers.add_parser("stop", help="Stop gateway service")
# gateway restart
gateway_restart = gateway_subparsers.add_parser("restart", help="Restart gateway service")
# gateway status
gateway_status = gateway_subparsers.add_parser("status", help="Show gateway status")
gateway_status.add_argument("--deep", action="store_true", help="Deep status check")
# gateway install
gateway_install = gateway_subparsers.add_parser("install", help="Install gateway as service")
gateway_install.add_argument("--force", action="store_true", help="Force reinstall")
# gateway uninstall
gateway_uninstall = gateway_subparsers.add_parser("uninstall", help="Uninstall gateway service")
gateway_parser.set_defaults(func=cmd_gateway)
# =========================================================================
# setup command
# =========================================================================
setup_parser = subparsers.add_parser(
"setup",
help="Interactive setup wizard",
description="Configure Hermes Agent with an interactive wizard"
)
setup_parser.add_argument(
"--non-interactive",
action="store_true",
help="Non-interactive mode (use defaults/env vars)"
)
setup_parser.add_argument(
"--reset",
action="store_true",
help="Reset configuration to defaults"
)
setup_parser.set_defaults(func=cmd_setup)
# =========================================================================
# status command
# =========================================================================
status_parser = subparsers.add_parser(
"status",
help="Show status of all components",
description="Display status of Hermes Agent components"
)
status_parser.add_argument(
"--all",
action="store_true",
help="Show all details (redacted for sharing)"
)
status_parser.add_argument(
"--deep",
action="store_true",
help="Run deep checks (may take longer)"
)
status_parser.set_defaults(func=cmd_status)
# =========================================================================
# cron command
# =========================================================================
cron_parser = subparsers.add_parser(
"cron",
help="Cron job management",
description="Manage scheduled tasks"
)
cron_subparsers = cron_parser.add_subparsers(dest="cron_command")
# cron list
cron_list = cron_subparsers.add_parser("list", help="List scheduled jobs")
cron_list.add_argument("--all", action="store_true", help="Include disabled jobs")
# cron daemon
cron_daemon = cron_subparsers.add_parser("daemon", help="Run cron daemon")
cron_daemon.add_argument("--interval", type=int, default=60, help="Check interval in seconds")
# cron tick
cron_tick = cron_subparsers.add_parser("tick", help="Run due jobs once (for system cron)")
cron_parser.set_defaults(func=cmd_cron)
# =========================================================================
# doctor command
# =========================================================================
doctor_parser = subparsers.add_parser(
"doctor",
help="Check configuration and dependencies",
description="Diagnose issues with Hermes Agent setup"
)
doctor_parser.add_argument(
"--fix",
action="store_true",
help="Attempt to fix issues automatically"
)
doctor_parser.set_defaults(func=cmd_doctor)
# =========================================================================
# config command
# =========================================================================
config_parser = subparsers.add_parser(
"config",
help="View and edit configuration",
description="Manage Hermes Agent configuration"
)
config_subparsers = config_parser.add_subparsers(dest="config_command")
# config show (default)
config_show = config_subparsers.add_parser("show", help="Show current configuration")
# config edit
config_edit = config_subparsers.add_parser("edit", help="Open config file in editor")
# config set
config_set = config_subparsers.add_parser("set", help="Set a configuration value")
config_set.add_argument("key", nargs="?", help="Configuration key (e.g., model, terminal.backend)")
config_set.add_argument("value", nargs="?", help="Value to set")
# config path
config_path = config_subparsers.add_parser("path", help="Print config file path")
# config env-path
config_env = config_subparsers.add_parser("env-path", help="Print .env file path")
# config check
config_check = config_subparsers.add_parser("check", help="Check for missing/outdated config")
# config migrate
config_migrate = config_subparsers.add_parser("migrate", help="Update config with new options")
config_parser.set_defaults(func=cmd_config)
# =========================================================================
# pairing command
# =========================================================================
pairing_parser = subparsers.add_parser(
"pairing",
help="Manage DM pairing codes for user authorization",
description="Approve or revoke user access via pairing codes"
)
pairing_sub = pairing_parser.add_subparsers(dest="pairing_action")
pairing_list_parser = pairing_sub.add_parser("list", help="Show pending + approved users")
pairing_approve_parser = pairing_sub.add_parser("approve", help="Approve a pairing code")
pairing_approve_parser.add_argument("platform", help="Platform name (telegram, discord, slack, whatsapp)")
pairing_approve_parser.add_argument("code", help="Pairing code to approve")
pairing_revoke_parser = pairing_sub.add_parser("revoke", help="Revoke user access")
pairing_revoke_parser.add_argument("platform", help="Platform name")
pairing_revoke_parser.add_argument("user_id", help="User ID to revoke")
pairing_clear_parser = pairing_sub.add_parser("clear-pending", help="Clear all pending codes")
def cmd_pairing(args):
from hermes_cli.pairing import pairing_command
pairing_command(args)
pairing_parser.set_defaults(func=cmd_pairing)
# =========================================================================
# version command
# =========================================================================
version_parser = subparsers.add_parser(
"version",
help="Show version information"
)
version_parser.set_defaults(func=cmd_version)
# =========================================================================
# update command
# =========================================================================
update_parser = subparsers.add_parser(
"update",
help="Update Hermes Agent to the latest version",
description="Pull the latest changes from git and reinstall dependencies"
)
update_parser.set_defaults(func=cmd_update)
# =========================================================================
# uninstall command
# =========================================================================
uninstall_parser = subparsers.add_parser(
"uninstall",
help="Uninstall Hermes Agent",
description="Remove Hermes Agent from your system. Can keep configs/data for reinstall."
)
uninstall_parser.add_argument(
"--full",
action="store_true",
help="Full uninstall - remove everything including configs and data"
)
uninstall_parser.add_argument(
"--yes", "-y",
action="store_true",
help="Skip confirmation prompts"
)
uninstall_parser.set_defaults(func=cmd_uninstall)
# =========================================================================
# Parse and execute
# =========================================================================
args = parser.parse_args()
# Handle --version flag
if args.version:
cmd_version(args)
return
# Default to chat if no command specified
if args.command is None:
# No command = run chat
args.query = None
args.model = None
args.toolsets = None
args.verbose = False
cmd_chat(args)
return
# Execute the command
if hasattr(args, 'func'):
args.func(args)
else:
parser.print_help()
if __name__ == "__main__":
main()

View File

@@ -1,100 +0,0 @@
"""
CLI commands for the DM pairing system.
Usage:
hermes pairing list # Show all pending + approved users
hermes pairing approve <platform> <code> # Approve a pairing code
hermes pairing revoke <platform> <user_id> # Revoke user access
hermes pairing clear-pending # Clear all expired/pending codes
"""
import sys
def pairing_command(args):
"""Handle hermes pairing subcommands."""
from gateway.pairing import PairingStore
store = PairingStore()
action = getattr(args, "pairing_action", None)
if action == "list":
_cmd_list(store)
elif action == "approve":
_cmd_approve(store, args.platform, args.code)
elif action == "revoke":
_cmd_revoke(store, args.platform, args.user_id)
elif action == "clear-pending":
_cmd_clear_pending(store)
else:
print("Usage: hermes pairing {list|approve|revoke|clear-pending}")
print("Run 'hermes pairing --help' for details.")
def _cmd_list(store):
"""List all pending and approved users."""
pending = store.list_pending()
approved = store.list_approved()
if not pending and not approved:
print("No pairing data found. No one has tried to pair yet~")
return
if pending:
print(f"\n Pending Pairing Requests ({len(pending)}):")
print(f" {'Platform':<12} {'Code':<10} {'User ID':<20} {'Name':<20} {'Age'}")
print(f" {'--------':<12} {'----':<10} {'-------':<20} {'----':<20} {'---'}")
for p in pending:
print(
f" {p['platform']:<12} {p['code']:<10} {p['user_id']:<20} "
f"{p.get('user_name', ''):<20} {p['age_minutes']}m ago"
)
else:
print("\n No pending pairing requests.")
if approved:
print(f"\n Approved Users ({len(approved)}):")
print(f" {'Platform':<12} {'User ID':<20} {'Name':<20}")
print(f" {'--------':<12} {'-------':<20} {'----':<20}")
for a in approved:
print(f" {a['platform']:<12} {a['user_id']:<20} {a.get('user_name', ''):<20}")
else:
print("\n No approved users.")
print()
def _cmd_approve(store, platform: str, code: str):
"""Approve a pairing code."""
platform = platform.lower().strip()
code = code.upper().strip()
result = store.approve_code(platform, code)
if result:
uid = result["user_id"]
name = result.get("user_name", "")
display = f"{name} ({uid})" if name else uid
print(f"\n Approved! User {display} on {platform} can now use the bot~")
print(f" They'll be recognized automatically on their next message.\n")
else:
print(f"\n Code '{code}' not found or expired for platform '{platform}'.")
print(f" Run 'hermes pairing list' to see pending codes.\n")
def _cmd_revoke(store, platform: str, user_id: str):
"""Revoke a user's access."""
platform = platform.lower().strip()
if store.revoke(platform, user_id):
print(f"\n Revoked access for user {user_id} on {platform}.\n")
else:
print(f"\n User {user_id} not found in approved list for {platform}.\n")
def _cmd_clear_pending(store):
"""Clear all pending pairing codes."""
count = store.clear_pending()
if count:
print(f"\n Cleared {count} pending pairing request(s).\n")
else:
print("\n No pending requests to clear.\n")

File diff suppressed because it is too large Load Diff

View File

@@ -1,251 +0,0 @@
"""
Status command for hermes CLI.
Shows the status of all Hermes Agent components.
"""
import os
import sys
import subprocess
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
# ANSI colors
class Colors:
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
def color(text: str, *codes) -> str:
if not sys.stdout.isatty():
return text
return "".join(codes) + text + Colors.RESET
def check_mark(ok: bool) -> str:
if ok:
return color("", Colors.GREEN)
return color("", Colors.RED)
def redact_key(key: str) -> str:
"""Redact an API key for display."""
if not key:
return "(not set)"
if len(key) < 12:
return "***"
return key[:4] + "..." + key[-4:]
def show_status(args):
"""Show status of all Hermes Agent components."""
show_all = getattr(args, 'all', False)
deep = getattr(args, 'deep', False)
print()
print(color("┌─────────────────────────────────────────────────────────┐", Colors.CYAN))
print(color("│ 🦋 Hermes Agent Status │", Colors.CYAN))
print(color("└─────────────────────────────────────────────────────────┘", Colors.CYAN))
# =========================================================================
# Environment
# =========================================================================
print()
print(color("◆ Environment", Colors.CYAN, Colors.BOLD))
print(f" Project: {PROJECT_ROOT}")
print(f" Python: {sys.version.split()[0]}")
env_path = PROJECT_ROOT / '.env'
print(f" .env file: {check_mark(env_path.exists())} {'exists' if env_path.exists() else 'not found'}")
# =========================================================================
# API Keys
# =========================================================================
print()
print(color("◆ API Keys", Colors.CYAN, Colors.BOLD))
keys = {
"OpenRouter": "OPENROUTER_API_KEY",
"Anthropic": "ANTHROPIC_API_KEY",
"OpenAI": "OPENAI_API_KEY",
"Firecrawl": "FIRECRAWL_API_KEY",
"Browserbase": "BROWSERBASE_API_KEY",
"FAL": "FAL_KEY",
"Tinker": "TINKER_API_KEY",
"WandB": "WANDB_API_KEY",
"ElevenLabs": "ELEVENLABS_API_KEY",
}
for name, env_var in keys.items():
value = os.getenv(env_var, "")
has_key = bool(value)
display = redact_key(value) if not show_all else value
print(f" {name:<12} {check_mark(has_key)} {display}")
# =========================================================================
# Terminal Configuration
# =========================================================================
print()
print(color("◆ Terminal Backend", Colors.CYAN, Colors.BOLD))
terminal_env = os.getenv("TERMINAL_ENV", "")
if not terminal_env:
# Fall back to config file value when env var isn't set
# (hermes status doesn't go through cli.py's config loading)
try:
from hermes_cli.config import load_config
_cfg = load_config()
terminal_env = _cfg.get("terminal", {}).get("backend", "local")
except Exception:
terminal_env = "local"
print(f" Backend: {terminal_env}")
if terminal_env == "ssh":
ssh_host = os.getenv("TERMINAL_SSH_HOST", "")
ssh_user = os.getenv("TERMINAL_SSH_USER", "")
print(f" SSH Host: {ssh_host or '(not set)'}")
print(f" SSH User: {ssh_user or '(not set)'}")
elif terminal_env == "docker":
docker_image = os.getenv("TERMINAL_DOCKER_IMAGE", "python:3.11-slim")
print(f" Docker Image: {docker_image}")
sudo_password = os.getenv("SUDO_PASSWORD", "")
print(f" Sudo: {check_mark(bool(sudo_password))} {'enabled' if sudo_password else 'disabled'}")
# =========================================================================
# Messaging Platforms
# =========================================================================
print()
print(color("◆ Messaging Platforms", Colors.CYAN, Colors.BOLD))
platforms = {
"Telegram": ("TELEGRAM_BOT_TOKEN", "TELEGRAM_HOME_CHANNEL"),
"Discord": ("DISCORD_BOT_TOKEN", "DISCORD_HOME_CHANNEL"),
"WhatsApp": ("WHATSAPP_ENABLED", None),
}
for name, (token_var, home_var) in platforms.items():
token = os.getenv(token_var, "")
has_token = bool(token)
home_channel = ""
if home_var:
home_channel = os.getenv(home_var, "")
status = "configured" if has_token else "not configured"
if home_channel:
status += f" (home: {home_channel})"
print(f" {name:<12} {check_mark(has_token)} {status}")
# =========================================================================
# Gateway Status
# =========================================================================
print()
print(color("◆ Gateway Service", Colors.CYAN, Colors.BOLD))
if sys.platform.startswith('linux'):
result = subprocess.run(
["systemctl", "--user", "is-active", "hermes-gateway"],
capture_output=True,
text=True
)
is_active = result.stdout.strip() == "active"
print(f" Status: {check_mark(is_active)} {'running' if is_active else 'stopped'}")
print(f" Manager: systemd (user)")
elif sys.platform == 'darwin':
result = subprocess.run(
["launchctl", "list", "ai.hermes.gateway"],
capture_output=True,
text=True
)
is_loaded = result.returncode == 0
print(f" Status: {check_mark(is_loaded)} {'loaded' if is_loaded else 'not loaded'}")
print(f" Manager: launchd")
else:
print(f" Status: {color('N/A', Colors.DIM)}")
print(f" Manager: (not supported on this platform)")
# =========================================================================
# Cron Jobs
# =========================================================================
print()
print(color("◆ Scheduled Jobs", Colors.CYAN, Colors.BOLD))
jobs_file = Path.home() / ".hermes" / "cron" / "jobs.json"
if jobs_file.exists():
import json
try:
with open(jobs_file) as f:
data = json.load(f)
jobs = data.get("jobs", [])
enabled_jobs = [j for j in jobs if j.get("enabled", True)]
print(f" Jobs: {len(enabled_jobs)} active, {len(jobs)} total")
except:
print(f" Jobs: (error reading jobs file)")
else:
print(f" Jobs: 0")
# =========================================================================
# Sessions
# =========================================================================
print()
print(color("◆ Sessions", Colors.CYAN, Colors.BOLD))
sessions_file = Path.home() / ".hermes" / "sessions" / "sessions.json"
if sessions_file.exists():
import json
try:
with open(sessions_file) as f:
data = json.load(f)
print(f" Active: {len(data)} session(s)")
except:
print(f" Active: (error reading sessions file)")
else:
print(f" Active: 0")
# =========================================================================
# Deep checks
# =========================================================================
if deep:
print()
print(color("◆ Deep Checks", Colors.CYAN, Colors.BOLD))
# Check OpenRouter connectivity
openrouter_key = os.getenv("OPENROUTER_API_KEY", "")
if openrouter_key:
try:
import httpx
response = httpx.get(
"https://openrouter.ai/api/v1/models",
headers={"Authorization": f"Bearer {openrouter_key}"},
timeout=10
)
ok = response.status_code == 200
print(f" OpenRouter: {check_mark(ok)} {'reachable' if ok else f'error ({response.status_code})'}")
except Exception as e:
print(f" OpenRouter: {check_mark(False)} error: {e}")
# Check gateway port
try:
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(1)
result = sock.connect_ex(('127.0.0.1', 18789))
sock.close()
# Port in use = gateway likely running
port_in_use = result == 0
# This is informational, not necessarily bad
print(f" Port 18789: {'in use' if port_in_use else 'available'}")
except:
pass
print()
print(color("" * 60, Colors.DIM))
print(color(" Run 'hermes doctor' for detailed diagnostics", Colors.DIM))
print(color(" Run 'hermes setup' to configure", Colors.DIM))
print()

View File

@@ -1,341 +0,0 @@
"""
Hermes Agent Uninstaller.
Provides options for:
- Full uninstall: Remove everything including configs and data
- Keep data: Remove code but keep ~/.hermes/ (configs, sessions, logs)
"""
import os
import sys
import shutil
import subprocess
from pathlib import Path
from typing import Optional
# ANSI colors
class Colors:
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
BLUE = "\033[34m"
MAGENTA = "\033[35m"
CYAN = "\033[36m"
def color(text: str, *codes) -> str:
"""Apply color codes to text (only in TTY)."""
if not sys.stdout.isatty():
return text
return "".join(codes) + text + Colors.RESET
def log_info(msg: str):
print(f"{color('', Colors.CYAN)} {msg}")
def log_success(msg: str):
print(f"{color('', Colors.GREEN)} {msg}")
def log_warn(msg: str):
print(f"{color('', Colors.YELLOW)} {msg}")
def log_error(msg: str):
print(f"{color('', Colors.RED)} {msg}")
def get_project_root() -> Path:
"""Get the project installation directory."""
return Path(__file__).parent.parent.resolve()
def get_hermes_home() -> Path:
"""Get the Hermes home directory (~/.hermes)."""
return Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
def find_shell_configs() -> list:
"""Find shell configuration files that might have PATH entries."""
home = Path.home()
configs = []
candidates = [
home / ".bashrc",
home / ".bash_profile",
home / ".profile",
home / ".zshrc",
home / ".zprofile",
]
for config in candidates:
if config.exists():
configs.append(config)
return configs
def remove_path_from_shell_configs():
"""Remove Hermes PATH entries from shell configuration files."""
configs = find_shell_configs()
removed_from = []
for config_path in configs:
try:
content = config_path.read_text()
original_content = content
# Remove lines containing hermes-agent or hermes PATH entries
new_lines = []
skip_next = False
for line in content.split('\n'):
# Skip the "# Hermes Agent" comment and following line
if '# Hermes Agent' in line or '# hermes-agent' in line:
skip_next = True
continue
if skip_next and ('hermes' in line.lower() and 'PATH' in line):
skip_next = False
continue
skip_next = False
# Remove any PATH line containing hermes
if 'hermes' in line.lower() and ('PATH=' in line or 'path=' in line.lower()):
continue
new_lines.append(line)
new_content = '\n'.join(new_lines)
# Clean up multiple blank lines
while '\n\n\n' in new_content:
new_content = new_content.replace('\n\n\n', '\n\n')
if new_content != original_content:
config_path.write_text(new_content)
removed_from.append(config_path)
except Exception as e:
log_warn(f"Could not update {config_path}: {e}")
return removed_from
def remove_wrapper_script():
"""Remove the hermes wrapper script if it exists."""
wrapper_paths = [
Path.home() / ".local" / "bin" / "hermes",
Path("/usr/local/bin/hermes"),
]
removed = []
for wrapper in wrapper_paths:
if wrapper.exists():
try:
# Check if it's our wrapper (contains hermes_cli reference)
content = wrapper.read_text()
if 'hermes_cli' in content or 'hermes-agent' in content:
wrapper.unlink()
removed.append(wrapper)
except Exception as e:
log_warn(f"Could not remove {wrapper}: {e}")
return removed
def uninstall_gateway_service():
"""Stop and uninstall the gateway service if running."""
import platform
if platform.system() != "Linux":
return False
service_file = Path.home() / ".config" / "systemd" / "user" / "hermes-gateway.service"
if not service_file.exists():
return False
try:
# Stop the service
subprocess.run(
["systemctl", "--user", "stop", "hermes-gateway"],
capture_output=True,
check=False
)
# Disable the service
subprocess.run(
["systemctl", "--user", "disable", "hermes-gateway"],
capture_output=True,
check=False
)
# Remove service file
service_file.unlink()
# Reload systemd
subprocess.run(
["systemctl", "--user", "daemon-reload"],
capture_output=True,
check=False
)
return True
except Exception as e:
log_warn(f"Could not fully remove gateway service: {e}")
return False
def run_uninstall(args):
"""
Run the uninstall process.
Options:
- Full uninstall: removes code + ~/.hermes/ (configs, data, logs)
- Keep data: removes code but keeps ~/.hermes/ for future reinstall
"""
project_root = get_project_root()
hermes_home = get_hermes_home()
print()
print(color("┌─────────────────────────────────────────────────────────┐", Colors.MAGENTA, Colors.BOLD))
print(color("│ 🦋 Hermes Agent Uninstaller │", Colors.MAGENTA, Colors.BOLD))
print(color("└─────────────────────────────────────────────────────────┘", Colors.MAGENTA, Colors.BOLD))
print()
# Show what will be affected
print(color("Current Installation:", Colors.CYAN, Colors.BOLD))
print(f" Code: {project_root}")
print(f" Config: {hermes_home / 'config.yaml'}")
print(f" Secrets: {hermes_home / '.env'}")
print(f" Data: {hermes_home / 'cron/'}, {hermes_home / 'sessions/'}, {hermes_home / 'logs/'}")
print()
# Ask for confirmation
print(color("Uninstall Options:", Colors.YELLOW, Colors.BOLD))
print()
print(" 1) " + color("Keep data", Colors.GREEN) + " - Remove code only, keep configs/sessions/logs")
print(" (Recommended - you can reinstall later with your settings intact)")
print()
print(" 2) " + color("Full uninstall", Colors.RED) + " - Remove everything including all data")
print(" (Warning: This deletes all configs, sessions, and logs permanently)")
print()
print(" 3) " + color("Cancel", Colors.CYAN) + " - Don't uninstall")
print()
try:
choice = input(color("Select option [1/2/3]: ", Colors.BOLD)).strip()
except (KeyboardInterrupt, EOFError):
print()
print("Cancelled.")
return
if choice == "3" or choice.lower() in ("c", "cancel", "q", "quit", "n", "no"):
print()
print("Uninstall cancelled.")
return
full_uninstall = (choice == "2")
# Final confirmation
print()
if full_uninstall:
print(color("⚠️ WARNING: This will permanently delete ALL Hermes data!", Colors.RED, Colors.BOLD))
print(color(" Including: configs, API keys, sessions, scheduled jobs, logs", Colors.RED))
else:
print("This will remove the Hermes code but keep your configuration and data.")
print()
try:
confirm = input(f"Type '{color('yes', Colors.YELLOW)}' to confirm: ").strip().lower()
except (KeyboardInterrupt, EOFError):
print()
print("Cancelled.")
return
if confirm != "yes":
print()
print("Uninstall cancelled.")
return
print()
print(color("Uninstalling...", Colors.CYAN, Colors.BOLD))
print()
# 1. Stop and uninstall gateway service
log_info("Checking for gateway service...")
if uninstall_gateway_service():
log_success("Gateway service stopped and removed")
else:
log_info("No gateway service found")
# 2. Remove PATH entries from shell configs
log_info("Removing PATH entries from shell configs...")
removed_configs = remove_path_from_shell_configs()
if removed_configs:
for config in removed_configs:
log_success(f"Updated {config}")
else:
log_info("No PATH entries found to remove")
# 3. Remove wrapper script
log_info("Removing hermes command...")
removed_wrappers = remove_wrapper_script()
if removed_wrappers:
for wrapper in removed_wrappers:
log_success(f"Removed {wrapper}")
else:
log_info("No wrapper script found")
# 4. Remove installation directory (code)
log_info(f"Removing installation directory...")
# Check if we're running from within the install dir
# We need to be careful here
try:
if project_root.exists():
# If the install is inside ~/.hermes/, just remove the hermes-agent subdir
if hermes_home in project_root.parents or project_root.parent == hermes_home:
shutil.rmtree(project_root)
log_success(f"Removed {project_root}")
else:
# Installation is somewhere else entirely
shutil.rmtree(project_root)
log_success(f"Removed {project_root}")
except Exception as e:
log_warn(f"Could not fully remove {project_root}: {e}")
log_info("You may need to manually remove it")
# 5. Optionally remove ~/.hermes/ data directory
if full_uninstall:
log_info("Removing configuration and data...")
try:
if hermes_home.exists():
shutil.rmtree(hermes_home)
log_success(f"Removed {hermes_home}")
except Exception as e:
log_warn(f"Could not fully remove {hermes_home}: {e}")
log_info("You may need to manually remove it")
else:
log_info(f"Keeping configuration and data in {hermes_home}")
# Done
print()
print(color("┌─────────────────────────────────────────────────────────┐", Colors.GREEN, Colors.BOLD))
print(color("│ ✓ Uninstall Complete! │", Colors.GREEN, Colors.BOLD))
print(color("└─────────────────────────────────────────────────────────┘", Colors.GREEN, Colors.BOLD))
print()
if not full_uninstall:
print(color("Your configuration and data have been preserved:", Colors.CYAN))
print(f" {hermes_home}/")
print()
print("To reinstall later with your existing settings:")
print(color(" curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash", Colors.DIM))
print()
print(color("Reload your shell to complete the process:", Colors.YELLOW))
print(" source ~/.bashrc # or ~/.zshrc")
print()
print("Thank you for using Hermes Agent! 🦋")
print()

View File

@@ -2,14 +2,14 @@
"""
Image Generation Tools Module
This module provides image generation tools using FAL.ai's FLUX 2 Pro model with
This module provides image generation tools using FAL.ai's FLUX.1 Krea model with
automatic upscaling via FAL.ai's Clarity Upscaler for enhanced image quality.
Available tools:
- image_generate_tool: Generate images from text prompts with automatic upscaling
Features:
- High-quality image generation using FLUX 2 Pro model
- High-quality image generation using FLUX.1 Krea model
- Automatic 2x upscaling using Clarity Upscaler for enhanced quality
- Comprehensive parameter control (size, steps, guidance, etc.)
- Proper error handling and validation with fallback to original images
@@ -38,25 +38,13 @@ from typing import Dict, Any, Optional, Union
import fal_client
# Configuration for image generation
DEFAULT_MODEL = "fal-ai/flux-2-pro"
DEFAULT_ASPECT_RATIO = "landscape"
DEFAULT_MODEL = "fal-ai/flux/krea"
DEFAULT_IMAGE_SIZE = "landscape_4_3"
DEFAULT_NUM_INFERENCE_STEPS = 50
DEFAULT_GUIDANCE_SCALE = 4.5
DEFAULT_NUM_IMAGES = 1
DEFAULT_OUTPUT_FORMAT = "png"
# Safety settings
ENABLE_SAFETY_CHECKER = False
SAFETY_TOLERANCE = "5" # Maximum tolerance (1-5, where 5 is most permissive)
# Aspect ratio mapping - simplified choices for model to select
ASPECT_RATIO_MAP = {
"landscape": "landscape_16_9",
"square": "square_hd",
"portrait": "portrait_16_9"
}
VALID_ASPECT_RATIOS = list(ASPECT_RATIO_MAP.keys())
# Configuration for automatic upscaling
UPSCALER_MODEL = "fal-ai/clarity-upscaler"
UPSCALER_FACTOR = 2
@@ -68,7 +56,7 @@ UPSCALER_RESEMBLANCE = 0.6
UPSCALER_GUIDANCE_SCALE = 4
UPSCALER_NUM_INFERENCE_STEPS = 18
# Valid parameter values for validation based on FLUX 2 Pro documentation
# Valid parameter values for validation based on FLUX Krea documentation
VALID_IMAGE_SIZES = [
"square_hd", "square", "portrait_4_3", "portrait_16_9", "landscape_4_3", "landscape_16_9"
]
@@ -145,7 +133,7 @@ def _validate_parameters(
acceleration: str = "none"
) -> Dict[str, Any]:
"""
Validate and normalize image generation parameters for FLUX 2 Pro model.
Validate and normalize image generation parameters for FLUX Krea model.
Args:
image_size: Either a preset string or custom size dict
@@ -186,7 +174,7 @@ def _validate_parameters(
raise ValueError("num_inference_steps must be an integer between 1 and 100")
validated["num_inference_steps"] = num_inference_steps
# Validate guidance_scale (FLUX 2 Pro default is 4.5)
# Validate guidance_scale (FLUX Krea default is 4.5)
if not isinstance(guidance_scale, (int, float)) or guidance_scale < 0.1 or guidance_scale > 20.0:
raise ValueError("guidance_scale must be a number between 0.1 and 20.0")
validated["guidance_scale"] = float(guidance_scale)
@@ -266,28 +254,34 @@ async def _upscale_image(image_url: str, original_prompt: str) -> Dict[str, Any]
async def image_generate_tool(
prompt: str,
aspect_ratio: str = DEFAULT_ASPECT_RATIO,
image_size: Union[str, Dict[str, int]] = DEFAULT_IMAGE_SIZE,
num_inference_steps: int = DEFAULT_NUM_INFERENCE_STEPS,
guidance_scale: float = DEFAULT_GUIDANCE_SCALE,
num_images: int = DEFAULT_NUM_IMAGES,
enable_safety_checker: bool = True,
output_format: str = DEFAULT_OUTPUT_FORMAT,
acceleration: str = "none",
allow_nsfw_images: bool = True,
seed: Optional[int] = None
) -> str:
"""
Generate images from text prompts using FAL.ai's FLUX 2 Pro model with automatic upscaling.
Generate images from text prompts using FAL.ai's FLUX.1 Krea model with automatic upscaling.
This tool uses FAL.ai's FLUX 2 Pro model for high-quality text-to-image generation
This tool uses FAL.ai's FLUX.1 Krea model for high-quality text-to-image generation
with extensive customization options. Generated images are automatically upscaled 2x
using FAL.ai's Clarity Upscaler for enhanced quality. The final upscaled images are
returned as URLs that can be displayed using <img src="{URL}"></img> tags.
Args:
prompt (str): The text prompt describing the desired image
aspect_ratio (str): Image aspect ratio - "landscape", "square", or "portrait" (default: "landscape")
num_inference_steps (int): Number of denoising steps (1-50, default: 50)
image_size (Union[str, Dict[str, int]]): Preset size or custom {"width": int, "height": int}
num_inference_steps (int): Number of denoising steps (1-50, default: 28)
guidance_scale (float): How closely to follow prompt (0.1-20.0, default: 4.5)
num_images (int): Number of images to generate (1-4, default: 1)
enable_safety_checker (bool): Enable content safety filtering (default: True)
output_format (str): Image format "jpeg" or "png" (default: "png")
acceleration (str): Generation speed "none", "regular", or "high" (default: "none")
allow_nsfw_images (bool): Allow generation of NSFW content (default: True)
seed (Optional[int]): Random seed for reproducible results (optional)
Returns:
@@ -297,22 +291,17 @@ async def image_generate_tool(
"image": str or None # URL of the upscaled image, or None if failed
}
"""
# Validate and map aspect_ratio to actual image_size
aspect_ratio_lower = aspect_ratio.lower().strip() if aspect_ratio else DEFAULT_ASPECT_RATIO
if aspect_ratio_lower not in ASPECT_RATIO_MAP:
print(f"⚠️ Invalid aspect_ratio '{aspect_ratio}', defaulting to '{DEFAULT_ASPECT_RATIO}'")
aspect_ratio_lower = DEFAULT_ASPECT_RATIO
image_size = ASPECT_RATIO_MAP[aspect_ratio_lower]
debug_call_data = {
"parameters": {
"prompt": prompt,
"aspect_ratio": aspect_ratio,
"image_size": image_size,
"num_inference_steps": num_inference_steps,
"guidance_scale": guidance_scale,
"num_images": num_images,
"enable_safety_checker": enable_safety_checker,
"output_format": output_format,
"acceleration": acceleration,
"allow_nsfw_images": allow_nsfw_images,
"seed": seed
},
"error": None,
@@ -324,31 +313,35 @@ async def image_generate_tool(
start_time = datetime.datetime.now()
try:
print(f"🎨 Generating {num_images} image(s) with FLUX 2 Pro: {prompt[:80]}{'...' if len(prompt) > 80 else ''}")
print(f"🎨 Generating {num_images} image(s) with FLUX Krea: {prompt[:80]}{'...' if len(prompt) > 80 else ''}")
# Validate prompt
if not prompt or not isinstance(prompt, str) or len(prompt.strip()) == 0:
raise ValueError("Prompt is required and must be a non-empty string")
if len(prompt) > 1000:
raise ValueError("Prompt must be 1000 characters or less")
# Check API key availability
if not os.getenv("FAL_KEY"):
raise ValueError("FAL_KEY environment variable not set")
# Validate other parameters
# Validate parameters
validated_params = _validate_parameters(
image_size, num_inference_steps, guidance_scale, num_images, output_format, "none"
image_size, num_inference_steps, guidance_scale, num_images, output_format, acceleration
)
# Prepare arguments for FAL.ai FLUX 2 Pro API
# Prepare arguments for FAL.ai FLUX Krea API
arguments = {
"prompt": prompt.strip(),
"image_size": validated_params["image_size"],
"num_inference_steps": validated_params["num_inference_steps"],
"guidance_scale": validated_params["guidance_scale"],
"num_images": validated_params["num_images"],
"enable_safety_checker": enable_safety_checker,
"output_format": validated_params["output_format"],
"enable_safety_checker": ENABLE_SAFETY_CHECKER,
"safety_tolerance": SAFETY_TOLERANCE,
"acceleration": validated_params["acceleration"],
"allow_nsfw_images": allow_nsfw_images,
"sync_mode": True # Use sync mode for immediate results
}
@@ -356,11 +349,12 @@ async def image_generate_tool(
if seed is not None and isinstance(seed, int):
arguments["seed"] = seed
print(f"🚀 Submitting generation request to FAL.ai FLUX 2 Pro...")
print(f"🚀 Submitting generation request to FAL.ai FLUX Krea...")
print(f" Model: {DEFAULT_MODEL}")
print(f" Aspect Ratio: {aspect_ratio_lower}{image_size}")
print(f" Size: {validated_params['image_size']}")
print(f" Steps: {validated_params['num_inference_steps']}")
print(f" Guidance: {validated_params['guidance_scale']}")
print(f" Acceleration: {validated_params['acceleration']}")
# Submit request to FAL.ai
handler = await fal_client.submit_async(
@@ -423,7 +417,7 @@ async def image_generate_tool(
_log_debug_call("image_generate_tool", debug_call_data)
_save_debug_log()
return json.dumps(response_data, indent=2, ensure_ascii=False)
return json.dumps(response_data, indent=2)
except Exception as e:
generation_time = (datetime.datetime.now() - start_time).total_seconds()
@@ -441,7 +435,7 @@ async def image_generate_tool(
_log_debug_call("image_generate_tool", debug_call_data)
_save_debug_log()
return json.dumps(response_data, indent=2, ensure_ascii=False)
return json.dumps(response_data, indent=2)
def check_fal_api_key() -> bool:
@@ -501,7 +495,7 @@ if __name__ == "__main__":
"""
Simple test/demo when run directly
"""
print("🎨 Image Generation Tools Module - FLUX 2 Pro + Auto Upscaling")
print("🎨 Image Generation Tools Module - FLUX.1 Krea + Auto Upscaling")
print("=" * 60)
# Check if API key is available

Submodule mini-swe-agent deleted from 07aa6a7385

View File

@@ -1,708 +0,0 @@
#!/usr/bin/env python3
"""
Mini-SWE-Agent Runner with Hermes Trajectory Format
This module provides a runner that uses mini-swe-agent's execution environments
(local, docker, modal) but outputs trajectories in the Hermes-Agent format
compatible with batch_runner.py and trajectory_compressor.py.
Features:
- Uses mini-swe-agent's Docker, Modal, or Local environments for command execution
- Outputs trajectories in Hermes format (from/value pairs with <tool_call>/<tool_response> XML)
- Compatible with the trajectory compression pipeline
- Supports batch processing from JSONL prompt files
Usage:
# Run a single task with local environment
python mini_swe_runner.py --task "Create a hello world Python script" --env local
# Run with Docker
python mini_swe_runner.py --task "List files in /tmp" --env docker --image python:3.11-slim
# Run with Modal (cloud)
python mini_swe_runner.py --task "Install numpy and test it" --env modal --image python:3.11-slim
# Batch mode from JSONL file
python mini_swe_runner.py --prompts_file prompts.jsonl --output_file trajectories.jsonl --env docker
"""
import json
import logging
import os
import sys
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal
import fire
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Add mini-swe-agent to path if not installed
mini_swe_path = Path(__file__).parent / "mini-swe-agent" / "src"
if mini_swe_path.exists():
sys.path.insert(0, str(mini_swe_path))
# ============================================================================
# Terminal Tool Definition (matches Hermes-Agent format)
# ============================================================================
TERMINAL_TOOL_DEFINITION = {
"type": "function",
"function": {
"name": "terminal",
"description": """Execute bash commands in a sandboxed environment.
**Environment:**
- Isolated execution environment (local, Docker, or Modal cloud)
- Filesystem persists between tool calls within the same task
- Internet access available
**Command Execution:**
- Provide the command to execute via the 'command' parameter
- Optional 'timeout' parameter in seconds (default: 60)
**Examples:**
- Run command: `{"command": "ls -la"}`
- With timeout: `{"command": "long_task.sh", "timeout": 300}`
**Best Practices:**
- Use non-interactive commands (avoid vim, nano, interactive python)
- Pipe to cat if output might be large
- Install tools with apt-get or pip as needed
**Completion:**
- When task is complete, output: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by your result
""",
"parameters": {
"type": "object",
"properties": {
"command": {
"type": "string",
"description": "The bash command to execute"
},
"timeout": {
"type": "integer",
"description": "Command timeout in seconds (default: 60)"
}
},
"required": ["command"]
}
}
}
# ============================================================================
# Environment Factory
# ============================================================================
def create_environment(
env_type: str = "local",
image: str = "python:3.11-slim",
cwd: str = "/tmp",
timeout: int = 60,
**kwargs
):
"""
Create an execution environment from mini-swe-agent.
Args:
env_type: One of "local", "docker", "modal"
image: Docker/Modal image name (ignored for local)
cwd: Working directory
timeout: Default command timeout
**kwargs: Additional environment-specific options
Returns:
Environment instance with execute() method
"""
if env_type == "local":
from minisweagent.environments.local import LocalEnvironment
return LocalEnvironment(cwd=cwd, timeout=timeout)
elif env_type == "docker":
from minisweagent.environments.docker import DockerEnvironment
return DockerEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)
elif env_type == "modal":
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
return SwerexModalEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)
else:
raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'")
# ============================================================================
# Mini-SWE Runner with Hermes Trajectory Format
# ============================================================================
class MiniSWERunner:
"""
Agent runner that uses mini-swe-agent environments but outputs
trajectories in Hermes-Agent format.
"""
def __init__(
self,
model: str = "anthropic/claude-sonnet-4-20250514",
base_url: str = None,
api_key: str = None,
env_type: str = "local",
image: str = "python:3.11-slim",
cwd: str = "/tmp",
max_iterations: int = 15,
command_timeout: int = 60,
verbose: bool = False,
):
"""
Initialize the Mini-SWE Runner.
Args:
model: Model name for OpenAI-compatible API
base_url: API base URL (optional, uses env vars if not provided)
api_key: API key (optional, uses env vars if not provided)
env_type: Environment type - "local", "docker", or "modal"
image: Docker/Modal image (ignored for local)
cwd: Working directory for commands
max_iterations: Maximum tool-calling iterations
command_timeout: Default timeout for commands
verbose: Enable verbose logging
"""
self.model = model
self.max_iterations = max_iterations
self.command_timeout = command_timeout
self.verbose = verbose
self.env_type = env_type
self.image = image
self.cwd = cwd
# Setup logging
logging.basicConfig(
level=logging.DEBUG if verbose else logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%H:%M:%S'
)
self.logger = logging.getLogger(__name__)
# Initialize OpenAI client - defaults to OpenRouter
from openai import OpenAI
client_kwargs = {}
# Default to OpenRouter if no base_url provided
if base_url:
client_kwargs["base_url"] = base_url
else:
client_kwargs["base_url"] = "https://openrouter.ai/api/v1"
# Handle API key - OpenRouter is the primary provider
if api_key:
client_kwargs["api_key"] = api_key
else:
client_kwargs["api_key"] = os.getenv(
"OPENROUTER_API_KEY",
os.getenv("ANTHROPIC_API_KEY", os.getenv("OPENAI_API_KEY", ""))
)
self.client = OpenAI(**client_kwargs)
# Environment will be created per-task
self.env = None
# Tool definition
self.tools = [TERMINAL_TOOL_DEFINITION]
print(f"🤖 Mini-SWE Runner initialized")
print(f" Model: {self.model}")
print(f" Environment: {self.env_type}")
if self.env_type != "local":
print(f" Image: {self.image}")
print(f" Max iterations: {self.max_iterations}")
def _create_env(self):
"""Create the execution environment."""
print(f"🔧 Creating {self.env_type} environment...")
self.env = create_environment(
env_type=self.env_type,
image=self.image,
cwd=self.cwd,
timeout=self.command_timeout
)
print(f"✅ Environment ready")
def _cleanup_env(self):
"""Cleanup the execution environment."""
if self.env is not None:
if hasattr(self.env, 'cleanup'):
self.env.cleanup()
elif hasattr(self.env, 'stop'):
self.env.stop()
self.env = None
def _execute_command(self, command: str, timeout: int = None) -> Dict[str, Any]:
"""
Execute a command in the environment.
Args:
command: Bash command to execute
timeout: Optional timeout override
Returns:
Dict with 'output' and 'returncode'
"""
if self.env is None:
self._create_env()
try:
result = self.env.execute(command, timeout=timeout or self.command_timeout)
return {
"output": result.get("output", ""),
"exit_code": result.get("returncode", 0),
"error": None
}
except Exception as e:
return {
"output": "",
"exit_code": -1,
"error": str(e)
}
def _format_tools_for_system_message(self) -> str:
"""Format tool definitions for the system message."""
formatted_tools = []
for tool in self.tools:
func = tool["function"]
formatted_tools.append({
"name": func["name"],
"description": func.get("description", ""),
"parameters": func.get("parameters", {}),
"required": None
})
return json.dumps(formatted_tools, ensure_ascii=False)
def _convert_to_hermes_format(
self,
messages: List[Dict[str, Any]],
user_query: str,
completed: bool
) -> List[Dict[str, Any]]:
"""
Convert internal message format to Hermes trajectory format.
This produces the exact format used by batch_runner.py.
"""
trajectory = []
# System message with tool definitions
system_msg = (
"You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
"You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
"with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
"into functions. After calling & executing the functions, you will be provided with function results within "
"<tool_response> </tool_response> XML tags. Here are the available tools:\n"
f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
"For each function call return a JSON object, with the following pydantic model json schema for each:\n"
"{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
"'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
"Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
"Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
)
trajectory.append({"from": "system", "value": system_msg})
trajectory.append({"from": "human", "value": user_query})
# Process messages (skip first user message as we already added it)
i = 1
while i < len(messages):
msg = messages[i]
if msg["role"] == "assistant":
if "tool_calls" in msg and msg["tool_calls"]:
# Assistant message with tool calls
content = ""
# Add reasoning if present
if msg.get("reasoning"):
content = f"<think>{msg['reasoning']}</think>"
if msg.get("content"):
content += msg["content"] + "\n"
# Add tool calls in XML format
for tool_call in msg["tool_calls"]:
try:
arguments = json.loads(tool_call["function"]["arguments"]) \
if isinstance(tool_call["function"]["arguments"], str) \
else tool_call["function"]["arguments"]
except json.JSONDecodeError:
arguments = {}
tool_call_json = {
"name": tool_call["function"]["name"],
"arguments": arguments
}
content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
trajectory.append({"from": "gpt", "value": content.rstrip()})
# Collect subsequent tool responses
tool_responses = []
j = i + 1
while j < len(messages) and messages[j]["role"] == "tool":
tool_msg = messages[j]
tool_content = tool_msg["content"]
# Try to parse as JSON
try:
if tool_content.strip().startswith(("{", "[")):
tool_content = json.loads(tool_content)
except (json.JSONDecodeError, AttributeError):
pass
tool_response = f"<tool_response>\n"
tool_response += json.dumps({
"tool_call_id": tool_msg.get("tool_call_id", ""),
"name": msg["tool_calls"][len(tool_responses)]["function"]["name"] \
if len(tool_responses) < len(msg["tool_calls"]) else "unknown",
"content": tool_content
}, ensure_ascii=False)
tool_response += "\n</tool_response>"
tool_responses.append(tool_response)
j += 1
if tool_responses:
trajectory.append({"from": "tool", "value": "\n".join(tool_responses)})
i = j - 1
else:
# Regular assistant message (no tool calls)
content = ""
if msg.get("reasoning"):
content = f"<think>{msg['reasoning']}</think>"
content += msg.get("content") or ""
trajectory.append({"from": "gpt", "value": content})
elif msg["role"] == "user":
trajectory.append({"from": "human", "value": msg["content"]})
i += 1
return trajectory
def run_task(self, task: str) -> Dict[str, Any]:
"""
Run a single task and return the result with trajectory.
Args:
task: The task/prompt to execute
Returns:
Dict with trajectory, completion status, and metadata
"""
print(f"\n{'='*60}")
print(f"📝 Task: {task[:80]}{'...' if len(task) > 80 else ''}")
print(f"{'='*60}")
# Initialize environment
self._create_env()
# Message history
messages = [{"role": "user", "content": task}]
# System prompt for the LLM (ephemeral - not saved to trajectory)
system_prompt = """You are an AI agent that can execute bash commands to complete tasks.
When you need to run commands, use the 'terminal' tool with your bash command.
**Important:**
- When you have completed the task successfully, run: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by a summary
- Be concise and efficient in your approach
- Install any needed tools with apt-get or pip
- Avoid interactive commands (no vim, nano, less, etc.)
Complete the user's task step by step."""
api_call_count = 0
completed = False
final_response = None
try:
while api_call_count < self.max_iterations:
api_call_count += 1
print(f"\n🔄 API call #{api_call_count}/{self.max_iterations}")
# Prepare API messages
api_messages = [{"role": "system", "content": system_prompt}] + messages
# Make API call
try:
response = self.client.chat.completions.create(
model=self.model,
messages=api_messages,
tools=self.tools,
timeout=300.0
)
except Exception as e:
self.logger.error(f"API call failed: {e}")
break
assistant_message = response.choices[0].message
# Log assistant response
if assistant_message.content:
print(f"🤖 Assistant: {assistant_message.content[:100]}...")
# Check for tool calls
if assistant_message.tool_calls:
print(f"🔧 Tool calls: {len(assistant_message.tool_calls)}")
# Add assistant message with tool calls
messages.append({
"role": "assistant",
"content": assistant_message.content,
"tool_calls": [
{
"id": tc.id,
"type": tc.type,
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments
}
}
for tc in assistant_message.tool_calls
]
})
# Execute each tool call
for tc in assistant_message.tool_calls:
try:
args = json.loads(tc.function.arguments)
except json.JSONDecodeError:
args = {}
command = args.get("command", "echo 'No command provided'")
timeout = args.get("timeout", self.command_timeout)
print(f" 📞 terminal: {command[:60]}...")
# Execute command
result = self._execute_command(command, timeout)
# Format result
result_json = json.dumps({
"content": {
"output": result["output"],
"exit_code": result["exit_code"],
"error": result["error"]
}
}, ensure_ascii=False)
# Check for task completion signal
if "MINI_SWE_AGENT_FINAL_OUTPUT" in result["output"]:
print(f" ✅ Task completion signal detected!")
completed = True
# Add tool response
messages.append({
"role": "tool",
"content": result_json,
"tool_call_id": tc.id
})
print(f" ✅ exit_code={result['exit_code']}, output={len(result['output'])} chars")
# If task completed, we can stop
if completed:
final_response = assistant_message.content
break
else:
# No tool calls - final response
final_response = assistant_message.content or ""
messages.append({
"role": "assistant",
"content": final_response
})
completed = True
print(f"🎉 Agent finished (no more tool calls)")
break
if api_call_count >= self.max_iterations:
print(f"⚠️ Reached max iterations ({self.max_iterations})")
finally:
# Cleanup environment
self._cleanup_env()
# Convert to Hermes trajectory format
trajectory = self._convert_to_hermes_format(messages, task, completed)
return {
"conversations": trajectory,
"completed": completed,
"api_calls": api_call_count,
"metadata": {
"model": self.model,
"env_type": self.env_type,
"timestamp": datetime.now().isoformat()
}
}
def run_batch(
self,
prompts: List[str],
output_file: str
) -> List[Dict[str, Any]]:
"""
Run multiple tasks and save trajectories to a JSONL file.
Args:
prompts: List of task prompts
output_file: Output JSONL file path
Returns:
List of results
"""
results = []
print(f"\n📦 Running batch of {len(prompts)} tasks")
print(f"📁 Output: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
for i, prompt in enumerate(prompts, 1):
print(f"\n{'='*60}")
print(f"📋 Task {i}/{len(prompts)}")
print(f"{'='*60}")
try:
result = self.run_task(prompt)
results.append(result)
# Write to file immediately
f.write(json.dumps(result, ensure_ascii=False) + "\n")
f.flush()
print(f"✅ Task {i} completed (api_calls={result['api_calls']})")
except Exception as e:
self.logger.error(f"Error on task {i}: {e}")
error_result = {
"conversations": [],
"completed": False,
"api_calls": 0,
"error": str(e),
"metadata": {"timestamp": datetime.now().isoformat()}
}
results.append(error_result)
f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
f.flush()
print(f"\n✅ Batch complete! {len(results)} trajectories saved to {output_file}")
return results
# ============================================================================
# CLI Interface
# ============================================================================
def main(
task: str = None,
prompts_file: str = None,
output_file: str = "mini-swe-agent-test1.jsonl",
model: str = "claude-sonnet-4-20250514",
base_url: str = None,
api_key: str = None,
env: str = "local",
image: str = "python:3.11-slim",
cwd: str = "/tmp",
max_iterations: int = 15,
timeout: int = 60,
verbose: bool = False,
):
"""
Run mini-swe-agent tasks with Hermes trajectory format output.
Args:
task: Single task to run (use this OR prompts_file)
prompts_file: JSONL file with prompts (each line: {"prompt": "..."})
output_file: Output JSONL file for trajectories
model: Model name (default: claude-sonnet-4-20250514)
base_url: API base URL (optional)
api_key: API key (optional, uses env vars)
env: Environment type - "local", "docker", or "modal"
image: Docker/Modal image (default: python:3.11-slim)
cwd: Working directory (default: /tmp)
max_iterations: Maximum tool-calling iterations (default: 15)
timeout: Command timeout in seconds (default: 60)
verbose: Enable verbose logging
Examples:
# Single task with local environment
python mini_swe_runner.py --task "Create hello.py that prints Hello World"
# Single task with Docker
python mini_swe_runner.py --task "List files" --env docker
# Batch from file
python mini_swe_runner.py --prompts_file tasks.jsonl --output_file results.jsonl
"""
print("🚀 Mini-SWE Runner with Hermes Trajectory Format")
print("=" * 60)
# Initialize runner
runner = MiniSWERunner(
model=model,
base_url=base_url,
api_key=api_key,
env_type=env,
image=image,
cwd=cwd,
max_iterations=max_iterations,
command_timeout=timeout,
verbose=verbose,
)
if task:
# Single task mode
result = runner.run_task(task)
# Save to file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(result, ensure_ascii=False) + "\n")
print(f"\n📁 Trajectory saved to: {output_file}")
print(f"✅ Completed: {result['completed']}")
print(f"📞 API calls: {result['api_calls']}")
print(f"💬 Turns: {len(result['conversations'])}")
elif prompts_file:
# Batch mode
prompts = []
with open(prompts_file, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line:
try:
entry = json.loads(line)
prompts.append(entry.get("prompt", entry.get("task", "")))
except json.JSONDecodeError:
prompts.append(line)
if not prompts:
print(f"❌ No prompts found in {prompts_file}")
return
runner.run_batch(prompts, output_file)
else:
print("❌ Please provide either --task or --prompts_file")
print(" Example: python mini_swe_runner.py --task 'Create a hello world script'")
if __name__ == "__main__":
fire.Fire(main)

File diff suppressed because it is too large Load Diff

243
mock_web_tools.py Normal file
View File

@@ -0,0 +1,243 @@
"""
Mock Web Tools for Testing WebSocket Reconnection
This module provides mock implementations of web_search and web_extract
that simulate long-running operations without making real API calls.
Perfect for testing WebSocket timeout/reconnection behavior without:
- Wasting API credits
- Waiting for real web crawling
- Network dependencies
"""
import time
import json
from typing import List
def mock_web_search(query: str, delay: int = 2) -> str:
"""
Mock web search that returns fake results after a delay.
Args:
query: Search query (ignored, just for API compatibility)
delay: Seconds to sleep (default: 2s)
Returns:
JSON string with fake search results
"""
print(f"🔍 [MOCK] Searching for: '{query}' (will take {delay}s)...")
time.sleep(delay)
result = {
"success": True,
"data": {
"web": [
{
"url": "https://example.com/article1",
"title": "Mock Article 1 - Water Utilities",
"description": "This is a mock search result for testing purposes. Real data would appear here.",
"category": None
},
{
"url": "https://example.com/article2",
"title": "Mock Article 2 - AI Data Centers",
"description": "Another mock result. This simulates web_search without making real API calls.",
"category": None
},
{
"url": "https://example.com/article3",
"title": "Mock Article 3 - Investment Opportunities",
"description": "Third mock result for testing. Query was: " + query,
"category": None
}
]
}
}
return json.dumps(result, indent=2)
def mock_web_extract(urls: List[str], delay: int = 60) -> str:
"""
Mock web extraction that simulates long-running crawl.
This is perfect for testing WebSocket timeout/reconnection because:
- Default 60s delay triggers the ~30s WebSocket timeout
- No actual web requests made
- No API credits consumed
- Predictable, reproducible behavior
Args:
urls: List of URLs to "extract" (ignored)
delay: Seconds to sleep (default: 60s to trigger timeout)
Returns:
JSON string with fake extraction results
"""
print(f"🌐 [MOCK] Extracting {len(urls)} URLs (will take {delay}s)...")
print(f"📊 [MOCK] This will test WebSocket reconnection (timeout at ~30s)")
# Simulate long-running operation
# Show progress so user knows it's working
for i in range(delay):
if i % 10 == 0 and i > 0:
print(f" ⏱️ [MOCK] {i}/{delay}s elapsed...")
time.sleep(1)
# Generate fake but realistic-looking content
result = {
"success": True,
"data": []
}
for idx, url in enumerate(urls, 1):
result["data"].append({
"url": url,
"title": f"Mock Extracted Content {idx}",
"content": f"# Mock Content from {url}\n\n"
f"This is simulated extracted content for testing purposes. "
f"In a real scenario, this would contain the full text from the webpage. "
f"\n\n## Key Points\n"
f"- Mock point 1 about water utilities\n"
f"- Mock point 2 about AI data centers\n"
f"- Mock point 3 about investment opportunities\n"
f"\n\nThis content took {delay} seconds to 'extract', which is long enough "
f"to trigger WebSocket timeout and test reconnection logic."
* 10, # Make it longer to simulate real extraction
"extracted_at": "2025-10-10T14:00:00Z"
})
json_result = json.dumps(result, indent=2)
size_kb = len(json_result) / 1024
print(f"✅ [MOCK] Extraction completed: {len(urls)} URLs, {size_kb:.1f} KB")
return json_result
def mock_web_crawl(start_url: str, max_pages: int = 10, delay: int = 30) -> str:
"""
Mock web crawling that simulates multi-page crawl.
Args:
start_url: Starting URL (ignored)
max_pages: Max pages to crawl (just affects result count)
delay: Seconds to sleep (default: 30s)
Returns:
JSON string with fake crawl results
"""
print(f"🕷️ [MOCK] Crawling from: {start_url} (max {max_pages} pages, {delay}s)...")
time.sleep(delay)
result = {
"success": True,
"data": {
"start_url": start_url,
"pages_crawled": min(max_pages, 5),
"pages": []
}
}
for i in range(min(max_pages, 5)):
result["data"]["pages"].append({
"url": f"{start_url}/page{i+1}",
"title": f"Mock Page {i+1}",
"content": f"Mock content from page {i+1}. " * 50
})
print(f"✅ [MOCK] Crawl completed: {len(result['data']['pages'])} pages")
return json.dumps(result, indent=2)
# Tool definitions for the agent (same format as real tools)
MOCK_WEB_TOOLS = [
{
"name": "web_search",
"description": "[MOCK] Search the web for information. Returns fake results after 2s delay. Perfect for quick tests.",
"input_schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query"
},
"delay": {
"type": "integer",
"description": "Seconds to delay (default: 2)",
"default": 2
}
},
"required": ["query"]
}
},
{
"name": "web_extract",
"description": "[MOCK] Extract content from URLs. Simulates 60s delay to test WebSocket timeout/reconnection. Returns fake content without making real requests. PERFECT FOR TESTING!",
"input_schema": {
"type": "object",
"properties": {
"urls": {
"type": "array",
"items": {"type": "string"},
"description": "List of URLs to extract"
},
"delay": {
"type": "integer",
"description": "Seconds to delay (default: 60 to trigger timeout)",
"default": 60
}
},
"required": ["urls"]
}
},
{
"name": "web_crawl",
"description": "[MOCK] Crawl website starting from URL. Returns fake results after 30s delay.",
"input_schema": {
"type": "object",
"properties": {
"start_url": {
"type": "string",
"description": "Starting URL for crawl"
},
"max_pages": {
"type": "integer",
"description": "Max pages to crawl (default: 10)",
"default": 10
},
"delay": {
"type": "integer",
"description": "Seconds to delay (default: 30)",
"default": 30
}
},
"required": ["start_url"]
}
}
]
# Map function names to implementations
MOCK_TOOL_FUNCTIONS = {
"web_search": mock_web_search,
"web_extract": mock_web_extract,
"web_crawl": mock_web_crawl
}
if __name__ == "__main__":
# Demo/test the mock tools
print("Testing Mock Web Tools")
print("=" * 60)
print("\n1. Mock web_search (2s delay):")
result = mock_web_search("test query", delay=2)
print(f"Result length: {len(result)} chars\n")
print("\n2. Mock web_extract (5s delay for demo - normally 60s):")
result = mock_web_extract(["https://example.com"], delay=5)
print(f"Result length: {len(result)} chars\n")
print("\n✅ All mock tools working!")

File diff suppressed because it is too large Load Diff

527
output.txt Normal file

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More