Compare commits

..

22 Commits

Author SHA1 Message Date
Sam Herring
e3123be445 Removing old patches 2026-03-30 10:06:08 -07:00
Sam Herring
e46d5b2c13 Removing old files 2026-03-30 09:58:05 -07:00
Sam Herring
34cc666105 Updating with trainer config pieces 2026-03-30 09:46:24 -07:00
Sam Herring
d6832260f9 Fixing eval steps to be a set number of tasks 2026-03-30 09:46:24 -07:00
Sam Herring
d2652e980f Adding random jitter for agent temp to add variance into rollouts 2026-03-30 09:46:24 -07:00
Sam Herring
89cea9fd2d Test basic Atropos trainer 2026-03-30 09:46:24 -07:00
Sam Herring
143e72c145 Updating endless terminals env with silenced warnings 2026-03-30 09:46:24 -07:00
Sam Herring
51305b3f3d Tool call changes 2026-03-30 09:46:24 -07:00
Sam Herring
570e52b342 Monkey patching chat template kwargs 2026-03-30 09:46:24 -07:00
Sam Herring
d6e874491d Env changes for tool use 2026-03-30 09:46:24 -07:00
Sam Herring
dd3812dffe Adding tool call parser default 2026-03-30 09:46:24 -07:00
Sam Herring
6e17630bac Eval splits for holdout sets 2026-03-30 09:46:24 -07:00
Sam Herring
53b710b13f Changing return type to be ScoredDataGroup to account for multiple trajectories 2026-03-30 09:46:24 -07:00
Sam Herring
5b1e8059cb Added task-specific metrics and evals 2026-03-30 09:46:24 -07:00
Sam Herring
ff16a33cdd Wandb changes 2026-03-30 09:46:24 -07:00
Sam Herring
7cfb9eb1f6 Updating config 2026-03-30 09:46:24 -07:00
Sam Herring
c7b15f8ce1 Adding config init method 2026-03-30 09:46:24 -07:00
Sam Herring
7602c462ee Updating path vars and dataset loading 2026-03-30 09:46:24 -07:00
Sam Herring
e38c24363c Updating to use hermes-agent backend and parse container definition out of provided .sif files 2026-03-30 09:46:24 -07:00
Sam Herring
d768b244a5 Adding endless terminal environment after rebase: 2026-03-30 09:46:24 -07:00
Teknium
97d6813f51 fix(cache): use deterministic call_id fallbacks instead of random UUIDs (#3991)
When the API doesn't provide a call_id for tool calls, the fallback
generated a random uuid4 hex. This made every API call's input unique
when replayed, preventing OpenAI's prompt cache from matching the
prefix across turns.

Replaced all four uuid4 fallback sites with a deterministic hash of
(function_name, arguments, position_index). The same tool call now
always produces the same fallback call_id, preserving cache-friendly
input stability.

Affected code paths:
- _chat_messages_to_responses_input() — Codex input reconstruction
- _normalize_codex_response() — function_call and custom_tool_call
- _build_assistant_message() — assistant message construction
2026-03-30 09:43:56 -07:00
Teknium
37825189dd fix(skills): validate hub bundle paths before install (#3986)
Co-authored-by: Gutslabs <gutslabsxyz@gmail.com>
2026-03-30 08:37:19 -07:00
10 changed files with 1434 additions and 24 deletions

View File

@@ -13,6 +13,7 @@ Core layers:
Concrete environments:
- terminal_test_env/: Simple file-creation tasks for testing the stack
- hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
- endless_terminals/: Terminal tasks from HuggingFace dataset with Apptainer containers
Benchmarks (eval-only):
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation

View File

@@ -0,0 +1,5 @@
"""Endless Terminals Environment - Terminal task training from HuggingFace dataset."""
from .endless_terminals_env import EndlessTerminalsEnv, EndlessTerminalsEnvConfig
__all__ = ["EndlessTerminalsEnv", "EndlessTerminalsEnvConfig"]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,91 @@
# Endless Terminals - Qwen3-4B-Instruct-2507
# Single config for both trainer (launch_training.py) and env (endless_terminals_env.py serve)
#
# Usage:
# Terminal 1: run-api
# Terminal 2: cd tinker-atropos && python launch_training.py --config ../environments/endless_terminals/tinker_qwen.yaml
# Terminal 3: python environments/endless_terminals/endless_terminals_env.py serve --config environments/endless_terminals/tinker_qwen.yaml
env:
# Toolsets
enabled_toolsets: ["terminal", "file"]
# Model / tokenizer
tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
# Agent configuration
max_agent_turns: 16
max_token_length: 2048
agent_temperature: 0.6
extra_body:
chat_template_kwargs:
enable_thinking: false
tool_call_parser: "hermes"
# Terminal backend
terminal_backend: "docker"
# Dataset settings
use_dataset: true
dataset_name: "obiwan96/endless-terminals"
dataset_split: "train"
dataset_cache_dir: "~/.cache/huggingface/datasets"
tasks_base_dir: "/Users/samherring/Desktop/Projects/Hermes-Agent/endless-terminals"
# Test execution
test_timeout_s: 180
default_docker_image: "ubuntu:22.04"
max_concurrent_containers: 16
# Training configuration
group_size: 16
batch_size: 64 # 4 groups × 16 rollouts per step
total_steps: 500
steps_per_eval: 5
min_items_sent_before_logging: 1
ensure_scores_are_not_same: true
max_num_workers: 2048
worker_timeout: 3600
inference_weight: 1.0
eval_limit_ratio: 0.1
rollout_server_url: "http://localhost:8000"
# Evaluation configuration
num_eval_tasks: 20
eval_split_ratio: 0.1
# Logging
use_wandb: true
wandb_name: "endless-terminals-qwen3-4b"
# System prompt
system_prompt: >
You are a skilled Linux system administrator and programmer.
You have access to a terminal and file tools to complete system administration
and programming tasks. Use the tools effectively to solve the given task,
and verify your solution works correctly before finishing.
Keep each command short and focused — break complex tasks into multiple steps
rather than writing long one-liners.
tinker:
lora_rank: 32
learning_rate: 0.0000005
max_token_trainer_length: 32768
checkpoint_dir: "./temp/"
save_checkpoint_interval: 50
wandb_project: "endless-terminals"
wandb_group: null
wandb_run_name: "qwen3-4b"
tool_call_parser: "hermes"
openai:
- model_name: "Qwen/Qwen3-4B-Instruct-2507"
base_url: "http://localhost:8001/v1"
api_key: "x"
weight: 1.0
num_requests_for_eval: 64
timeout: 600
server_type: "sglang"
slurm: false
testing: false

View File

@@ -298,7 +298,6 @@ class HermesAgentBaseEnv(BaseEnv):
return False
server = self.server.servers[0]
# If the server is an OpenAI server (not VLLM/SGLang), use direct mode
from atroposlib.envs.server_handling.openai_server import OpenAIServer
return not isinstance(server, OpenAIServer)

View File

@@ -48,7 +48,13 @@ class HermesToolCallParser(ToolCallParser):
if not raw_json.strip():
continue
tc_data = json.loads(raw_json)
try:
tc_data = json.loads(raw_json)
except json.JSONDecodeError:
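# Fix invalid backslash escapes from shell commands in JSON strings
# e.g. \s \w \d (not valid JSON escapes) → \\s \\w \\d; backslashes followed by a valid escape letter (\n, \t, ...), a digit, or a quote are left untouched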
fixed = re.sub(r'\\([^"\\/bfnrtu0-9\n])', r'\\\\\1', raw_json)
tc_data = json.loads(fixed)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",

View File

@@ -354,7 +354,14 @@ def do_install(identifier: str, category: str = "", force: bool = False,
extra_metadata.update(getattr(bundle, "metadata", {}) or {})
# Quarantine the bundle
q_path = quarantine_bundle(bundle)
try:
q_path = quarantine_bundle(bundle)
except ValueError as exc:
c.print(f"[bold red]Installation blocked:[/] {exc}\n")
from tools.skills_hub import append_audit_log
append_audit_log("BLOCKED", bundle.name, bundle.source,
bundle.trust_level, "invalid_path", str(exc))
return
c.print(f"[dim]Quarantined to {q_path.relative_to(q_path.parent.parent.parent)}[/]")
# Scan
@@ -414,7 +421,15 @@ def do_install(identifier: str, category: str = "", force: bool = False,
return
# Install
install_dir = install_from_quarantine(q_path, bundle.name, category, bundle, result)
try:
install_dir = install_from_quarantine(q_path, bundle.name, category, bundle, result)
except ValueError as exc:
c.print(f"[bold red]Installation blocked:[/] {exc}\n")
shutil.rmtree(q_path, ignore_errors=True)
from tools.skills_hub import append_audit_log
append_audit_log("BLOCKED", bundle.name, bundle.source,
bundle.trust_level, "invalid_path", str(exc))
return
from tools.skills_hub import SKILLS_DIR
c.print(f"[bold green]Installed:[/] {install_dir.relative_to(SKILLS_DIR)}")
c.print(f"[dim]Files: {', '.join(bundle.files.keys())}[/]\n")

View File

@@ -2907,6 +2907,19 @@ class AIAgent:
})
return converted or None
@staticmethod
def _deterministic_call_id(fn_name: str, arguments: str, index: int = 0) -> str:
    """Build a stable fallback ``call_id`` for a tool call.

    Intended for tool calls that arrive without a ``call_id``. The id is
    derived purely from the call's content and position, so the same
    (name, arguments, index) triple always maps to the same id. A random
    id would make every replayed request prefix unique and defeat
    OpenAI's prompt cache.

    Args:
        fn_name: Name of the function being called.
        arguments: Raw argument payload of the call.
        index: Position of the call among its siblings.

    Returns:
        ``"call_"`` followed by the first 12 hex characters of the
        SHA-256 digest of ``"<fn_name>:<arguments>:<index>"``.
    """
    import hashlib

    hasher = hashlib.sha256()
    # errors="replace" keeps lone surrogates from raising during encoding.
    hasher.update(f"{fn_name}:{arguments}:{index}".encode("utf-8", errors="replace"))
    short_digest = hasher.hexdigest()[:12]
    return "call_" + short_digest
@staticmethod
def _split_responses_tool_id(raw_id: Any) -> tuple[Optional[str], Optional[str]]:
"""Split a stored tool id into (call_id, response_item_id)."""
@@ -3013,7 +3026,8 @@ class AIAgent:
):
call_id = f"call_{embedded_response_item_id[len('fc_'):]}"
else:
call_id = f"call_{uuid.uuid4().hex[:12]}"
_raw_args = str(fn.get("arguments", "{}"))
call_id = self._deterministic_call_id(fn_name, _raw_args, len(items))
call_id = call_id.strip()
arguments = fn.get("arguments", "{}")
@@ -3377,7 +3391,7 @@ class AIAgent:
embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
if not isinstance(call_id, str) or not call_id.strip():
call_id = f"call_{uuid.uuid4().hex[:12]}"
call_id = self._deterministic_call_id(fn_name, arguments, len(tool_calls))
call_id = call_id.strip()
response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
@@ -3398,7 +3412,7 @@ class AIAgent:
embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
if not isinstance(call_id, str) or not call_id.strip():
call_id = f"call_{uuid.uuid4().hex[:12]}"
call_id = self._deterministic_call_id(fn_name, arguments, len(tool_calls))
call_id = call_id.strip()
response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
@@ -4933,7 +4947,10 @@ class AIAgent:
if isinstance(raw_id, str) and raw_id.strip():
call_id = raw_id.strip()
else:
call_id = f"call_{uuid.uuid4().hex[:12]}"
_fn = getattr(tool_call, "function", None)
_fn_name = getattr(_fn, "name", "") if _fn else ""
_fn_args = getattr(_fn, "arguments", "{}") if _fn else "{}"
call_id = self._deterministic_call_id(_fn_name, _fn_args, len(tool_calls))
call_id = call_id.strip()
response_item_id = getattr(tool_call, "response_item_id", None)

View File

@@ -5,6 +5,7 @@ from pathlib import Path
from unittest.mock import patch, MagicMock
import httpx
import pytest
from tools.skills_hub import (
GitHubAuth,
@@ -648,6 +649,29 @@ class TestWellKnownSkillSource:
assert bundle.files["SKILL.md"] == "# Code Review\n"
assert bundle.files["references/checklist.md"] == "- [ ] security\n"
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_fetch_rejects_unsafe_file_paths_from_well_known_endpoint(self, mock_get, _mock_read_cache, _mock_write_cache):
    """fetch() yields None when the index advertises a '..' traversal path."""

    def index_payload():
        # One skill whose file list mixes a legitimate file with a
        # path that climbs out of the skill directory.
        return {
            "skills": [{
                "name": "code-review",
                "description": "Review code",
                "files": ["SKILL.md", "../../../escape.txt"],
            }]
        }

    def fake_get(url, *args, **kwargs):
        if url.endswith("/code-review/SKILL.md"):
            return MagicMock(status_code=200, text="# Code Review\n")
        if url.endswith("/index.json"):
            return MagicMock(status_code=200, json=index_payload)
        # Any other URL (notably the traversal path) must never be requested.
        raise AssertionError(url)

    mock_get.side_effect = fake_get

    fetched = self._source().fetch("well-known:https://example.com/.well-known/skills/code-review")

    assert fetched is None
class TestCheckForSkillUpdates:
def test_bundle_content_hash_matches_installed_content_hash(self, tmp_path):
@@ -1143,6 +1167,61 @@ class TestQuarantineBundleBinaryAssets:
assert (q_path / "SKILL.md").read_text(encoding="utf-8").startswith("---")
assert (q_path / "assets" / "neutts-cli" / "samples" / "jo.wav").read_bytes() == b"RIFF\x00\x01fakewav"
def test_quarantine_bundle_rejects_traversal_file_paths(self, tmp_path):
    """quarantine_bundle() raises ValueError for a file key containing '..'."""
    import tools.skills_hub as hub

    hub_dir = tmp_path / "skills" / ".hub"
    # Redirect every hub location into tmp_path for the duration of the test.
    redirected = {
        "SKILLS_DIR": tmp_path / "skills",
        "HUB_DIR": hub_dir,
        "LOCK_FILE": hub_dir / "lock.json",
        "QUARANTINE_DIR": hub_dir / "quarantine",
        "AUDIT_LOG": hub_dir / "audit.log",
        "TAPS_FILE": hub_dir / "taps.json",
        "INDEX_CACHE_DIR": hub_dir / "index-cache",
    }

    with patch.multiple(hub, **redirected):
        hostile_bundle = SkillBundle(
            name="demo",
            files={
                "SKILL.md": "---\nname: demo\n---\n",
                "../../../escape.txt": "owned",
            },
            source="well-known",
            identifier="well-known:https://example.com/.well-known/skills/demo",
            trust_level="community",
        )

        with pytest.raises(ValueError, match="Unsafe bundle file path"):
            quarantine_bundle(hostile_bundle)

    # Three levels above quarantine/demo is the skills root; nothing may land there.
    assert not (tmp_path / "skills" / "escape.txt").exists()
def test_quarantine_bundle_rejects_absolute_file_paths(self, tmp_path):
    """quarantine_bundle() raises ValueError for a file key that is an absolute path."""
    import tools.skills_hub as hub

    hub_dir = tmp_path / "skills" / ".hub"
    absolute_target = tmp_path / "outside.txt"
    # Redirect every hub location into tmp_path for the duration of the test.
    redirected = {
        "SKILLS_DIR": tmp_path / "skills",
        "HUB_DIR": hub_dir,
        "LOCK_FILE": hub_dir / "lock.json",
        "QUARANTINE_DIR": hub_dir / "quarantine",
        "AUDIT_LOG": hub_dir / "audit.log",
        "TAPS_FILE": hub_dir / "taps.json",
        "INDEX_CACHE_DIR": hub_dir / "index-cache",
    }

    with patch.multiple(hub, **redirected):
        hostile_bundle = SkillBundle(
            name="demo",
            files={
                "SKILL.md": "---\nname: demo\n---\n",
                str(absolute_target): "owned",
            },
            source="well-known",
            identifier="well-known:https://example.com/.well-known/skills/demo",
            trust_level="community",
        )

        with pytest.raises(ValueError, match="Unsafe bundle file path"):
            quarantine_bundle(hostile_bundle)

    # The absolute destination outside the quarantine tree must stay untouched.
    assert not absolute_target.exists()
# ---------------------------------------------------------------------------
# GitHubSource._download_directory — tree API + fallback (#2940)

View File

@@ -24,7 +24,7 @@ import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from pathlib import Path, PurePosixPath
from hermes_constants import get_hermes_home
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse, urlunparse
@@ -85,6 +85,43 @@ class SkillBundle:
metadata: Dict[str, Any] = field(default_factory=dict)
def _normalize_bundle_path(path_value: str, *, field_name: str, allow_nested: bool) -> str:
    """Normalize and validate a bundle-controlled path before touching disk.

    Backslashes are treated as separators, empty and ``.`` segments are
    dropped, and the result is returned as a ``/``-joined relative path.

    Args:
        path_value: Untrusted path (skill name, category, or bundle file key).
        field_name: Label interpolated into the error message.
        allow_nested: When False, the path must consist of one segment.

    Returns:
        The normalized relative path, using ``/`` as the separator.

    Raises:
        ValueError: If the value is not a string, is empty, contains a NUL
            byte, is absolute, contains a ``..`` segment, has any segment
            starting with a drive prefix (``C:``), or has more than one
            segment while ``allow_nested`` is False.
    """
    if not isinstance(path_value, str):
        raise ValueError(f"Unsafe {field_name}: expected a string")

    raw = path_value.strip()
    if not raw:
        raise ValueError(f"Unsafe {field_name}: empty path")

    # A NUL byte can never be part of a real file name; reject it during
    # validation instead of failing later in the middle of writing files.
    if "\x00" in raw:
        raise ValueError(f"Unsafe {field_name}: {path_value!r}")

    normalized = raw.replace("\\", "/")
    path = PurePosixPath(normalized)
    parts = [part for part in path.parts if part not in ("", ".")]

    if normalized.startswith("/") or path.is_absolute():
        raise ValueError(f"Unsafe {field_name}: {path_value}")

    if not parts or ".." in parts:
        raise ValueError(f"Unsafe {field_name}: {path_value}")

    # Reject a drive prefix on ANY segment, not only a bare "C:" first
    # segment. On Windows, joining a base directory with a drive-relative
    # segment such as "C:evil.txt" discards the base directory entirely.
    if any(re.match(r"[A-Za-z]:", part) for part in parts):
        raise ValueError(f"Unsafe {field_name}: {path_value}")

    if not allow_nested and len(parts) != 1:
        raise ValueError(f"Unsafe {field_name}: {path_value}")

    return "/".join(parts)
def _validate_skill_name(name: str) -> str:
    """Validate *name* as a single-segment skill name and return its normalized form."""
    safe_name = _normalize_bundle_path(
        name,
        field_name="skill name",
        allow_nested=False,
    )
    return safe_name
def _validate_category_name(category: str) -> str:
    """Validate *category* as a single-segment category name and return its normalized form."""
    safe_category = _normalize_bundle_path(
        category,
        field_name="category",
        allow_nested=False,
    )
    return safe_category
def _validate_bundle_rel_path(rel_path: str) -> str:
    """Validate *rel_path* as a bundle file path (nesting allowed) and return its normalized form."""
    safe_rel_path = _normalize_bundle_path(
        rel_path,
        field_name="bundle file path",
        allow_nested=True,
    )
    return safe_rel_path
# ---------------------------------------------------------------------------
# GitHub Authentication
# ---------------------------------------------------------------------------
@@ -701,6 +738,12 @@ class WellKnownSkillSource(SkillSource):
if not parsed:
return None
try:
skill_name = _validate_skill_name(parsed["skill_name"])
except ValueError:
logger.warning("Well-known skill identifier contained unsafe skill name: %s", identifier)
return None
entry = self._index_entry(parsed["index_url"], parsed["skill_name"])
if not entry:
return None
@@ -713,19 +756,28 @@ class WellKnownSkillSource(SkillSource):
for rel_path in files:
if not isinstance(rel_path, str) or not rel_path:
continue
text = self._fetch_text(f"{parsed['skill_url']}/{rel_path}")
try:
safe_rel_path = _validate_bundle_rel_path(rel_path)
except ValueError:
logger.warning(
"Well-known skill %s advertised unsafe file path: %r",
identifier,
rel_path,
)
return None
text = self._fetch_text(f"{parsed['skill_url']}/{safe_rel_path}")
if text is None:
return None
downloaded[rel_path] = text
downloaded[safe_rel_path] = text
if "SKILL.md" not in downloaded:
return None
return SkillBundle(
name=parsed["skill_name"],
name=skill_name,
files=downloaded,
source="well-known",
identifier=self._wrap_identifier(parsed["base_url"], parsed["skill_name"]),
identifier=self._wrap_identifier(parsed["base_url"], skill_name),
trust_level="community",
metadata={
"index_url": parsed["index_url"],
@@ -1752,9 +1804,10 @@ class ClawHubSource(SkillSource):
for info in zf.infolist():
if info.is_dir():
continue
# Sanitize path — strip leading slashes and ..
name = info.filename.lstrip("/")
if ".." in name or name.startswith("/"):
try:
name = _validate_bundle_rel_path(info.filename)
except ValueError:
logger.debug("Skipping unsafe ZIP member path: %s", info.filename)
continue
# Only extract text-sized files (skip large binaries)
if info.file_size > 500_000:
@@ -2423,13 +2476,19 @@ def ensure_hub_dirs() -> None:
def quarantine_bundle(bundle: SkillBundle) -> Path:
"""Write a skill bundle to the quarantine directory for scanning."""
ensure_hub_dirs()
dest = QUARANTINE_DIR / bundle.name
skill_name = _validate_skill_name(bundle.name)
validated_files: List[Tuple[str, Union[str, bytes]]] = []
for rel_path, file_content in bundle.files.items():
safe_rel_path = _validate_bundle_rel_path(rel_path)
validated_files.append((safe_rel_path, file_content))
dest = QUARANTINE_DIR / skill_name
if dest.exists():
shutil.rmtree(dest)
dest.mkdir(parents=True)
for rel_path, file_content in bundle.files.items():
file_dest = dest / rel_path
for rel_path, file_content in validated_files:
file_dest = dest.joinpath(*rel_path.split("/"))
file_dest.parent.mkdir(parents=True, exist_ok=True)
if isinstance(file_content, bytes):
file_dest.write_bytes(file_content)
@@ -2447,10 +2506,17 @@ def install_from_quarantine(
scan_result: ScanResult,
) -> Path:
"""Move a scanned skill from quarantine into the skills directory."""
if category:
install_dir = SKILLS_DIR / category / skill_name
safe_skill_name = _validate_skill_name(skill_name)
safe_category = _validate_category_name(category) if category else ""
quarantine_resolved = quarantine_path.resolve()
quarantine_root = QUARANTINE_DIR.resolve()
if not quarantine_resolved.is_relative_to(quarantine_root):
raise ValueError(f"Unsafe quarantine path: {quarantine_path}")
if safe_category:
install_dir = SKILLS_DIR / safe_category / safe_skill_name
else:
install_dir = SKILLS_DIR / skill_name
install_dir = SKILLS_DIR / safe_skill_name
if install_dir.exists():
shutil.rmtree(install_dir)
@@ -2461,7 +2527,7 @@ def install_from_quarantine(
# Record in lock file
lock = HubLockFile()
lock.record_install(
name=skill_name,
name=safe_skill_name,
source=bundle.source,
identifier=bundle.identifier,
trust_level=bundle.trust_level,
@@ -2473,7 +2539,7 @@ def install_from_quarantine(
)
append_audit_log(
"INSTALL", skill_name, bundle.source,
"INSTALL", safe_skill_name, bundle.source,
bundle.trust_level, scan_result.verdict,
content_hash(install_dir),
)