Compare commits

...

3 Commits

Author SHA1 Message Date
teknium1 975fd86dc4 fix: eliminate double LLM judge call and eval buffer pollution
evaluate() was calling _llm_judge twice per item (once via
compute_reward, once directly) — double the API cost for no benefit.
Now extracts correctness from compute_reward's buffer instead.

Also: compute_reward appends to training metric buffers during eval,
which would pollute wandb training charts. Now rolls back buffer
entries added during eval so training metrics stay clean.
2026-03-09 20:57:46 -07:00
teknium1 bf8350ac18 fix: evaluate() uses full agent loop with tools, not single-turn
The evaluate method was doing single-turn chat_completion (no tools),
which defeats the purpose of an agentic research benchmark. Fixed to
run the full HermesAgentLoop with web_search/web_extract tools.

Results comparison (Claude Sonnet 4.5, FRAMES benchmark):
  Without tools (broken): 0.56 mean correctness
  With agent loop + tools: 1.00 mean correctness, 0.994 reward

New eval metrics: mean_correctness, mean_reward, mean_tool_calls,
tool_usage_rate — all logged via evaluate_log() in lighteval format.
2026-03-09 19:53:28 -07:00
teknium1 320f881e0b fix: WebResearchEnv compute_reward extracts from AgentResult.messages
AgentResult has .messages (list of dicts), not .final_response or
.tool_calls. Fixed compute_reward to extract the final response
and tool names from the message history.

Verified with live process mode test:
  - Agent used 7 tool calls (web_search, web_extract)
  - Produced a 1106-char researched response about Winter Olympics
  - Reward: 0.384 (partial correctness via LLM judge)
  - JSONL output contains valid tokens, masks, scores, messages
2026-03-09 19:29:12 -07:00
+107 -32
View File
@@ -356,10 +356,19 @@ class WebResearchEnv(HermesAgentBaseEnv):
efficiency_weight * efficiency — penalizes wasteful tool usage
+ diversity_bonus — source diversity (≥2 distinct domains)
"""
final_response: str = result.final_response or ""
tools_used: list[str] = [
tc.tool_name for tc in (result.tool_calls or [])
] if hasattr(result, "tool_calls") and result.tool_calls else []
# Extract final response from messages (last assistant message with content)
final_response = ""
tools_used: list[str] = []
for msg in reversed(result.messages):
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
final_response = msg["content"]
# Collect tool names from tool call messages
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc in msg["tool_calls"]:
fn = tc.get("function", {}) if isinstance(tc, dict) else {}
name = fn.get("name", "")
if name:
tools_used.append(name)
tool_call_count: int = result.turns_used or len(tools_used)
cfg = self.config
@@ -416,8 +425,16 @@ class WebResearchEnv(HermesAgentBaseEnv):
# ------------------------------------------------------------------
async def evaluate(self, *args, **kwargs) -> None:
"""Run evaluation on the held-out split using the agent loop."""
"""Run evaluation on the held-out split using the full agent loop with tools.
Each eval item runs through the same agent loop as training —
the model can use web_search, web_extract, etc. to research answers.
This measures actual agentic research capability, not just knowledge.
"""
import time
import uuid
from environments.agent_loop import HermesAgentLoop
from environments.tool_context import ToolContext
items = self._eval_items
if not items:
@@ -427,43 +444,88 @@ class WebResearchEnv(HermesAgentBaseEnv):
eval_size = min(self.config.eval_size, len(items))
eval_items = items[:eval_size]
logger.info(f"Running eval on {len(eval_items)} questions...")
logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...")
start_time = time.time()
samples = []
for item in eval_items:
# Resolve tools once for all eval items
tools, valid_names = self._resolve_tools_for_group()
for i, item in enumerate(eval_items):
task_id = str(uuid.uuid4())
logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...")
try:
# Use the base env's agent loop for eval (same as training)
prompt = self.format_prompt(item)
completion = await self.server.chat_completion(
messages=[
{"role": "system", "content": self.config.system_prompt or ""},
{"role": "user", "content": prompt},
],
n=1,
# Build messages
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(item)})
# Run the full agent loop with tools
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=0.0, # Deterministic for eval
max_tokens=self.config.max_token_length,
temperature=0.0,
split="eval",
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
response_content = (
completion.choices[0].message.content if completion.choices else ""
)
# Extract final response and tool usage from messages
final_response = ""
tool_call_count = 0
for msg in reversed(result.messages):
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
final_response = msg["content"]
if msg.get("role") == "assistant" and msg.get("tool_calls"):
tool_call_count += len(msg["tool_calls"])
# Score the response
correctness = await self._llm_judge(
question=item["question"],
expected=item["answer"],
model_answer=response_content,
# Compute reward (includes LLM judge for correctness)
# Temporarily save buffer lengths so we can extract the
# correctness score without calling judge twice, and avoid
# polluting training metric buffers with eval data.
buf_len = len(self._correctness_buffer)
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
finally:
ctx.cleanup()
# Extract correctness from the buffer (compute_reward appended it)
# then remove eval entries from training buffers
correctness = (
self._correctness_buffer[buf_len]
if len(self._correctness_buffer) > buf_len
else 0.0
)
# Roll back buffers to avoid polluting training metrics
for buf in (
self._reward_buffer, self._correctness_buffer,
self._tool_usage_buffer, self._efficiency_buffer,
self._diversity_buffer,
):
if len(buf) > buf_len:
buf.pop()
samples.append({
"prompt": item["question"],
"response": response_content,
"response": final_response[:500],
"expected": item["answer"],
"correctness": correctness,
"reward": reward,
"tool_calls": tool_call_count,
"turns": result.turns_used,
})
logger.info(
f" → correctness={correctness:.2f}, reward={reward:.3f}, "
f"tools={tool_call_count}, turns={result.turns_used}"
)
except Exception as e:
logger.error(f"Eval error on item: {e}")
samples.append({
@@ -471,20 +533,33 @@ class WebResearchEnv(HermesAgentBaseEnv):
"response": f"ERROR: {e}",
"expected": item["answer"],
"correctness": 0.0,
"reward": 0.0,
"tool_calls": 0,
"turns": 0,
})
end_time = time.time()
# Compute metrics
# Compute aggregate metrics
correctness_scores = [s["correctness"] for s in samples]
rewards = [s["reward"] for s in samples]
tool_counts = [s["tool_calls"] for s in samples]
n = len(samples)
eval_metrics = {
"eval/mean_correctness": (
sum(correctness_scores) / len(correctness_scores)
if correctness_scores else 0.0
),
"eval/n_items": len(samples),
"eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
"eval/mean_reward": sum(rewards) / n if n else 0.0,
"eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0,
"eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n else 0.0,
"eval/n_items": n,
}
logger.info(
f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, "
f"reward={eval_metrics['eval/mean_reward']:.3f}, "
f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}"
)
await self.evaluate_log(
metrics=eval_metrics,
samples=samples,