prevent leakage of morph instances between tasks

fix leakage
update snapshot
2025-11-04 03:32:43 -05:00 · 2025-11-03 17:42:23 -05:00 · 2025-11-02 23:13:49 -05:00 · 2025-11-02 06:03:21 +00:00 · 2025-11-01 22:39:21 -07:00
9 changed files with 890 additions and 695 deletions
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -166,9 +166,9 @@ def _process_single_prompt(
            verbose_logging=config.get("verbose", False),
            ephemeral_system_prompt=config.get("ephemeral_system_prompt")
        )
-        
-        # Run the agent
-        result = agent.run_conversation(prompt)
+
+        # Run the agent with task_id to ensure each task gets its own isolated VM
+        result = agent.run_conversation(prompt, task_id=f"task_{prompt_index}")
        
        # Extract tool usage statistics
        tool_stats = _extract_tool_stats(result["messages"])
--- a/model_tools.py
+++ b/model_tools.py
@@ -28,7 +28,7 @@ Usage:

 import json
 import asyncio
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional

 from tools.web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
 from tools.terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
@@ -480,14 +480,15 @@ def handle_web_function_call(function_name: str, function_args: Dict[str, Any])
    else:
        return json.dumps({"error": f"Unknown web function: {function_name}"})

-def handle_terminal_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+def handle_terminal_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
    """
    Handle function calls for terminal tools.
-    
+
    Args:
        function_name (str): Name of the terminal function to call
        function_args (Dict): Arguments for the function
-    
+        task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional)
+
    Returns:
        str: Function result as JSON string
    """
@@ -498,8 +499,8 @@ def handle_terminal_function_call(function_name: str, function_args: Dict[str, A
        idle_threshold = function_args.get("idle_threshold", 5.0)
        timeout = function_args.get("timeout")

-        return terminal_tool(command, input_keys, None, background, idle_threshold, timeout)
-    
+        return terminal_tool(command, input_keys, None, background, idle_threshold, timeout, task_id)
+
    else:
        return json.dumps({"error": f"Unknown terminal function: {function_name}"})

@@ -614,21 +615,22 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any]
        return json.dumps({"error": f"Unknown image generation function: {function_name}"})


-def handle_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+def handle_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
    """
    Main function call dispatcher that routes calls to appropriate toolsets.
-    
+
    This function determines which toolset a function belongs to and dispatches
    the call to the appropriate handler. This makes it easy to add new toolsets
    without changing the main calling interface.
-    
+
    Args:
        function_name (str): Name of the function to call
        function_args (Dict): Arguments for the function
-    
+        task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional)
+
    Returns:
        str: Function result as JSON string
-    
+
    Raises:
        None: Returns error as JSON string instead of raising exceptions
    """
@@ -636,28 +638,28 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any]) -> s
        # Route web tools
        if function_name in ["web_search", "web_extract", "web_crawl"]:
            return handle_web_function_call(function_name, function_args)
-        
+
        # Route terminal tools
        elif function_name in ["terminal"]:
-            return handle_terminal_function_call(function_name, function_args)
-        
+            return handle_terminal_function_call(function_name, function_args, task_id)
+
        # Route vision tools
        elif function_name in ["vision_analyze"]:
            return handle_vision_function_call(function_name, function_args)
-        
+
        # Route MoA tools
        elif function_name in ["mixture_of_agents"]:
            return handle_moa_function_call(function_name, function_args)
-        
+
        # Route image generation tools
        elif function_name in ["image_generate"]:
            return handle_image_function_call(function_name, function_args)
-        
+
        else:
            error_msg = f"Unknown function: {function_name}"
            print(f"❌ {error_msg}")
            return json.dumps({"error": error_msg})
-    
+
    except Exception as e:
        error_msg = f"Error executing {function_name}: {str(e)}"
        print(f"❌ {error_msg}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,4 @@ openai
 fal-client
 python-dotenv
 fire
-requests
+httpx
--- a/run_agent.py
+++ b/run_agent.py
@@ -43,6 +43,7 @@ else:

 # Import our tool system
 from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
+from tools.terminal_tool import cleanup_vm


 class AIAgent:
@@ -54,9 +55,9 @@ class AIAgent:
    """
    
    def __init__(
-        self, 
-        base_url: str = None, 
-        api_key: str = None, 
+        self,
+        base_url: str = None,
+        api_key: str = None,
        model: str = "gpt-4",
        max_iterations: int = 10,
        tool_delay: float = 1.0,
@@ -68,7 +69,7 @@ class AIAgent:
    ):
        """
        Initialize the AI Agent.
-        
+
        Args:
            base_url (str): Base URL for the model API (optional)
            api_key (str): API key for authentication (optional, uses env var if not provided)
@@ -87,7 +88,7 @@ class AIAgent:
        self.save_trajectories = save_trajectories
        self.verbose_logging = verbose_logging
        self.ephemeral_system_prompt = ephemeral_system_prompt
-        
+
        # Store toolset filtering options
        self.enabled_toolsets = enabled_toolsets
        self.disabled_toolsets = disabled_toolsets
@@ -342,22 +343,27 @@ class AIAgent:
            print(f"⚠️ Failed to save trajectory: {e}")
    
    def run_conversation(
-        self, 
-        user_message: str, 
-        system_message: str = None, 
-        conversation_history: List[Dict[str, Any]] = None
+        self,
+        user_message: str,
+        system_message: str = None,
+        conversation_history: List[Dict[str, Any]] = None,
+        task_id: str = None
    ) -> Dict[str, Any]:
        """
        Run a complete conversation with tool calling until completion.
-        
+
        Args:
            user_message (str): The user's message/question
            system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
            conversation_history (List[Dict]): Previous conversation messages (optional)
-            
+            task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
+
        Returns:
            Dict: Complete conversation result with final response and message history
        """
+        # Generate unique task_id if not provided to isolate VMs between concurrent tasks
+        import uuid
+        effective_task_id = task_id or str(uuid.uuid4())
        # Initialize conversation
        messages = conversation_history or []
        
@@ -469,12 +475,12 @@ class AIAgent:
                            function_args = {}
                        
                        print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
-                        
+
                        tool_start_time = time.time()
-                        
-                        # Execute the tool
-                        function_result = handle_function_call(function_name, function_args)
-                        
+
+                        # Execute the tool with task_id to isolate VMs between concurrent tasks
+                        function_result = handle_function_call(function_name, function_args, effective_task_id)
+
                        tool_duration = time.time() - tool_start_time
                        result_preview = function_result[:200] if len(function_result) > 200 else function_result
                        
@@ -537,10 +543,17 @@ class AIAgent:
        
        # Determine if conversation completed successfully
        completed = final_response is not None and api_call_count < self.max_iterations
-        
+
        # Save trajectory if enabled
        self._save_trajectory(messages, user_message, completed)
-        
+
+        # Clean up VM for this task after conversation completes
+        try:
+            cleanup_vm(effective_task_id)
+        except Exception as e:
+            if self.verbose_logging:
+                logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
+
        return {
            "final_response": final_response,
            "messages": messages,
--- a/run_datagen_megascience.sh
+++ b/run_datagen_megascience.sh
@@ -0,0 +1,12 @@
+python batch_runner.py \
+  --dataset_file="hermes-agent-megascience-data/hermes_agent_megascience_eval.jsonl" \
+  --batch_size=10 \
+  --run_name="megascience_eval_gpt5_2" \
+  --distribution="science" \
+  --model="gpt-5" \
+  --base_url="https://api.openai.com/v1" \
+  --api_key="${OPENAI_API_KEY}" \
+  --num_workers=5 \
+  --max_turns=30 \
+  --verbose \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use a tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should not be confident in your own reasoning, knowledge, or calculations without using a tool to verify or validate your work."
--- a/run_datagen_megascience_glm4-6.sh
+++ b/run_datagen_megascience_glm4-6.sh
@@ -0,0 +1,12 @@
+python batch_runner.py \
+  --dataset_file="hermes-agent-megascience-data/hermes_agent_megascience_eval.jsonl" \
+  --batch_size=10 \
+  --run_name="megascience_eval_glm4-6-fixedterminal" \
+  --distribution="science" \
+  --model="z-ai/glm-4.6" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --api_key="${OPENROUTER_API_KEY}" \
+  --num_workers=5 \
+  --max_turns=30 \
+  --verbose \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use a tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work."
--- a/tests/test_web_tools.py
+++ b/tests/test_web_tools.py
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -4,8 +4,12 @@ Terminal Tool Module

 This module provides a single terminal tool using Hecate's VM infrastructure.
 It wraps Hecate's functionality to provide a simple interface for executing commands
-on Morph VMs with automatic lifecycle management. VMs live for 5 minutes after last use.
-Timer resets with each use.
+on Morph VMs with automatic lifecycle management.
+
+VM Lifecycle:
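+- VMs have a TTL (time to live) set at creation (HECATE_VM_TTL_SECONDS, default: 20 minutes)
+- VMs are also cleaned up locally after 5 minutes of inactivity (HECATE_VM_LIFETIME_SECONDS, default: 300)
+- The inactivity timer (not the creation-time TTL) resets on every terminal_tool call for that task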

 Available tool:
 - terminal_tool: Execute commands with optional interactive session support
@@ -24,6 +28,8 @@ import json
 import os
 import uuid
 import threading
+import time
+import atexit
 from typing import Optional, Dict, Any

 # Detailed description for the terminal tool based on Hermes Terminal system prompt
@@ -75,9 +81,137 @@ When commands enter interactive mode (vim, nano, less, git prompts, package mana

 # Global state for VM lifecycle management
 # These persist across tool calls to enable session continuity
-_active_instance = None
-_active_context = None
+# Dictionaries are keyed by task_id so that concurrent tasks never share a VM or execution context
+_active_instances: Dict[str, Any] = {}
+_active_contexts: Dict[str, Any] = {}
+_last_activity: Dict[str, float] = {}  # Track last activity time for each VM
 _instance_lock = threading.Lock()
+_cleanup_thread = None
+_cleanup_running = False
+
+def _cleanup_inactive_vms(vm_lifetime_seconds: int = 300):
+    """
+    Clean up VMs that have been inactive for longer than vm_lifetime_seconds.
+    This function should be called periodically by a background thread.
+
+    Args:
+        vm_lifetime_seconds: Maximum lifetime in seconds for inactive VMs (default: 300)
+    """
+    global _active_instances, _active_contexts, _last_activity
+
+    current_time = time.time()
+    tasks_to_cleanup = []
+
+    with _instance_lock:
+        # Find all VMs that have been inactive for too long
+        for task_id, last_time in list(_last_activity.items()):
+            if current_time - last_time > vm_lifetime_seconds:
+                tasks_to_cleanup.append(task_id)
+
+        # Clean up the inactive VMs
+        for task_id in tasks_to_cleanup:
+            try:
+                if task_id in _active_instances:
+                    instance = _active_instances[task_id]
+                    # Terminate the VM instance
+                    if hasattr(instance, 'terminate'):
+                        instance.terminate()
+                    elif hasattr(instance, 'stop'):
+                        instance.stop()
+                    elif hasattr(instance, 'delete'):
+                        instance.delete()
+
+                    # Remove from tracking dictionaries
+                    del _active_instances[task_id]
+                    print(f"[VM Cleanup] Terminated inactive VM for task: {task_id}")
+
+                if task_id in _active_contexts:
+                    del _active_contexts[task_id]
+
+                if task_id in _last_activity:
+                    del _last_activity[task_id]
+
+            except Exception as e:
+                print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}")
+
+def _cleanup_thread_worker():
+    """
+    Background thread worker that periodically cleans up inactive VMs.
+    Runs every 60 seconds.
+    """
+    global _cleanup_running
+
+    while _cleanup_running:
+        try:
+            vm_lifetime = int(os.getenv("HECATE_VM_LIFETIME_SECONDS", "300"))
+            _cleanup_inactive_vms(vm_lifetime)
+        except Exception as e:
+            print(f"[VM Cleanup] Error in cleanup thread: {e}")
+
+        # Sleep for 60 seconds, but check every second if we should stop
+        for _ in range(60):
+            if not _cleanup_running:
+                break
+            time.sleep(1)
+
+def _start_cleanup_thread():
+    """
+    Start the background cleanup thread if it's not already running.
+    """
+    global _cleanup_thread, _cleanup_running
+
+    with _instance_lock:
+        if _cleanup_thread is None or not _cleanup_thread.is_alive():
+            _cleanup_running = True
+            _cleanup_thread = threading.Thread(target=_cleanup_thread_worker, daemon=True)
+            _cleanup_thread.start()
+
+def _stop_cleanup_thread():
+    """
+    Stop the background cleanup thread.
+    """
+    global _cleanup_running
+    _cleanup_running = False
+    if _cleanup_thread is not None:
+        _cleanup_thread.join(timeout=5)
+
+def cleanup_vm(task_id: str):
+    """
+    Manually clean up a specific VM by task_id.
+    This should be called when a task is completed.
+
+    Args:
+        task_id: The task ID of the VM to clean up
+    """
+    global _active_instances, _active_contexts, _last_activity
+
+    with _instance_lock:
+        try:
+            if task_id in _active_instances:
+                instance = _active_instances[task_id]
+                # Terminate the VM instance
+                if hasattr(instance, 'terminate'):
+                    instance.terminate()
+                elif hasattr(instance, 'stop'):
+                    instance.stop()
+                elif hasattr(instance, 'delete'):
+                    instance.delete()
+
+                # Remove from tracking dictionaries
+                del _active_instances[task_id]
+                print(f"[VM Cleanup] Manually terminated VM for task: {task_id}")
+
+            if task_id in _active_contexts:
+                del _active_contexts[task_id]
+
+            if task_id in _last_activity:
+                del _last_activity[task_id]
+
+        except Exception as e:
+            print(f"[VM Cleanup] Error manually cleaning up VM for task {task_id}: {e}")
+
+# Register cleanup on program exit
+atexit.register(_stop_cleanup_thread)

 def terminal_tool(
    command: Optional[str] = None,
@@ -85,23 +219,25 @@ def terminal_tool(
    session_id: Optional[str] = None,
    background: bool = False,
    idle_threshold: float = 5.0,
-    timeout: Optional[int] = None
+    timeout: Optional[int] = None,
+    task_id: Optional[str] = None
 ) -> str:
    """
    Execute a command on a Morph VM with optional interactive session support.
-    
+
    This tool uses Hecate's VM lifecycle management to automatically create
    and manage VMs. VMs are reused within the configured lifetime window
    and automatically cleaned up after inactivity.
-    
+
    Args:
        command: The command to execute (optional if continuing existing session)
        input_keys: Keystrokes to send to interactive session (e.g., "hello\\n")
        session_id: ID of existing session to continue (optional)
-        background: Whether to run the command in the background (default: False) 
+        background: Whether to run the command in the background (default: False)
        idle_threshold: Seconds to wait for output before considering session idle (default: 5.0)
        timeout: Command timeout in seconds (optional)
-    
+        task_id: Unique identifier for this task to isolate VMs between concurrent tasks (optional)
+
    Returns:
        str: JSON string containing command output, session info, exit code, and any errors
    
@@ -120,7 +256,7 @@ def terminal_tool(
        # Run a background task
        >>> result = terminal_tool(command="sleep 60", background=True)
    """
-    global _active_instance, _active_context
+    global _active_instances, _active_contexts

    try:
        # Import required modules lazily so this module can be imported
@@ -135,15 +271,14 @@ def terminal_tool(
            return json.dumps({
                "output": "",
                "screen": "",
-                "session_id": None,
                "exit_code": -1,
-                "error": f"Terminal tool is disabled due to import error: {import_error}",
-                "status": "disabled"
+                "error": f"Terminal tool is disabled due to import error: {import_error}"
            })

        # Get configuration from environment
        vm_lifetime_seconds = int(os.getenv("HECATE_VM_LIFETIME_SECONDS", "300"))
-        snapshot_id = os.getenv("HECATE_DEFAULT_SNAPSHOT_ID", "python-2025-10-31")
+        vm_ttl_seconds = int(os.getenv("HECATE_VM_TTL_SECONDS", "1200"))  # 20 minutes default
+        snapshot_id = os.getenv("HECATE_DEFAULT_SNAPSHOT_ID", "snapshot_defv9tjg")

        # Check API key
        morph_api_key = os.getenv("MORPH_API_KEY")
@@ -151,25 +286,37 @@ def terminal_tool(
            return json.dumps({
                "output": "",
                "screen": "",
-                "session_id": None,
                "exit_code": -1,
-                "error": "MORPH_API_KEY environment variable not set",
-                "status": "disabled"
+                "error": "MORPH_API_KEY environment variable not set"
            })

-        # Get or create VM instance and execution context
+        # Use task_id to isolate VMs between concurrent tasks
+        # If no task_id provided, use "default" for backward compatibility
+        effective_task_id = task_id or "default"
+
+        # Start the cleanup thread if not already running
+        _start_cleanup_thread()
+
+        # Get or create VM instance and execution context per task
        # This is critical for interactive session support - the context must persist!
        with _instance_lock:
-            if _active_instance is None:
+            if effective_task_id not in _active_instances:
                morph_client = MorphCloudClient(api_key=morph_api_key)
-                _active_instance = morph_client.instances.start(snapshot_id=snapshot_id)
+                _active_instances[effective_task_id] = morph_client.instances.start(
+                    snapshot_id=snapshot_id,
+                    ttl_seconds=vm_ttl_seconds,
+                    ttl_action="stop"
+                )

-            # Get or create persistent execution context
-            if _active_context is None:
-                _active_context = ExecutionContext()
+            # Get or create persistent execution context per task
+            if effective_task_id not in _active_contexts:
+                _active_contexts[effective_task_id] = ExecutionContext()

-            instance = _active_instance
-            ctx = _active_context
+            # Update last activity time for this VM (resets the inactivity timer)
+            _last_activity[effective_task_id] = time.time()
+
+            instance = _active_instances[effective_task_id]
+            ctx = _active_contexts[effective_task_id]

        # Build tool input based on provided parameters
        tool_input = {}
@@ -208,15 +355,13 @@ def terminal_tool(
            ctx=ctx
        )

-        # Format the result with all possible fields
+        # Format the result with only essential fields for the LLM
        # Map hecate's "stdout" to "output" for compatibility
        formatted_result = {
            "output": result.get("stdout", result.get("output", "")),
            "screen": result.get("screen", ""),
-            "session_id": result.get("session_id"),
            "exit_code": result.get("returncode", result.get("exit_code", -1)),
-            "error": result.get("error"),
-            "status": "active" if result.get("session_id") else "ended"
+            "error": result.get("error")
        }

        return json.dumps(formatted_result)
@@ -225,10 +370,8 @@ def terminal_tool(
        return json.dumps({
            "output": "",
            "screen": "",
-            "session_id": None,
            "exit_code": -1,
-            "error": f"Failed to execute terminal command: {str(e)}",
-            "status": "error"
+            "error": f"Failed to execute terminal command: {str(e)}"
        })

 def check_hecate_requirements() -> bool:
@@ -304,5 +447,6 @@ if __name__ == "__main__":
    print("\nEnvironment Variables:")
    print(f"  MORPH_API_KEY: {'Set' if os.getenv('MORPH_API_KEY') else 'Not set'}")
    print(f"  OPENAI_API_KEY: {'Set' if os.getenv('OPENAI_API_KEY') else 'Not set (optional)'}")
-    print(f"  HECATE_VM_LIFETIME_SECONDS: {os.getenv('HECATE_VM_LIFETIME_SECONDS', '300')} (default: 300)")
-    print(f"  HECATE_DEFAULT_SNAPSHOT_ID: {os.getenv('HECATE_DEFAULT_SNAPSHOT_ID', 'snapshot_p5294qxt')} (default: snapshot_p5294qxt)")
+    print(f"  HECATE_VM_TTL_SECONDS: {os.getenv('HECATE_VM_TTL_SECONDS', '1200')} (default: 1200 / 20 minutes)")
+    print(f"  HECATE_VM_LIFETIME_SECONDS: {os.getenv('HECATE_VM_LIFETIME_SECONDS', '300')} (default: 300 / 5 minutes)")
+    print(f"  HECATE_DEFAULT_SNAPSHOT_ID: {os.getenv('HECATE_DEFAULT_SNAPSHOT_ID', 'snapshot_defv9tjg')} (default: snapshot_defv9tjg)")
--- a/toolset_distributions.py
+++ b/toolset_distributions.py
@@ -61,7 +61,19 @@ DISTRIBUTIONS = {
            "terminal": 10  # 10% chance of terminal tools
        }
    },
-    
+
+    # Scientific problem-solving focused distribution
+    "science": {
+        "description": "Web research with vision analysis and reasoning",
+        "toolsets": {
+            "web": 94,      # 94% chance of web tools
+            "vision": 50,   # 50% chance of vision tools
+            "moa": 10,      # 10% chance of reasoning tools
+            "terminal": 94,  # 94% chance of terminal tools
+            "image_gen": 15  # 15% chance of image generation tools
+        }
+    },
+
    # Development-focused distribution
    "development": {
        "description": "Terminal and reasoning with occasional web lookup",
Author	SHA1	Message	Date
hjc-puro	fbd3a2fdb8	prevent leakage of morph instances between tasks	2025-11-04 03:32:43 -05:00
hjc-puro	a4db3fdee5	fix leakage	2025-11-03 17:42:23 -05:00
hjc-puro	0ca3e0aaa9	update snapshot	2025-11-02 23:13:49 -05:00
teknium	f6f75cbe2b	update webtools	2025-11-02 06:03:21 +00:00
Teknium	d4544f08c5	Merge pull request #4 from NousResearch/fix-terminal Fix terminal interactivity	2025-11-01 22:39:21 -07:00