[GAIA] Add prompt improvement to alleviate solution parsing issue & support Tavily search tools (OpenHands#9057)

ryanhoangt · web-flow · commit ddaa186971a8 · 2025-06-17T13:16:50.000+07:00
diff --git a/evaluation/benchmarks/gaia/.gitignore b/evaluation/benchmarks/gaia/.gitignore
@@ -0,0 +1 @@
+data/
diff --git a/evaluation/benchmarks/gaia/README.md b/evaluation/benchmarks/gaia/README.md
@@ -6,6 +6,13 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench
 
 Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
 
+To enable the Tavily MCP server, add your Tavily API key under the `[core]` section of your `config.toml` file, as shown below:
+
+```toml
+[core]
+search_api_key = "tvly-******"
+```
+
 ## Run the evaluation
 
 We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
@@ -1,11 +1,13 @@
 import asyncio
+import copy
 import functools
 import os
 import re
 
 import huggingface_hub
 import pandas as pd
 from datasets import load_dataset
+from pydantic import SecretStr
 
 from evaluation.benchmarks.gaia.scorer import question_scorer
 from evaluation.utils.shared import (
@@ -24,6 +26,7 @@
     OpenHandsConfig,
     get_llm_config_arg,
     get_parser,
+    load_from_toml,
 )
 from openhands.core.config.utils import get_agent_config_arg
 from openhands.core.logger import openhands_logger as logger
@@ -41,15 +44,15 @@
 }
 
 AGENT_CLS_TO_INST_SUFFIX = {
-    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
+    'CodeActAgent': 'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool. Your final answer MUST be encapsulated within <solution> and </solution>.\n'
 }
 
 
 def get_config(
     metadata: EvalMetadata,
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
     config = OpenHandsConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
@@ -67,6 +70,11 @@ def get_config(
         logger.info('Agent config not provided, using default settings')
         agent_config = config.get_agent_config(metadata.agent_class)
         agent_config.enable_prompt_extensions = False
+
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if config_copy.search_api_key:
+        config.search_api_key = SecretStr(config_copy.search_api_key)
     return config
 
 
@@ -134,16 +142,26 @@ def process_instance(
         dest_file = None
 
     # Prepare instruction
-    instruction = f'{instance["Question"]}\n'
+    instruction = """You have one question to answer. It is paramount that you provide a correct answer.
+Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
+You must make sure you find the correct answer! You MUST strictly follow the task-specific formatting instructions for your final answer.
+Here is the task:
+{task_question}
+""".format(
+        task_question=instance['Question'],
+    )
     logger.info(f'Instruction: {instruction}')
     if dest_file:
         instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
 
-    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-    instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+    instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
+    instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
+    instruction += 'IMPORTANT: Please encapsulate your final answer (answer ONLY) within <solution> and </solution>. Your answer will be evaluated using string matching approaches so it important that you STRICTLY adhere to the output formatting instructions specified in the task (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)\n'
     instruction += (
         'For example: The answer to the question is <solution> 42 </solution>.\n'
     )
+    instruction += "IMPORTANT: Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, express it numerically (i.e., with digits rather than words), do not use commas, and do not include units such as $ or percent signs unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities). If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.\n"
+
     # NOTE: You can actually set slightly different instruction for different agents
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
@@ -175,7 +193,7 @@ def process_instance(
     for event in reversed(state.history):
         if event.source == 'agent':
             if isinstance(event, AgentFinishAction):
-                model_answer_raw = event.thought
+                model_answer_raw = event.final_thought
                 break
             elif isinstance(event, CmdRunAction):
                 model_answer_raw = event.thought
@@ -222,6 +240,7 @@ def process_instance(
         error=state.last_error if state and state.last_error else None,
         test_result=test_result,
     )
+    runtime.close()
     return output
 
 
@@ -253,6 +272,8 @@ def process_instance(
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
+    toml_config = OpenHandsConfig()
+    load_from_toml(toml_config)
     metadata = make_metadata(
         llm_config=llm_config,
         dataset_name='gaia',
@@ -261,7 +282,10 @@ def process_instance(
         eval_note=args.eval_note,
         eval_output_dir=args.eval_output_dir,
         data_split=args.data_split,
-        details={'gaia-level': args.level},
+        details={
+            'gaia-level': args.level,
+            'mcp-servers': ['tavily'] if toml_config.search_api_key else [],
+        },
         agent_config=agent_config,
     )
 
diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -39,7 +39,7 @@ echo "LEVELS: $LEVELS"
 COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
-  --max-iterations 30 \
+  --max-iterations 60 \
   --level $LEVELS \
   --data-split validation \
   --eval-num-workers $NUM_WORKERS \
diff --git a/openhands/cli/main.py b/openhands/cli/main.py
@@ -273,9 +273,9 @@ def on_event(event: Event) -> None:
             )
         )
 
-        config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)
+        runtime.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)
 
-        await add_mcp_tools_to_agent(agent, runtime, memory, config)
+        await add_mcp_tools_to_agent(agent, runtime, memory)
 
     # Clear loading animation
     is_loaded.set()
diff --git a/openhands/core/main.py b/openhands/core/main.py
@@ -139,9 +139,9 @@ async def run_controller(
                 config.mcp_host, config, None
             )
         )
-        config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)
+        runtime.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)
 
-        await add_mcp_tools_to_agent(agent, runtime, memory, config)
+        await add_mcp_tools_to_agent(agent, runtime, memory)
 
     replay_events: list[Event] | None = None
     if config.replay_trajectory_path:
diff --git a/openhands/mcp/utils.py b/openhands/mcp/utils.py
@@ -10,7 +10,6 @@
     MCPSHTTPServerConfig,
     MCPSSEServerConfig,
 )
-from openhands.core.config.openhands_config import OpenHandsConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.mcp import MCPAction
 from openhands.events.observation.mcp import MCPObservation
@@ -187,9 +186,7 @@ async def call_tool_mcp(mcp_clients: list[MCPClient], action: MCPAction) -> Obse
     )
 
 
-async def add_mcp_tools_to_agent(
-    agent: 'Agent', runtime: Runtime, memory: 'Memory', app_config: OpenHandsConfig
-):
+async def add_mcp_tools_to_agent(agent: 'Agent', runtime: Runtime, memory: 'Memory'):
     """
     Add MCP tools to an agent.
     """
@@ -208,7 +205,6 @@ async def add_mcp_tools_to_agent(
     extra_stdio_servers = []
 
     # Add microagent MCP tools if available
-    mcp_config: MCPConfig = app_config.mcp
     microagent_mcp_configs = memory.get_microagent_mcp_tools()
     for mcp_config in microagent_mcp_configs:
         if mcp_config.sse_servers:
diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py
@@ -158,7 +158,7 @@ async def start(
             # NOTE: this needs to happen before controller is created
             # so MCP tools can be included into the SystemMessageAction
             if self.runtime and runtime_connected and agent.config.enable_mcp:
-                await add_mcp_tools_to_agent(agent, self.runtime, self.memory, config)
+                await add_mcp_tools_to_agent(agent, self.runtime, self.memory)
 
             if replay_json:
                 initial_message = self._run_replay(
diff --git a/tests/runtime/test_microagent.py b/tests/runtime/test_microagent.py
@@ -385,7 +385,6 @@ async def test_add_mcp_tools_from_microagents():
     """Test that add_mcp_tools_to_agent adds tools from microagents."""
     # Import ActionExecutionClient for mocking
 
-    from openhands.core.config.openhands_config import OpenHandsConfig
     from openhands.runtime.impl.action_execution.action_execution_client import (
         ActionExecutionClient,
     )
@@ -394,10 +393,6 @@ async def test_add_mcp_tools_from_microagents():
     mock_agent = MagicMock()
     mock_runtime = MagicMock(spec=ActionExecutionClient)
     mock_memory = MagicMock()
-    mock_mcp_config = MCPConfig()
-
-    # Create a mock OpenHandsConfig with the MCP config
-    mock_app_config = OpenHandsConfig(mcp=mock_mcp_config, search_api_key=None)
 
     # Configure the mock memory to return a microagent MCP config
     mock_stdio_server = MCPStdioServerConfig(
@@ -425,9 +420,7 @@ async def test_add_mcp_tools_from_microagents():
         new=AsyncMock(return_value=[mock_tool]),
     ):
         # Call the function; it no longer takes an app config argument
-        await add_mcp_tools_to_agent(
-            mock_agent, mock_runtime, mock_memory, mock_app_config
-        )
+        await add_mcp_tools_to_agent(mock_agent, mock_runtime, mock_memory)
 
         # Verify that the memory's get_microagent_mcp_tools was called
         mock_memory.get_microagent_mcp_tools.assert_called_once()
diff --git a/tests/unit/test_agent_controller.py b/tests/unit/test_agent_controller.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 from unittest.mock import ANY, AsyncMock, MagicMock, patch
 from uuid import uuid4
 
@@ -259,6 +260,7 @@ def on_event(event: Event):
 
     test_event_stream.subscribe(EventStreamSubscriber.RUNTIME, on_event, str(uuid4()))
     runtime.event_stream = test_event_stream
+    runtime.config = copy.deepcopy(config)
 
     def on_event_memory(event: Event):
         if isinstance(event, RecallAction):
@@ -326,6 +328,7 @@ def on_event(event: Event):
 
     test_event_stream.subscribe(EventStreamSubscriber.RUNTIME, on_event, str(uuid4()))
     runtime.event_stream = test_event_stream
+    runtime.config = copy.deepcopy(config)
 
     def on_event_memory(event: Event):
         if isinstance(event, RecallAction):
@@ -762,6 +765,7 @@ def on_event(event: Event):
 
     event_stream.subscribe(EventStreamSubscriber.RUNTIME, on_event, str(uuid4()))
     runtime.event_stream = event_stream
+    runtime.config = copy.deepcopy(config)
 
     def on_event_memory(event: Event):
         if isinstance(event, RecallAction):
@@ -883,15 +887,17 @@ def on_event_memory(event: Event):
     test_event_stream.subscribe(
         EventStreamSubscriber.MEMORY, on_event_memory, str(uuid4())
     )
+    config = OpenHandsConfig(max_iterations=max_iterations)
     mock_runtime.event_stream = test_event_stream
+    mock_runtime.config = copy.deepcopy(config)
 
     # Now we can run the controller for a fixed number of steps. Since the step
     # state is set to error out before then, if this terminates and we have a
     # record of the error being thrown we can be confident that the controller
     # handles the truncation correctly.
     final_state = await asyncio.wait_for(
         run_controller(
-            config=OpenHandsConfig(max_iterations=max_iterations),
+            config=config,
             initial_user_action=MessageAction(content='INITIAL'),
             runtime=mock_runtime,
             sid='test',
@@ -1027,11 +1033,13 @@ def on_event_memory(event: Event):
         EventStreamSubscriber.MEMORY, on_event_memory, str(uuid4())
     )
     mock_runtime.event_stream = test_event_stream
+    config = OpenHandsConfig(max_iterations=5)
+    mock_runtime.config = copy.deepcopy(config)
 
     try:
         state = await asyncio.wait_for(
             run_controller(
-                config=OpenHandsConfig(max_iterations=5),
+                config=config,
                 initial_user_action=MessageAction(content='INITIAL'),
                 runtime=mock_runtime,
                 sid='test',
@@ -1104,10 +1112,12 @@ def on_event_memory(event: Event):
         EventStreamSubscriber.MEMORY, on_event_memory, str(uuid4())
     )
     mock_runtime.event_stream = test_event_stream
+    config = OpenHandsConfig(max_iterations=3)
+    mock_runtime.config = copy.deepcopy(config)
     try:
         state = await asyncio.wait_for(
             run_controller(
-                config=OpenHandsConfig(max_iterations=3),
+                config=config,
                 initial_user_action=MessageAction(content='INITIAL'),
                 runtime=mock_runtime,
                 sid='test',
@@ -1167,6 +1177,7 @@ def agent_step_fn(state):
 
     runtime = MagicMock(spec=ActionExecutionClient)
     runtime.event_stream = event_stream
+    runtime.config = copy.deepcopy(config)
 
     # Create a real Memory instance
     memory = Memory(event_stream=event_stream, sid='test-memory')
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -208,9 +208,7 @@ async def test_run_session_without_initial_action(
     mock_display_runtime_init.assert_called_once_with('local')
     mock_display_animation.assert_called_once()
     mock_create_agent.assert_called_once_with(mock_config)
-    mock_add_mcp_tools.assert_called_once_with(
-        mock_agent, mock_runtime, mock_memory, mock_config
-    )
+    mock_add_mcp_tools.assert_called_once_with(mock_agent, mock_runtime, mock_memory)
     mock_create_runtime.assert_called_once()
     mock_create_controller.assert_called_once()
     mock_create_memory.assert_called_once()

Original file line number	Diff line number	Diff line change
`@@ -273,9 +273,9 @@ def on_event(event: Event) -> None:`
`273`	`273`	`)`
`274`	`274`	`)`
`275`	`275`
`276`		`- config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)`
	`276`	`+ runtime.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)`
`277`	`277`
`278`		`- await add_mcp_tools_to_agent(agent, runtime, memory, config)`
	`278`	`+ await add_mcp_tools_to_agent(agent, runtime, memory)`
`279`	`279`
`280`	`280`	`# Clear loading animation`
`281`	`281`	`is_loaded.set()`
Original file line number	Diff line number	Diff line change
`@@ -139,9 +139,9 @@ async def run_controller(`
`139`	`139`	`config.mcp_host, config, None`
`140`	`140`	`)`
`141`	`141`	`)`
`142`		`- config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)`
	`142`	`+ runtime.config.mcp.stdio_servers.extend(openhands_mcp_stdio_servers)`
`143`	`143`
`144`		`- await add_mcp_tools_to_agent(agent, runtime, memory, config)`
	`144`	`+ await add_mcp_tools_to_agent(agent, runtime, memory)`
`145`	`145`
`146`	`146`	`replay_events: list[Event] \| None = None`
`147`	`147`	`if config.replay_trajectory_path:`