This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).

**UPDATE (6/15/2025): We now support running SWE-bench-Live evaluation (see the paper [here](https://arxiv.org/abs/2505.23419))! For how to run it, check out [this README](./SWE-bench-Live.md).**

**UPDATE (5/26/2025): We now support running interactive SWE-Bench evaluation (see the paper [here](https://arxiv.org/abs/2502.13069))! For how to run it, check out [this README](./SWE-Interact.md).**

**UPDATE (4/8/2025): We now support running SWT-Bench evaluation! For more details, check out [the corresponding section](#SWT-Bench-Evaluation).**
## SWE-bench-Live Evaluation

SWE-bench-Live is a live benchmark for issue resolution that provides a dataset of the latest issue tasks. This section explains how to run the OpenHands evaluation on SWE-bench-Live.

Since SWE-bench-Live uses an almost identical setup to SWE-bench, you only need to change the dataset name to `SWE-bench-Live/SWE-bench-Live`; everything else works the same as running on SWE-bench.
## Setting Up
Set up the development environment and configure your LLM provider by following the [README](README.md).
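The LLM config name you pass to the run script below (`llm.your_llm`) should match a config group in your `config.toml`; here is a minimal sketch, with the group name, model, and key shown only as placeholder assumptions:

```toml
# Hypothetical example: rename "your_llm" to whatever you pass to run_infer.sh,
# and substitute your own model and API key.
[llm.your_llm]
model = "anthropic/claude-3-5-sonnet-20241022"
api_key = "your-api-key"
temperature = 0.0
```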
## Running Inference
Use the same script, but change the dataset name to `SWE-bench-Live/SWE-bench-Live` and select the split (either `lite` or `full`). The lite split contains 300 instances from the past six months, while the full split includes 1,319 instances created after 2024. In the original SWE-bench-Live paper, `max_iterations` is set to 100.
```shell
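# Arguments: <llm config> <git ref> <agent> <eval limit> <max iterations> <num workers> <dataset> <split>
# (same positional usage as the standard SWE-bench run_infer.sh)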
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.your_llm HEAD CodeActAgent 300 100 3 SWE-bench-Live/SWE-bench-Live lite
```
## Evaluating Results
After OpenHands has generated a patch for each issue, we evaluate the results using the [SWE-bench-Live evaluation harness](https://github.com/microsoft/SWE-bench-Live).

Convert the outputs to the SWE-bench predictions format; you can find `output.jsonl` in `evaluation/evaluation_outputs`.
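As a minimal sketch of the conversion (assuming each line of `output.jsonl` keeps the generated patch under `test_result.git_patch`, and that the harness expects `instance_id`, `model_name_or_path`, and `model_patch` keys), you could use `jq`:

```shell
# Sketch only: the input field names are assumptions about the OpenHands output format.
jq -c '{instance_id: .instance_id,
        model_name_or_path: "openhands",
        model_patch: .test_result.git_patch}' \
  path/to/output.jsonl > preds.jsonl
```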
Please refer to the original [SWE-bench-Live repository](https://github.com/microsoft/SWE-bench-Live) to set up the evaluation harness and use the provided scripts to generate the evaluation report:
```shell
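# --namespace below presumably points at the Docker Hub namespace hosting prebuilt SWE-bench-Live instance images.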
python -m swebench.harness.run_evaluation \
    --dataset_name SWE-bench-Live/SWE-bench-Live \
    --split lite \
    --namespace starryzhang \
    --predictions_path preds.jsonl \
    --max_workers 10 \
    --run_id openhands
```
## Citation
```bibtex
@article{zhang2025swebenchgoeslive,
  title={SWE-bench Goes Live!},
  author={Linghao Zhang and Shilin He and Chaoyun Zhang and Yu Kang and Bowen Li and Chengxing Xie and Junhao Wang and Maoquan Wang and Yufan Huang and Shengyu Fu and Elsie Nallipogu and Qingwei Lin and Yingnong Dang and Saravan Rajmohan and Dongmei Zhang},
  journal={arXiv preprint arXiv:2505.23419},
  year={2025}
}
```