Add ProcessBench dataset and evaluation configuration #2274
Open
sudanl wants to merge 6 commits into open-compass:main from sudanl:processbench
+277 −0
Commits (6, showing changes from all commits):
- 822e0d0 Add ProcessBench dataset and evaluation configuration (sudanl)
- 7777f57 Enable ProcessBench subsets (sudanl)
- 4ee349d Update ProcessBench.py (sudanl)
- 233290d Merge branch 'open-compass:main' into processbench (sudanl)
- d56c9f2 Merge branch 'open-compass:main' into processbench (sudanl)
- d521e01 new file: opencompass/configs/datasets/ProcessBench/README.md (sudanl)
New file (40 additions): the evaluation entry config. The path is not shown in this diff view; the README below invokes it as `examples/eval_ProcessBench.py`.

```python
from mmengine.config import read_base
from opencompass.models import VLLMwithChatTemplate

with read_base():
    from opencompass.configs.datasets.ProcessBench.processbench_gen import \
        processbench_datasets as processbench_datasets

    from opencompass.configs.models.qwen2_5.vllm_qwen2_5_7b_instruct import \
        models as vllm_qwen2_5_7b_instruct_model
    from opencompass.configs.models.qwen2_5.vllm_qwen2_5_14b_instruct import \
        models as vllm_qwen2_5_14b_instruct_model
    from opencompass.configs.models.qwen2_5.vllm_qwen2_5_32b_instruct import \
        models as vllm_qwen2_5_32b_instruct_model
    from opencompass.configs.models.qwen2_5.vllm_qwen2_5_72b_instruct import \
        models as vllm_qwen2_5_72b_instruct_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
    #     models as lmdeploy_qwen2_5_7b_instruct_model
    # from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
    #     models as lmdeploy_qwen2_5_72b_instruct_model

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask)
    ),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask)
    ),
)

work_dir = 'outputs/ProcessBench'
```
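A note on the aggregation idiom at the end of this config: names imported under `read_base()` that end in `_datasets` or `_model` are flattened into the `datasets` and `models` lists. The snippet below is a minimal, self-contained sketch of that idiom with hypothetical stand-in names; it is not part of the PR.

```python
# Stand-ins for lists that would normally be imported via read_base().
gsm8k_datasets = [{'abbr': 'processbench_gsm8k'}]
math_datasets = [{'abbr': 'processbench_math'}]

# Same flattening idiom as in the config above: concatenate all *_datasets lists.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['processbench_gsm8k', 'processbench_math']
```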
opencompass/configs/datasets/ProcessBench/README.md (new file, 82 additions):

# ProcessBench: Identifying Process Errors in Mathematical Reasoning

ProcessBench is a benchmark dataset for evaluating large language models' ability to identify errors in mathematical reasoning processes. Developed by the Qwen team and published at **ACL 2025**, this dataset assesses whether models can accurately locate where errors occur in multi-step mathematical reasoning.

## Overview

### Purpose

While large language models excel at mathematical problem-solving, their ability to identify and locate errors in reasoning processes is equally important. ProcessBench evaluates models' capabilities in:
- **Error Detection**: Determining whether errors exist in the reasoning process
- **Error Localization**: Accurately identifying the step where the earliest error occurs
- **Process Understanding**: Deep comprehension of logical chains in multi-step mathematical reasoning

### Dataset Construction

ProcessBench contains four subsets at different difficulty levels:
- **gsm8k**: Grade school math word problems
- **math**: High school competition mathematics
- **olympiadbench**: Mathematical Olympiad problems
- **omnimath**: Comprehensive mathematical problems

Each sample includes the following fields (an illustrative record is sketched after this list):
- **problem**: The original math problem
- **steps**: The step-by-step solution process
- **label**: Index of the earliest erroneous step (-1 indicates no error)
- **final_answer_correct**: Whether the final answer is correct
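For orientation, a record has roughly the following shape. All values below are made up for illustration and are not taken from the dataset.

```python
# Hypothetical ProcessBench-style record (illustrative values only).
example = {
    'problem': 'Tom buys 3 apples for $2 each. How much does he spend in total?',
    'steps': [
        'Each apple costs $2 and Tom buys 3 apples.',
        'Total cost = 3 * 2 = $5.',  # arithmetic slip introduced on purpose
        'So Tom spends $5 in total.',
    ],
    'label': 1,                      # index of the earliest erroneous step
    'final_answer_correct': False,   # the final answer is wrong as a result
}
```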
## Usage

### 1. Data Preview

You can preview ProcessBench data using the following code:

```python
import json
from datasets import load_dataset

dataset = load_dataset('Qwen/ProcessBench', split='gsm8k')
print(json.dumps(dataset[0], indent=2))
```

### 2. Evaluation with OpenCompass

To evaluate models on ProcessBench using OpenCompass:

```bash
python run.py examples/eval_ProcessBench.py
```

### 3. Configuration

The dataset configuration is located at `opencompass/configs/datasets/ProcessBench/processbench_gen.py` and supports the following subsets (a sketch for selecting a single subset follows this list):
- `processbench_gsm8k`
- `processbench_math`
- `processbench_olympiadbench`
- `processbench_omnimath`
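To run only one subset, a minimal sketch (assuming the config is imported the same way as in the evaluation entry config) is to filter the dataset list by its `abbr`:

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.ProcessBench.processbench_gen import \
        processbench_datasets

# Keep only the GSM8K subset; each entry carries an abbr such as 'processbench_gsm8k'.
datasets = [d for d in processbench_datasets if d['abbr'] == 'processbench_gsm8k']
```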
### 4. Evaluation Metrics

ProcessBench uses the following metrics to evaluate model performance (the aggregation is sketched after this list):
- **error_acc**: Accuracy on erroneous samples (whether the model accurately locates the earliest error step)
- **correct_acc**: Accuracy on correct samples (whether the model recognizes error-free reasoning)
- **f1**: Harmonic mean of error_acc and correct_acc, providing a single overall measure of performance
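As a minimal illustration of how the final score is aggregated, mirroring the evaluator added in this PR (the numbers are toy values):

```python
# Toy numbers; the evaluator reports accuracies on a 0-100 scale.
error_acc = 60.0    # accuracy on samples with label != -1
correct_acc = 80.0  # accuracy on samples with label == -1

f1 = 2 * error_acc * correct_acc / (error_acc + correct_acc)  # harmonic mean
print(round(f1, 2))  # 68.57
```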
## Citation

```bibtex
@inproceedings{processbench,
  title={ProcessBench: Identifying Process Errors in Mathematical Reasoning},
  author={Chujie Zheng and Zhenru Zhang and Beichen Zhang and Runji Lin and Keming Lu and Bowen Yu and Dayiheng Liu and Jingren Zhou and Junyang Lin},
  booktitle={The 63rd Annual Meeting of the Association for Computational Linguistics},
  year={2025}
}
```

## Resources

- 📄 [Paper](https://arxiv.org/abs/2412.06559)
- 🤗 [HuggingFace Dataset](https://huggingface.co/datasets/Qwen/ProcessBench)
- 💻 [Official Code Repository](https://github.com/QwenLM/ProcessBench)
opencompass/configs/datasets/ProcessBench/processbench_gen.py (55 additions, 0 deletions):

```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import ProcessBenchEvaluator, ProcessBenchEvalDataset

PROCESSBENCH_CRITIQUE_PROMPT = """
The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

[Math Problem]

{problem}

[Solution]

{tagged_response}

Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").

Please put your final answer (i.e., the index) in \\boxed{{}}.
"""

processbench_reader_cfg = dict(
    input_columns=['problem', 'tagged_response', 'label'],
    output_column='label',
    test_split='test')

processbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt=PROCESSBENCH_CRITIQUE_PROMPT),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

processbench_eval_cfg = dict(
    evaluator=dict(
        type=ProcessBenchEvaluator,
    )
)

subsets = ['gsm8k', 'math', 'olympiadbench', 'omnimath']

processbench_datasets = []

for subset in subsets:
    processbench_datasets.append(
        dict(
            type=ProcessBenchEvalDataset,
            abbr=f'processbench_{subset}',
            path='Qwen/ProcessBench',
            subset=subset,
            reader_cfg=processbench_reader_cfg,
            infer_cfg=processbench_infer_cfg,
            eval_cfg=processbench_eval_cfg)
    )
```
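For intuition about what the model actually receives, the sketch below renders a shortened stand-in for `PROCESSBENCH_CRITIQUE_PROMPT` on a made-up two-step solution; the `<paragraph_i>` tagging mirrors what the dataset loader in ProcessBench.py below produces.

```python
# Shortened stand-in for the full critique prompt defined above.
template = ('[Math Problem]\n\n{problem}\n\n[Solution]\n\n{tagged_response}\n\n'
            'Return the index of the earliest erroneous paragraph in \\boxed{{}}.')

steps = ['2 + 3 = 5.', 'Then 5 * 4 = 24.']  # made-up solution with an error in step 1
tagged_response = '\n\n'.join(
    f'<paragraph_{i}>\n{s}\n</paragraph_{i}>' for i, s in enumerate(steps))

print(template.format(problem='Compute (2 + 3) * 4.',
                      tagged_response=tagged_response))
```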
ProcessBench.py (99 additions): the ProcessBench dataset loader and evaluator.

```python
import re

from datasets import Dataset, load_dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

from .base import BaseDataset


def extract_answer(solution_text: str):
    boxed_pattern = r'boxed\{([^}]*)\}'
    matches = re.findall(boxed_pattern, solution_text)
    if matches:
        try:
            return int(matches[-1].replace('{', '').replace('}', '').strip())
        except ValueError:
            return matches[-1].replace('{', '').replace('}', '').strip()
    return None


@LOAD_DATASET.register_module()
class ProcessBenchEvalDataset(BaseDataset):

    @staticmethod
    def load(path: str, subset: str, **kwargs):
        # Load from HuggingFace datasets
        input_data = load_dataset(path, split=subset)

        # Process data to match expected format for inferencer
        processed_data = []
        for item in input_data:
            problem = item['problem']
            steps = item['steps']
            tagged_response = ''
            for sdx, step in enumerate(steps):
                tagged_response += (f'<paragraph_{sdx}>\n{step}\n'
                                    f'</paragraph_{sdx}>\n\n')
            tagged_response = tagged_response.strip()

            processed_data.append({
                'problem': problem,
                'tagged_response': tagged_response,
                'label': item['label']
            })

        dataset = Dataset.from_list(processed_data)
        return dataset


@ICL_EVALUATORS.register_module()
class ProcessBenchEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

        res_data = []
        for i in range(len(predictions)):
            d = {}
            generated_critique = predictions[i]
            pred = extract_answer(generated_critique)

            # Convert reference to int if possible for comparison
            try:
                ref_label = int(references[i])
            except ValueError:
                ref_label = references[i]

            d['generated_critique'] = generated_critique
            d['prediction'] = pred
            d['label'] = ref_label
            d['match'] = (pred == ref_label)

            res_data.append(d)

        error_data = [e for e in res_data if e['label'] != -1]
        correct_data = [e for e in res_data if e['label'] == -1]

        acc1 = 0.0
        if len(error_data) > 0:
            acc1 = (sum([e['match']
                         for e in error_data]) / len(error_data) * 100)

        acc2 = 0.0
        if len(correct_data) > 0:
            acc2 = (sum([e['match']
                         for e in correct_data]) / len(correct_data) * 100)

        f1 = 0.0
        if (acc1 + acc2) > 0:
            f1 = 2 * acc1 * acc2 / (acc1 + acc2)

        return {
            'error_acc': acc1,
            'correct_acc': acc2,
            'f1': f1,
            'details': res_data
        }
```
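A quick sanity check of the evaluator on hand-written critiques; the import assumes `ProcessBenchEvaluator` is re-exported from `opencompass.datasets`, as the generation config in this PR expects.

```python
from opencompass.datasets import ProcessBenchEvaluator

# Two toy critiques: one locating an error at step 2, one reporting no error.
predictions = [
    'The first mistake appears in paragraph 2, so the answer is \\boxed{2}.',
    'Every step checks out, so the answer is \\boxed{-1}.',
]
references = [2, -1]  # gold labels: earliest error step, or -1 for no error

result = ProcessBenchEvaluator().score(predictions, references)
print(result['error_acc'], result['correct_acc'], result['f1'])  # 100.0 100.0 100.0
```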