40 changes: 40 additions & 0 deletions examples/eval_ProcessBench.py
@@ -0,0 +1,40 @@
from mmengine.config import read_base
from opencompass.models import VLLMwithChatTemplate

with read_base():
from opencompass.configs.datasets.ProcessBench.processbench_gen import processbench_datasets as processbench_datasets

from opencompass.configs.models.qwen2_5.vllm_qwen2_5_7b_instruct import models as vllm_qwen2_5_7b_instruct_model
from opencompass.configs.models.qwen2_5.vllm_qwen2_5_14b_instruct import models as vllm_qwen2_5_14b_instruct_model
from opencompass.configs.models.qwen2_5.vllm_qwen2_5_32b_instruct import models as vllm_qwen2_5_32b_instruct_model
from opencompass.configs.models.qwen2_5.vllm_qwen2_5_72b_instruct import models as vllm_qwen2_5_72b_instruct_model
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
# from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model

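# Gather every imported dataset list (*_datasets) and model list (*_model) into flat lists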
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])


from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

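# Inference: NumWorkerPartitioner splits each dataset into 8 chunks, run locally with up to 8 workers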
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)
),
)

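# Evaluation: NaivePartitioner groups tasks (n=10 per group), run locally with up to 256 workers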
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=OpenICLEvalTask)
),
)

work_dir = 'outputs/ProcessBench'
82 changes: 82 additions & 0 deletions opencompass/configs/datasets/ProcessBench/README.md
@@ -0,0 +1,82 @@
# ProcessBench: Identifying Process Errors in Mathematical Reasoning

ProcessBench is a benchmark dataset for evaluating large language models' ability to identify errors in mathematical reasoning processes. Developed by the Qwen team and published at **ACL 2025**, this dataset aims to assess whether models can accurately locate where errors occur in multi-step mathematical reasoning.

## Overview

### Purpose

While large language models excel at mathematical problem-solving, their ability to identify and locate errors in reasoning processes is equally important. ProcessBench focuses on evaluating models' capabilities in:
- **Error Detection**: Determining whether errors exist in the reasoning process
- **Error Localization**: Accurately identifying the step where the earliest error occurs
- **Process Understanding**: Deep comprehension of logical chains in multi-step mathematical reasoning

### Dataset Construction

ProcessBench contains four subsets at different difficulty levels:
- **gsm8k**: Grade school math word problems
- **math**: High school competition mathematics
- **olympiadbench**: Mathematical Olympiad problems
- **omnimath**: Comprehensive mathematical problems

Each sample includes:
- **problem**: The original math problem
- **steps**: Step-by-step solution process
- **label**: Index of the earliest erroneous step (-1 indicates no error)
- **final_answer_correct**: Whether the final answer is correct

## Usage

### 1. Data Preview

You can preview ProcessBench data using the following code:

```python
import json
from datasets import load_dataset

dataset = load_dataset('Qwen/ProcessBench', split='gsm8k')
print(json.dumps(dataset[0], indent=2))
```
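
For reference, a record carries the fields listed above. The following is an illustrative sketch with made-up values, not actual dataset content:

```python
# Illustrative ProcessBench-style record (field names from the dataset card; values are invented)
sample = {
    'problem': 'Janet has 3 apples and buys 2 more. How many apples does she have?',
    'steps': [
        'Step 1: Janet starts with 3 apples.',
        'Step 2: She buys 2 more, so 3 + 2 = 6.',  # arithmetic slip
        'Step 3: The answer is 6.',
    ],
    'label': 1,                     # index of the earliest erroneous step (-1 if none)
    'final_answer_correct': False,  # whether the final answer happens to be correct
}
```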

### 2. Evaluation with OpenCompass

To evaluate models on ProcessBench using OpenCompass:

```bash
python run.py examples/eval_ProcessBench.py
```

### 3. Configuration

The dataset configuration is located at `opencompass/configs/datasets/ProcessBench/processbench_gen.py`, supporting the following subsets:
- `processbench_gsm8k`
- `processbench_math`
- `processbench_olympiadbench`
- `processbench_omnimath`
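
If you only want to evaluate a single subset, a minimal sketch (assuming you start from the example config above, which imports `processbench_datasets`) is to filter the list by its `abbr`:

```python
# Keep only the GSM8K subset; abbrs follow the pattern 'processbench_<subset>'
processbench_datasets = [
    d for d in processbench_datasets if d['abbr'] == 'processbench_gsm8k'
]
```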

### 4. Evaluation Metrics

ProcessBench uses the following metrics to evaluate model performance:
- **error_acc**: Accuracy on erroneous samples (whether the model pinpoints the earliest erroneous step)
- **correct_acc**: Accuracy on correct samples (whether the model recognizes error-free reasoning and returns -1)
- **f1**: Harmonic mean of error_acc and correct_acc, giving a single balanced measure of performance
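
The combination of the two accuracies into F1 can be sketched as follows (mirroring the evaluator shipped in `opencompass/datasets/ProcessBench.py`):

```python
def processbench_f1(error_acc: float, correct_acc: float) -> float:
    """Harmonic mean of the two accuracies; 0.0 when both are zero."""
    if error_acc + correct_acc == 0:
        return 0.0
    return 2 * error_acc * correct_acc / (error_acc + correct_acc)

# Example: a model that locates 60% of errors and accepts 80% of correct solutions
print(processbench_f1(60.0, 80.0))  # ~68.57
```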

## Citation

```bibtex
@inproceedings{processbench,
title={ProcessBench: Identifying Process Errors in Mathematical Reasoning},
author={Chujie Zheng and Zhenru Zhang and Beichen Zhang and Runji Lin and Keming Lu and
Bowen Yu and Dayiheng Liu and Jingren Zhou and Junyang Lin},
booktitle={Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics},
year={2025}
}
```

## Resources

- 📄 [Paper](https://arxiv.org/abs/2412.06559)
- 🤗 [HuggingFace Dataset](https://huggingface.co/datasets/Qwen/ProcessBench)
- 💻 [Official Code Repository](https://github.com/QwenLM/ProcessBench)

55 changes: 55 additions & 0 deletions opencompass/configs/datasets/ProcessBench/processbench_gen.py
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import ProcessBenchEvaluator, ProcessBenchEvalDataset

PROCESSBENCH_CRITIQUE_PROMPT = """
The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

[Math Problem]

{problem}

[Solution]

{tagged_response}

Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").

Please put your final answer (i.e., the index) in \\boxed{{}}.
"""



processbench_reader_cfg = dict(input_columns=['problem', 'tagged_response', 'label'], output_column='label', test_split='test')

processbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=PROCESSBENCH_CRITIQUE_PROMPT),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))

processbench_eval_cfg = dict(
evaluator=dict(
type=ProcessBenchEvaluator,
)
)

subsets = ['gsm8k', 'math', 'olympiadbench', 'omnimath']

processbench_datasets = []

for subset in subsets:
processbench_datasets.append(
dict(
type=ProcessBenchEvalDataset,
abbr=f'processbench_{subset}',
path='Qwen/ProcessBench',
subset=subset,
reader_cfg=processbench_reader_cfg,
infer_cfg=processbench_infer_cfg,
eval_cfg=processbench_eval_cfg)
)
99 changes: 99 additions & 0 deletions opencompass/datasets/ProcessBench.py
@@ -0,0 +1,99 @@
import re

from datasets import Dataset, load_dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

from .base import BaseDataset


def extract_answer(solution_text: str):
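    """Extract the last \\boxed{...} value from a model critique; return it as an
    int step index when possible, otherwise the raw string, or None if absent."""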
boxed_pattern = r'boxed\{([^}]*)\}'
matches = re.findall(boxed_pattern, solution_text)
if matches:
try:
return int(matches[-1].replace('{', '').replace('}', '').strip())
except ValueError:
return matches[-1].replace('{', '').replace('}', '').strip()
return None


@LOAD_DATASET.register_module()
class ProcessBenchEvalDataset(BaseDataset):

@staticmethod
def load(path: str, subset: str, **kwargs):
# Load from HuggingFace datasets
input_data = load_dataset(path, split=subset)

# Process data to match expected format for inferencer
processed_data = []
for item in input_data:
problem = item['problem']
steps = item['steps']
tagged_response = ''
for sdx, step in enumerate(steps):
tagged_response += (f'<paragraph_{sdx}>\n{step}\n'
f'</paragraph_{sdx}>\n\n')
tagged_response = tagged_response.strip()

processed_data.append({
'problem': problem,
'tagged_response': tagged_response,
'label': item['label']
})

dataset = Dataset.from_list(processed_data)
return dataset


@ICL_EVALUATORS.register_module()
class ProcessBenchEvaluator(BaseEvaluator):

def score(self, predictions, references):
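        """Compute ProcessBench metrics: error_acc, correct_acc, and their harmonic mean (f1)."""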
if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

res_data = []
for i in range(len(predictions)):
d = {}
generated_critique = predictions[i]
pred = extract_answer(generated_critique)

# Convert reference to int if possible for comparison
try:
ref_label = int(references[i])
except ValueError:
ref_label = references[i]

d['generated_critique'] = generated_critique
d['prediction'] = pred
d['label'] = ref_label
d['match'] = (pred == ref_label)

res_data.append(d)

error_data = [e for e in res_data if e['label'] != -1]
correct_data = [e for e in res_data if e['label'] == -1]

acc1 = 0.0
if len(error_data) > 0:
acc1 = (sum([e['match']
for e in error_data]) / len(error_data) * 100)

acc2 = 0.0
if len(correct_data) > 0:
acc2 = (sum([e['match']
for e in correct_data]) / len(correct_data) * 100)

f1 = 0.0
if (acc1 + acc2) > 0:
f1 = 2 * acc1 * acc2 / (acc1 + acc2)

return {
'error_acc': acc1,
'correct_acc': acc2,
'f1': f1,
'details': res_data
}
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -134,6 +134,7 @@
from .physics import * # noqa: F401, F403
from .PI_LLM import PILLMDataset # noqa: F401
from .piqa import * # noqa: F401, F403
from .ProcessBench import * # noqa: F401, F403
from .ProteinLMBench import * # noqa: F401, F403
from .py150 import * # noqa: F401, F403
from .qasper import * # noqa: F401, F403