diff --git a/language/gpt-oss/.gitignore b/language/gpt-oss/.gitignore
new file mode 100644
index 0000000000..78317dc552
--- /dev/null
+++ b/language/gpt-oss/.gitignore
@@ -0,0 +1,3 @@
+*venv*
+*.pkl
+*.csv
\ No newline at end of file
diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md
new file mode 100644
index 0000000000..593721aebd
--- /dev/null
+++ b/language/gpt-oss/README.md
@@ -0,0 +1,103 @@
+# MLPerf Inference reference implementation for GPT-OSS-120B
+This is the reference implementation for GPT-OSS-120B. It is a proposal and a work in progress (WIP).
+
+## Model and Dataset download
+
+* Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a)
+* Dataset: Please request access at [this link](https://drive.google.com/drive/folders/1DCfEXHqe69okrqKbSyV-8VUw413JqpPY?usp=drive_link) - **this is a tentative dataset**
+
+Verify the dataset contents by computing the `sha1sum`:
+```bash
+$ sha1sum gptoss-*-eval.pkl
+35228fcf5581b916e70920748baf2c016ea2c06b  gptoss-acc-eval.pkl
+ddec911ad479fc4b30ef1c050c9dea63134c090e  gptoss-perf-eval.pkl
+
+```
+
+## Environment setup
+The reference implementation is developed using the SGLang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided at [`setup_enroot.sh`](./setup_enroot.sh). All sections below assume this environment has been instantiated.
+
+Once in the environment, install additional requirements using [`setup.sh`](./setup.sh): 
+```bash
+./setup.sh
+```
+
+## Running the reference implementation: SGLang
+Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang server hosting `gpt-oss-120b`.
+
+### Run the server
+```bash
+./run_server.sh \
+  --model_path path/to/gpt-oss-120b/model \
+  --dp N  \
+  --stream_interval 100 \
+  --eagle_path optional/path/to/eagle/head
+```
+The script uses `python3 -m sglang.launch_server` to instantiate the model, with `tp=pp=ep=1`, and `dp` as specified.
+
+Then, run the benchmark script, which uses the client to send requests to the server and receive its responses.
+### Run the inference
+```bash
+python3 run_mlperf.py --help
+usage: run_mlperf.py [-h] [--scenario {offline,server}] --input-file INPUT_FILE [--max-samples MAX_SAMPLES] [--mlperf-conf MLPERF_CONF]
+                     [--user-conf USER_CONF] [--accuracy] [--output-dir OUTPUT_DIR] [--backend {sglang}] [--server-url SERVER_URL]
+                     [--generation-config GENERATION_CONFIG] [--max-new-tokens MAX_NEW_TOKENS] [--num-workers NUM_WORKERS]
+                     [--max-concurrency MAX_CONCURRENCY]
+
+Run MLPerf inference benchmarks for gpt-oss
+
+options:
+  -h, --help            show this help message and exit
+  --scenario {offline,server}
+                        MLPerf scenario mode
+  --input-file INPUT_FILE
+                        Path to tokenized dataset (pickle file)
+  --max-samples MAX_SAMPLES
+                        Maximum number of samples to use (None for all)
+  --mlperf-conf MLPERF_CONF
+                        Path to MLPerf configuration file
+  --user-conf USER_CONF
+                        Path to user configuration file
+  --accuracy            Run accuracy mode instead of performance
+  --output-dir OUTPUT_DIR
+                        Directory for MLPerf output logs
+  --backend {sglang}    Backend to use for inference
+  --server-url SERVER_URL
+                        Server URL for backend (SGLang)
+  --generation-config GENERATION_CONFIG
+                        Path to generation configuration JSON file
+  --max-new-tokens MAX_NEW_TOKENS
+                        Override max_new_tokens from generation config (default: use value from config)
+  --num-workers NUM_WORKERS
+                        Number of worker threads (for server scenario)
+  --max-concurrency MAX_CONCURRENCY
+                        Maximum concurrent requests to backend (SGLang handles batching internally)
+
+```
+
+### Evaluate the accuracy
+Run `run_mlperf.py` with `--accuracy`, and then use the generated `mlperf_log_accuracy.json` to evaluate the accuracy of the run. Usage is as below.
+```bash
+python3 eval_mlperf_accuracy.py --help
+usage: eval_mlperf_accuracy.py [-h] --mlperf-log MLPERF_LOG --reference-data REFERENCE_DATA [--tokenizer TOKENIZER] [--output-file OUTPUT_FILE]
+                               [--save-outputs SAVE_OUTPUTS] [--num-lcb-workers NUM_LCB_WORKERS] [--verbose]
+
+Evaluate MLPerf accuracy logs for gpt-oss-120b
+
+options:
+  -h, --help            show this help message and exit
+  --mlperf-log MLPERF_LOG
+                        Path to mlperf_log_accuracy.json
+  --reference-data REFERENCE_DATA
+                        Path to reference pickle file (DataFrame with dataset, ground_truth, etc.)
+  --tokenizer TOKENIZER
+                        HuggingFace tokenizer name or path
+  --output-file OUTPUT_FILE
+                        Output JSON file for results (optional)
+  --save-outputs SAVE_OUTPUTS
+                        Save detokenized outputs to pickle file (ordered by qsl_idx) for debugging
+  --num-lcb-workers NUM_LCB_WORKERS
+                        Number of parallel workers for LiveCodeBench evaluation (default: 64)
+  --verbose             Verbose logging
+
+```
\ No newline at end of file
diff --git a/language/gpt-oss/archive/collect_results_csv.py b/language/gpt-oss/archive/collect_results_csv.py
new file mode 100755
index 0000000000..10c93a2a37
--- /dev/null
+++ b/language/gpt-oss/archive/collect_results_csv.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+"""
+Collect results from multiple summarize_eval.py JSON outputs into a CSV.
+
+The CSV format shows:
+- Each row represents one dataset from one JSON file
+- Columns: dataset, run_1, run_2, ..., run_k, pass@k
+- Values are the "correct" counts (number of correct answers)
+"""
+
+import argparse
+import json
+import csv
+import sys
+import glob
+from pathlib import Path
+from typing import List, Dict, Any
+from collections import defaultdict
+
+
+def expand_glob_patterns(patterns: List[str]) -> List[str]:
+    """Expand file paths and glob patterns into a sorted, de-duplicated list.
+
+    Args:
+        patterns: List of file paths or glob patterns (e.g., '*.json', 'results_*_summarize.json')
+
+    Returns:
+        Sorted list of unique paths; unmatched entries are warned about on stderr
+    """
+    expanded_files = []
+
+    for pattern in patterns:
+        has_wildcard = any(c in pattern for c in '*?[]')
+        if Path(pattern).exists() and not has_wildcard:
+            # Literal path that exists: use it directly
+            expanded_files.append(pattern)
+            continue
+
+        matches = glob.glob(pattern)
+        if not matches and Path(pattern).exists():
+            # Existing file whose name contains glob metacharacters (e.g.
+            # 'results[1].json'): glob misreads it, so keep the literal path
+            matches = [pattern]
+
+        if matches:
+            expanded_files.extend(matches)
+        elif has_wildcard:
+            print(
+                f"Warning: No files matched pattern: {pattern}",
+                file=sys.stderr)
+        else:
+            print(f"Warning: File not found: {pattern}", file=sys.stderr)
+
+    # Remove duplicates and sort
+    return sorted(set(expanded_files))
+
+
+def load_json_summary(json_path: str) -> Dict[str, Any]:
+    """Load a JSON summary file (decoded as UTF-8, the JSON interchange encoding)."""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]:
+    """Extract per-pass and aggregated correct counts by dataset.
+
+    Returns:
+        Dictionary mapping dataset name (plus 'overall', when reported) to results:
+        {
+            'aime': {
+                'run_1': 735,
+                'run_2': 740,
+                ...
+                'pass@5': 875    (key is 'pass@<pass_k>' from the JSON)
+            }
+        }
+    """
+    pass_k = json_data['pass_k']
+    results = defaultdict(dict)
+    overall_results = {}
+
+    # Extract per-pass results (one 'run_N' entry per pass, per dataset)
+    if 'per_pass_results' in json_data:
+        for pass_result in json_data['per_pass_results']:
+            pass_num = pass_result['pass_number']
+            run_label = f"run_{pass_num + 1}"  # Convert 0-indexed to 1-indexed
+
+            # Calculate sum of individual datasets for verification
+            sum_correct = 0
+            for dataset_stat in pass_result['datasets']:
+                dataset_name = dataset_stat['dataset']
+                correct = dataset_stat['correct']
+                results[dataset_name][run_label] = correct
+                sum_correct += correct
+
+            # Extract overall from JSON
+            if 'overall' in pass_result:
+                overall_correct = pass_result['overall']['correct']
+                overall_results[run_label] = overall_correct
+
+                # Consistency check: AssertionError if dataset sum != overall
+                assert sum_correct == overall_correct, (
+                    f"Mismatch in {run_label}: sum of datasets ({sum_correct}) != "
+                    f"overall ({overall_correct})"
+                )
+
+    # Extract aggregated results
+    if 'aggregated_results' in json_data:
+        # Calculate sum of individual datasets for verification
+        sum_correct = 0
+        for dataset_stat in json_data['aggregated_results']['datasets']:
+            dataset_name = dataset_stat['dataset']
+            correct = dataset_stat['correct']
+            results[dataset_name][f'pass@{pass_k}'] = correct
+            sum_correct += correct
+
+        # Extract overall from JSON
+        if 'overall' in json_data['aggregated_results']:
+            overall_correct = json_data['aggregated_results']['overall']['correct']
+            overall_results[f'pass@{pass_k}'] = overall_correct
+
+            # Consistency check: AssertionError if dataset sum != overall
+            assert sum_correct == overall_correct, (
+                f"Mismatch in pass@{pass_k}: sum of datasets ({sum_correct}) != "
+                f"overall ({overall_correct})"
+            )
+
+    # Handle single-pass results (only when 'aggregated_results' is absent)
+    elif 'results' in json_data:
+        # Calculate sum of individual datasets for verification
+        sum_correct = 0
+        for dataset_stat in json_data['results']['datasets']:
+            dataset_name = dataset_stat['dataset']
+            correct = dataset_stat['correct']
+            results[dataset_name]['run_1'] = correct
+            results[dataset_name]['pass@1'] = correct
+            sum_correct += correct
+
+        # Extract overall from JSON if available
+        if 'overall' in json_data['results']:
+            overall_correct = json_data['results']['overall']['correct']
+            overall_results['run_1'] = overall_correct
+            overall_results['pass@1'] = overall_correct
+
+            # Consistency check: AssertionError if dataset sum != overall
+            assert sum_correct == overall_correct, (
+                f"Mismatch in run_1: sum of datasets ({sum_correct}) != "
+                f"overall ({overall_correct})"
+            )
+
+    # Add overall results under the reserved 'overall' key
+    if overall_results:
+        results['overall'] = overall_results
+
+    return dict(results)
+
+
+def collect_to_csv(json_files: List[str], output_csv: str,
+                   dataset_order: List[str] = None):
+    """Collect results from multiple JSON files into a CSV.
+
+    Args:
+        json_files: List of JSON file paths
+        output_csv: Output CSV file path
+        dataset_order: Optional dataset order, matched case-insensitively (e.g., ['aime', 'gpqa', 'livecodebench'])
+    """
+    all_results = []
+    pass_k = None
+
+    # Load all JSON files
+    for json_path in json_files:
+        json_data = load_json_summary(json_path)
+
+        # Determine pass@k value (first file wins; later mismatches only warn)
+        if pass_k is None:
+            pass_k = json_data['pass_k']
+        elif pass_k != json_data['pass_k']:
+            print(f"Warning: {json_path} has pass@{json_data['pass_k']} but expected pass@{pass_k}",
+                  file=sys.stderr)
+
+        # Extract results
+        results = extract_results(json_data)
+        all_results.append({
+            'source_file': json_path,
+            'results': results
+        })
+
+    if not all_results:
+        print("Error: No results to process", file=sys.stderr)
+        return
+
+    # Determine column order (sized by the first file's pass_k)
+    run_columns = [f"run_{i+1}" for i in range(pass_k)]
+    pass_column = f"pass@{pass_k}"
+    columns = ['dataset'] + run_columns + [pass_column]
+
+    # Collect all unique datasets
+    all_datasets = set()
+    for result in all_results:
+        all_datasets.update(result['results'].keys())
+
+    # Sort datasets (use provided order or alphabetical)
+    # Always put 'overall' at the end
+    all_datasets_no_overall = all_datasets - {'overall'}
+
+    if dataset_order:
+        # Use provided order, put remaining datasets at the end
+        sorted_datasets = []
+        for ds in dataset_order:
+            if ds.lower() in [d.lower() for d in all_datasets_no_overall]:
+                # Find the actual dataset name (case-sensitive)
+                actual_name = next(
+                    d for d in all_datasets_no_overall if d.lower() == ds.lower())
+                sorted_datasets.append(actual_name)
+        # Add any datasets not in the order list (excluding 'overall')
+        remaining = sorted(
+            [d for d in all_datasets_no_overall if d not in sorted_datasets])
+        sorted_datasets.extend(remaining)
+    else:
+        sorted_datasets = sorted(all_datasets_no_overall)
+
+    # Add 'overall' at the end if it exists
+    if 'overall' in all_datasets:
+        sorted_datasets.append('overall')
+
+    # Write CSV
+    with open(output_csv, 'w', newline='') as f:
+        writer = csv.writer(f)
+
+        # Write header
+        writer.writerow(columns)
+
+        # Write data rows: one row per (file, dataset), grouped by file
+        for result in all_results:
+            for dataset in sorted_datasets:
+                if dataset in result['results']:
+                    row = [dataset]
+                    dataset_results = result['results'][dataset]
+
+                    # Add run columns (blank cell when a run is missing)
+                    for run_col in run_columns:
+                        row.append(dataset_results.get(run_col, ''))
+
+                    # Add pass@k column
+                    row.append(dataset_results.get(pass_column, ''))
+
+                    writer.writerow(row)
+
+    print(f"CSV saved to: {output_csv}")
+    print(
+        f"Collected {len(all_results)} result sets across {len(sorted_datasets)} datasets")
+
+
+def main():
+    """CLI entry point: expand the input patterns and write the combined CSV."""
+    cli = argparse.ArgumentParser(
+        description="Collect multiple JSON summaries into a CSV. Supports glob patterns.",
+        epilog="Examples:\n"
+               "  %(prog)s results_*_summarize.json\n"
+               "  %(prog)s data/*.json -o output.csv\n"
+               "  %(prog)s run1.json run2.json run3.json --dataset-order aime gpqa livecodebench",
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    cli.add_argument("json_files", nargs='+',
+                     help="One or more JSON files or glob patterns (e.g., '*.json', 'results_*_summarize.json')")
+    cli.add_argument("-o", "--output", default="collected_results.csv",
+                     help="Output CSV file (default: collected_results.csv)")
+    cli.add_argument("--dataset-order", nargs='*',
+                     help="Optional dataset order (e.g., aime gpqa livecodebench)")
+
+    options = cli.parse_args()
+
+    # Resolve literal paths and glob patterns into concrete input files
+    json_paths = expand_glob_patterns(options.json_files)
+    if not json_paths:
+        print("Error: No JSON files found matching the provided patterns",
+              file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Found {len(json_paths)} JSON files:")
+    for path in json_paths:
+        print(f"  - {path}")
+    print()
+
+    try:
+        collect_to_csv(json_paths, options.output, options.dataset_order)
+    except Exception as e:
+        # Top-level boundary: report the failure with a traceback, exit non-zero
+        print(f"Error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/archive/plot.py b/language/gpt-oss/archive/plot.py
new file mode 100644
index 0000000000..d85d470c22
--- /dev/null
+++ b/language/gpt-oss/archive/plot.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+"""
+Histogram analysis of token input length (ISL) and output length (OSL) across datasets.
+Creates 8 histograms as specified.
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import argparse
+
+
+def load_data(pkl_path):
+    """Load a pickled DataFrame and return it (unpickling can run code: trusted files only)."""
+    print(f"Loading data from {pkl_path}...")
+    df = pd.read_pickle(pkl_path)
+    print(f"Loaded {len(df)} rows with columns: {list(df.columns)}")
+    return df
+
+
+def create_per_dataset_histogram(df, column_name, title, filename, output_dir):
+    """Save one figure with a histogram subplot of `column_name` per unique df['dataset'] value."""
+    datasets = sorted(df['dataset'].unique())
+    print(f"Creating {filename}...")
+    print(f"  Datasets: {datasets}")
+    print(f"  Total samples: {len(df)}")
+
+    # Determine grid layout: 3 columns, ceil(n_datasets / 3) rows
+    n_datasets = len(datasets)
+    n_cols = 3
+    n_rows = (n_datasets + n_cols - 1) // n_cols
+
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
+    if n_datasets == 1:
+        axes = np.array([axes])
+    axes = axes.flatten()
+
+    for i, dataset in enumerate(datasets):
+        ax = axes[i]
+        dataset_data = df[df['dataset'] == dataset][column_name]
+
+        # Create histogram (colour keyed on whether the title mentions 'OSL')
+        ax.hist(
+            dataset_data,
+            bins=30,
+            alpha=0.7,
+            edgecolor='black',
+            linewidth=0.5,
+            color='skyblue' if 'OSL' in title else 'lightcoral')
+
+        ax.set_title(
+            f'{dataset}\n(n={len(dataset_data)})',
+            fontsize=12,
+            fontweight='bold')
+        ax.set_xlabel(title, fontsize=10)
+        ax.set_ylabel('Frequency', fontsize=10)
+        ax.grid(True, alpha=0.3)
+
+        # Add percentile lines (50th / 75th / 99th) with values in the legend
+        p50 = dataset_data.quantile(0.50)
+        p75 = dataset_data.quantile(0.75)
+        p99 = dataset_data.quantile(0.99)
+
+        ax.axvline(
+            p50,
+            color='green',
+            linestyle='--',
+            linewidth=2,
+            alpha=0.7,
+            label=f'50th: {p50:.1f}')
+        ax.axvline(
+            p75,
+            color='orange',
+            linestyle='--',
+            linewidth=2,
+            alpha=0.7,
+            label=f'75th: {p75:.1f}')
+        ax.axvline(
+            p99,
+            color='red',
+            linestyle='--',
+            linewidth=2,
+            alpha=0.7,
+            label=f'99th: {p99:.1f}')
+        ax.legend(loc='upper right', fontsize=8)
+
+        # Add statistics box, positioned in axes-relative coordinates
+        mean_val = dataset_data.mean()
+        median_val = dataset_data.median()
+        std_val = dataset_data.std()
+        stats_text = f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}'
+        ax.text(0.98, 0.78, stats_text,
+                transform=ax.transAxes,
+                verticalalignment='top',
+                horizontalalignment='right',
+                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
+                fontsize=9)
+
+    # Hide unused subplots left over at the end of the grid
+    for i in range(n_datasets, len(axes)):
+        axes[i].set_visible(False)
+
+    plt.tight_layout()
+    plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight')
+    print(f"  Saved to {output_dir}/{filename}")
+    plt.close()
+
+
+def create_full_histogram(df, column_name, title,
+                          filename, output_dir, save_bins=False):
+    """Save one histogram of `column_name` over all rows; optionally dump bin counts to CSV."""
+    print(f"Creating {filename}...")
+    print(f"  Total samples: {len(df)}")
+
+    plt.figure(figsize=(12, 8))
+
+    color = 'skyblue' if 'OSL' in title else 'lightcoral'
+    counts, bin_edges, patches = plt.hist(
+        df[column_name],
+        bins=50,
+        alpha=0.7,
+        edgecolor='black',
+        linewidth=0.5,
+        color=color)
+
+    plt.title(title, fontsize=14, fontweight='bold')
+    plt.xlabel(
+        column_name.replace(
+            'tok_',
+            '').replace(
+            '_len',
+            '').upper(),
+        fontsize=12)
+    plt.ylabel('Frequency', fontsize=12)
+    plt.grid(True, alpha=0.3)
+
+    # Add percentile lines (50th / 75th / 99th) with values in the legend
+    p50 = df[column_name].quantile(0.50)
+    p75 = df[column_name].quantile(0.75)
+    p99 = df[column_name].quantile(0.99)
+
+    plt.axvline(p50, color='green', linestyle='--', linewidth=2,
+                alpha=0.7, label=f'50th percentile: {p50:.1f}')
+    plt.axvline(p75, color='orange', linestyle='--', linewidth=2,
+                alpha=0.7, label=f'75th percentile: {p75:.1f}')
+    plt.axvline(p99, color='red', linestyle='--', linewidth=2,
+                alpha=0.7, label=f'99th percentile: {p99:.1f}')
+    plt.legend(loc='upper right', fontsize=10)
+
+    # Add statistics (also reused for the CSV header when save_bins is set)
+    mean_val = df[column_name].mean()
+    median_val = df[column_name].median()
+    std_val = df[column_name].std()
+    min_val = df[column_name].min()
+    max_val = df[column_name].max()
+
+    stats_text = f'Total samples: {len(df)}\n'
+    stats_text += f'Mean: {mean_val:.1f}\n'
+    stats_text += f'Median: {median_val:.1f}\n'
+    stats_text += f'Std: {std_val:.1f}\n'
+    stats_text += f'Min: {min_val}\n'
+    stats_text += f'Max: {max_val}'
+
+    plt.text(0.98, 0.78, stats_text,
+             transform=plt.gca().transAxes,
+             verticalalignment='top',
+             horizontalalignment='right',
+             fontsize=10,
+             bbox=dict(boxstyle='round', facecolor='lightblue' if 'OSL' in title else 'lightcoral', alpha=0.8))
+
+    plt.tight_layout()
+    plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight')
+    print(f"  Saved to {output_dir}/{filename}")
+    plt.close()
+
+    # Save bin data to CSV if requested (file name: '.png' -> '_bins.csv')
+    if save_bins:
+        csv_filename = filename.replace('.png', '_bins.csv')
+
+        # Create bin data DataFrame: one row per histogram bin
+        bin_data = pd.DataFrame({
+            'bin_lower': bin_edges[:-1],
+            'bin_upper': bin_edges[1:],
+            'bin_center': (bin_edges[:-1] + bin_edges[1:]) / 2,
+            'count': counts.astype(int)
+        })
+
+        csv_path = f'{output_dir}/{csv_filename}'
+
+        # Save with '#'-prefixed header lines holding percentiles and stats
+        with open(csv_path, 'w') as f:
+            f.write(
+                f'# Percentiles: 50th={p50:.2f}, 75th={p75:.2f}, 99th={p99:.2f}\n')
+            f.write(
+                f'# Mean={mean_val:.2f}, Median={median_val:.2f}, Std={std_val:.2f}\n')
+            f.write(
+                f'# Min={min_val}, Max={max_val}, Total samples={len(df)}\n')
+            bin_data.to_csv(f, index=False)
+
+        print(f"  Saved bin data to {csv_path}")
+
+
+def main():
+    """Parse CLI arguments and write up to eight ISL/OSL histograms to --output-dir."""
+    parser = argparse.ArgumentParser(
+        description='Create histograms of token lengths (ISL and OSL)')
+    parser.add_argument('pkl_path', help='Path to the pickle file')
+    parser.add_argument('--output-dir', default='histograms',
+                        help='Output directory for plots')
+
+    args = parser.parse_args()
+
+    # Create output directory, including missing parents, so nested paths
+    # such as 'results/run1/histograms' work on first use
+    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Load data
+    df = load_data(args.pkl_path)
+
+    # Check if dataset column exists
+    has_dataset = 'dataset' in df.columns
+    if not has_dataset:
+        print("\nNote: 'dataset' column not found - skipping per-dataset histograms")
+        # Add a dummy dataset column for compatibility with existing code
+        df['dataset'] = 'default'
+
+    # Check if prompt_accuracy column exists
+    has_accuracy = 'prompt_accuracy' in df.columns
+
+    # Determine which output length column to use
+    if 'tok_model_output_len' in df.columns:
+        output_len_col = 'tok_model_output_len'
+    elif 'tok_model_output_len_0' in df.columns:
+        output_len_col = 'tok_model_output_len_0'
+        print("\nNote: 'tok_model_output_len' not found, using 'tok_model_output_len_0' instead")
+    else:
+        raise ValueError(
+            "Neither 'tok_model_output_len' nor 'tok_model_output_len_0' column found in data")
+
+    if has_accuracy:
+        # Filter for 100% accuracy
+        df_100 = df[df['prompt_accuracy'] == 100.0].copy()
+        print(
+            f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n")
+    else:
+        print("\nNote: 'prompt_accuracy' column not found - skipping accuracy-based histograms\n")
+        # Create empty dataframe with dataset column for consistency
+        df_100 = pd.DataFrame(columns=df.columns)
+
+    print("=" * 60)
+    print("CREATING ISL HISTOGRAMS")
+    print("=" * 60)
+
+    # 1. Per dataset ISL histogram
+    if has_dataset:
+        create_per_dataset_histogram(
+            df, 'tok_input_len',
+            'Token Input Length (ISL)',
+            '1_per_dataset_ISL.png',
+            args.output_dir)
+    else:
+        print("Skipping per-dataset ISL: dataset column not found")
+
+    # 2. Per dataset ISL histogram (accuracy == 100)
+    if has_dataset and has_accuracy and len(df_100) > 0:
+        create_per_dataset_histogram(
+            df_100, 'tok_input_len',
+            'Token Input Length (ISL) - 100% Accuracy',
+            '2_per_dataset_ISL_acc100.png',
+            args.output_dir)
+    elif not has_dataset:
+        print("Skipping per-dataset ISL (acc==100): dataset column not found")
+    elif not has_accuracy:
+        print("Skipping per-dataset ISL (acc==100): prompt_accuracy column not found")
+    else:
+        print("Skipping per-dataset ISL (acc==100): no data with 100% accuracy")
+
+    # 3. Full ISL histogram
+    create_full_histogram(
+        df, 'tok_input_len',
+        'Token Input Length (ISL) - All Data',
+        '3_full_ISL.png',
+        args.output_dir,
+        save_bins=True)
+
+    # 4. Full ISL histogram (accuracy == 100)
+    if has_accuracy and len(df_100) > 0:
+        create_full_histogram(
+            df_100, 'tok_input_len',
+            'Token Input Length (ISL) - 100% Accuracy',
+            '4_full_ISL_acc100.png',
+            args.output_dir)
+    elif has_accuracy:
+        print("Skipping full ISL (acc==100): no data with 100% accuracy")
+    else:
+        print("Skipping full ISL (acc==100): prompt_accuracy column not found")
+
+    print("\n" + "=" * 60)
+    print("CREATING OSL HISTOGRAMS")
+    print("=" * 60)
+
+    # 5. Per dataset OSL histogram
+    if has_dataset:
+        create_per_dataset_histogram(
+            df, output_len_col,
+            'Token Output Length (OSL)',
+            '5_per_dataset_OSL.png',
+            args.output_dir)
+    else:
+        print("Skipping per-dataset OSL: dataset column not found")
+
+    # 6. Per dataset OSL histogram (accuracy == 100)
+    if has_dataset and has_accuracy and len(df_100) > 0:
+        create_per_dataset_histogram(
+            df_100, output_len_col,
+            'Token Output Length (OSL) - 100% Accuracy',
+            '6_per_dataset_OSL_acc100.png',
+            args.output_dir)
+    elif not has_dataset:
+        print("Skipping per-dataset OSL (acc==100): dataset column not found")
+    elif not has_accuracy:
+        print("Skipping per-dataset OSL (acc==100): prompt_accuracy column not found")
+    else:
+        print("Skipping per-dataset OSL (acc==100): no data with 100% accuracy")
+
+    # 7. Full OSL histogram
+    create_full_histogram(
+        df, output_len_col,
+        'Token Output Length (OSL) - All Data',
+        '7_full_OSL.png',
+        args.output_dir,
+        save_bins=True)
+
+    # 8. Full OSL histogram (accuracy == 100)
+    if has_accuracy and len(df_100) > 0:
+        create_full_histogram(
+            df_100, output_len_col,
+            'Token Output Length (OSL) - 100% Accuracy',
+            '8_full_OSL_acc100.png',
+            args.output_dir)
+    elif has_accuracy:
+        print("Skipping full OSL (acc==100): no data with 100% accuracy")
+    else:
+        print("Skipping full OSL (acc==100): prompt_accuracy column not found")
+
+    print(f"\n{'=' * 60}")
+    print(f"All histograms saved to {args.output_dir}/")
+    print(f"{'=' * 60}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/archive/plot_results.py b/language/gpt-oss/archive/plot_results.py
new file mode 100755
index 0000000000..d3cc889825
--- /dev/null
+++ b/language/gpt-oss/archive/plot_results.py
@@ -0,0 +1,355 @@
+#!/usr/bin/env python3
+"""
+Generate grouped box plots from collected results CSV.
+
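+Creates one figure per dataset with up to three side-by-side box plots:
+1. Individual runs box plot (run_1, run_2, ..., run_k combined)
+2. Pass@k box plot
+3. Pass@1 box plot (per-trial average of the individual runs)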
+"""
+
+import argparse
+import sys
+import csv
+import re
+from pathlib import Path
+from typing import Dict, List
+from collections import defaultdict
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[float]]]:
+    """Load CSV data and organize by dataset.
+
+    Every row must carry a 'dataset' column. Each other non-empty cell that
+    parses as a float is appended to the list for its (dataset, column) pair;
+    empty and non-numeric cells contribute no value.
+
+    Args:
+        csv_path: Path of the CSV file to read
+
+    Returns:
+        {
+            'aime': {
+                'run_1': [735, 752, 765, ...],
+                'run_2': [740, 754, 765, ...],
+                'pass@5': [875, 875, 885, ...],
+                'pass@1 with 5 repeats: (average of run_k)': [861, 857, ...]
+            },
+            'gpqa': {...},
+            ...
+        }
+
+    Raises:
+        KeyError: If a row has no 'dataset' column
+    """
+    data = defaultdict(lambda: defaultdict(list))
+
+    # newline='' is what the csv module documents for files passed to a reader
+    with open(csv_path, 'r', newline='') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            dataset = row['dataset']
+            for key, value in row.items():
+                # DictReader yields None for cells missing from a short row
+                # and a list (under the None key) for surplus cells of an
+                # over-long row. Neither is a numeric cell, and float(list)
+                # would raise TypeError, so only non-empty strings are parsed.
+                if key == 'dataset' or not isinstance(value, str) or not value:
+                    continue
+                try:
+                    # Parse as float to support decimal values
+                    data[dataset][key].append(float(value))
+                except ValueError:
+                    continue
+
+    return dict(data)
+
+
+def _draw_box_panel(ax, values: List[float], label: str, dataset_name: str,
+                    box_color, point_color: str, ylabel: str,
+                    extrema_spec: str = '') -> None:
+    """Draw one box plot with jittered sample points and a stats box on ax.
+
+    Args:
+        ax: Matplotlib axes to draw on
+        values: Samples summarised by the box
+        label: X tick label, also used as the title suffix
+        dataset_name: Dataset name used as the title prefix
+        box_color: Face colour of the box
+        point_color: Colour of the scattered sample points
+        ylabel: Y-axis label
+        extrema_spec: Format spec applied to min/max in the stats box
+            ('' prints the values unformatted)
+    """
+    # whis=[0, 100] puts the whiskers at the 0th/100th percentiles (min/max)
+    bp = ax.boxplot([values], positions=[0], widths=0.5,
+                    patch_artist=True, showmeans=True,
+                    whis=[0, 100], showfliers=False,
+                    meanprops=dict(marker='D', markerfacecolor='red',
+                                   markeredgecolor='red', markersize=8))
+
+    # Color the box
+    bp['boxes'][0].set_facecolor(box_color)
+    bp['boxes'][0].set_alpha(0.7)
+
+    # Small horizontal jitter keeps overlapping points visible. A private
+    # RandomState(42) yields the same offsets as np.random.seed(42) followed
+    # by np.random.normal, without resetting the process-wide RNG.
+    x_jitter = np.random.RandomState(42).normal(0, 0.04, size=len(values))
+    ax.scatter(x_jitter, values, alpha=0.4, s=30,
+               color=point_color, zorder=3, edgecolors='black', linewidth=0.5)
+
+    # Set labels
+    ax.set_xticks([0])
+    ax.set_xticklabels([label], fontsize=12)
+    ax.set_ylabel(ylabel, fontsize=12)
+    ax.set_title(f"{dataset_name} - {label}", fontsize=13, fontweight='bold')
+    ax.grid(True, axis='y', alpha=0.3, linestyle='--')
+
+    # Add statistics
+    stats_text = (
+        f"n={len(values)}\nμ={np.mean(values):.1f}\nσ={np.std(values):.1f}\n"
+        f"min={np.min(values):{extrema_spec}}\nmax={np.max(values):{extrema_spec}}")
+    props = dict(boxstyle='round', facecolor='wheat', alpha=0.3)
+    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9,
+            verticalalignment='top', bbox=props, family='monospace')
+
+
+def create_combined_box_plot(dataset_name: str,
+                             dataset_data: Dict[str, List[float]],
+                             run_columns: List[str],
+                             passk_columns: List[str],
+                             output_file: str,
+                             ylabel: str = "Correct Count"):
+    """Create separate box plots for individual runs, pass@k, and computed averages in the same figure.
+
+    A panel is drawn only for the groups that have data. If no group has
+    data, a warning is printed and no file is written.
+
+    Args:
+        dataset_name: Name of the dataset
+        dataset_data: Data for this dataset (column -> list of values)
+        run_columns: Individual run columns to combine (e.g., ['run_1', 'run_2', ...])
+        passk_columns: Pass@k columns (e.g., ['pass@5'])
+        output_file: Output file path
+        ylabel: Y-axis label
+    """
+    # Combine all individual runs into one list
+    all_runs_data = []
+    for col in run_columns:
+        all_runs_data.extend(dataset_data.get(col) or [])
+
+    # Collect pass@k data
+    passk_data = []
+    for col in passk_columns:
+        passk_data.extend(dataset_data.get(col) or [])
+
+    # Per-trial average across the run columns this dataset actually has.
+    # The trial count is the shortest column; default=0 covers the case where
+    # none of run_columns is present (a bare min() would raise ValueError).
+    present_runs = [dataset_data[col]
+                    for col in run_columns if col in dataset_data]
+    num_trials = min((len(values) for values in present_runs), default=0)
+    average_data = [np.mean([values[trial_idx] for values in present_runs])
+                    for trial_idx in range(num_trials)]
+
+    # One (values, label, box colour, point colour, min/max format) per panel
+    panels = []
+    if all_runs_data:
+        panels.append((all_runs_data, 'Individual Runs',
+                       plt.cm.Set3(0.2), 'darkblue', ''))
+    if passk_data:
+        passk_label = passk_columns[0] if len(passk_columns) == 1 else 'Pass@k'
+        panels.append((passk_data, passk_label,
+                       plt.cm.Set3(0.6), 'darkorange', ''))
+    if average_data:
+        # Averages are fractional, so their min/max get one decimal place
+        panels.append((average_data, 'Pass@1 (avg of runs)',
+                       plt.cm.Set3(0.9), 'darkgreen', '.1f'))
+
+    if not panels:
+        print(f"Warning: No data to plot for {dataset_name}")
+        return
+
+    # squeeze=False always returns a 2-D axes array, even for a single panel
+    fig, axes = plt.subplots(1, len(panels),
+                             figsize=(6 * len(panels), 6), squeeze=False)
+    for ax, (values, label, box_color, point_color, spec) in zip(
+            axes[0], panels):
+        _draw_box_panel(ax, values, label, dataset_name,
+                        box_color, point_color, ylabel, spec)
+
+    # Adjust layout
+    fig.tight_layout()
+
+    # Save figure
+    fig.savefig(output_file, dpi=300, bbox_inches='tight')
+    print(f"Saved plot to: {output_file}")
+    plt.close(fig)
+
+
+def main():
+    """Parse CLI arguments and write one combined box-plot PNG per dataset.
+
+    Exits with status 1 when the CSV file is missing, yields no data, or has
+    neither run_* nor pass@<k> columns.
+    """
+    cli = argparse.ArgumentParser(
+        description="Generate box plots from collected results CSV")
+    cli.add_argument("csv_file",
+                     help="Input CSV file from collect_results_csv.py")
+    cli.add_argument("-o", "--output-dir", default=".",
+                     help="Output directory for plots (default: current directory)")
+    cli.add_argument("--prefix", default="boxplot",
+                     help="Prefix for output files (default: boxplot)")
+    args = cli.parse_args()
+
+    # Refuse to continue without an input file
+    if not Path(args.csv_file).exists():
+        print(f"Error: File not found: {args.csv_file}", file=sys.stderr)
+        sys.exit(1)
+
+    # Make sure the destination directory is there
+    plots_dir = Path(args.output_dir)
+    plots_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Loading data from: {args.csv_file}")
+    data = load_csv_data(args.csv_file)
+    if not data:
+        print("Error: No data loaded from CSV", file=sys.stderr)
+        sys.exit(1)
+    print(f"Loaded data for {len(data)} datasets")
+
+    # Union of the column names seen in any dataset
+    column_names = set()
+    for columns in data.values():
+        column_names |= set(columns)
+
+    # run_* columns are individual runs; only the exact pass@<digits> form
+    # (e.g., pass@5, pass@10) counts as a pass@k column
+    run_columns = sorted(
+        name for name in column_names if name.startswith('run_'))
+    passk_regex = re.compile(r'^pass@\d+$')
+    passk_columns = sorted(
+        name for name in column_names if passk_regex.match(name))
+
+    if not (run_columns or passk_columns):
+        print("Error: No run or pass@k columns found in CSV", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Found {len(run_columns)} run columns: {', '.join(run_columns)}")
+    print(
+        f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}")
+    print("Will compute averages from individual runs")
+    print()
+
+    # Alphabetical order, with 'overall' (when present) always last
+    datasets = sorted(name for name in data if name != 'overall')
+    if 'overall' in data:
+        datasets.append('overall')
+
+    print(f"Generating plots for {len(datasets)} datasets...\n")
+
+    # At least one of run_columns/passk_columns is non-empty here (checked
+    # above), so every dataset gets a combined plot: Individual Runs (all
+    # combined) vs Pass@k vs Computed Averages
+    for dataset in datasets:
+        target = plots_dir / f"{args.prefix}_{dataset}.png"
+        print(f"Creating combined box plot for {dataset}...")
+        create_combined_box_plot(
+            dataset_name=dataset,
+            dataset_data=data[dataset],
+            run_columns=run_columns,
+            passk_columns=passk_columns,
+            output_file=str(target),
+            ylabel="Correct Count"
+        )
+        print()
+
+    print("Done!")
+
+
+# Run the CLI entry point only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/archive/run_infer.py b/language/gpt-oss/archive/run_infer.py
new file mode 100644
index 0000000000..049ef4112f
--- /dev/null
+++ b/language/gpt-oss/archive/run_infer.py
@@ -0,0 +1,505 @@
+#!/usr/bin/env python3
+"""
+Script to send pre-tokenized requests to SGLang server.
+
+Usage:
+    python run_infer.py --input-tokens tokenized_data.pkl [options]
+
+Arguments:
+    --input-tokens     Path to pickle file containing pre-tokenized data from harmony-tokens.py
+    --server-url       SGLang server URL (default: http://localhost:30000)
+    --max-samples      Maximum number of samples to process (default: all)
+    --max-tokens       Maximum tokens to generate per request (default: 100)
+    --max-concurrency  Maximum number of concurrent requests (default: 256)
+    --output           Output pickle file for responses (optional)
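+    --pass-k           Number of inference passes per sample for pass@k strategy (default: 1)
+    --temperature      Temperature for sampling (default: 0.001)
+    --top-k            Top-k for sampling (default: 1)
+    --top-p            Top-p for sampling (default: 1.0)
+    --timeout          Timeout for requests (default: 1200)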
+"""
+
+import requests
+import json
+import time
+import argparse
+from typing import List, Dict, Any
+import logging
+from multiprocessing import Pool
+import pandas as pd
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+# Configure logging at INFO level with timestamped, level-tagged messages
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Model identifier handed to AutoTokenizer.from_pretrained() in get_tokenizer()
+MODEL_NAME = "openai/gpt-oss-120b"
+# Module-level tokenizer cache; stays None until get_tokenizer() loads it
+tokenizer = None
+
+
+def get_tokenizer():
+    """Return the module-level tokenizer, loading it on the first call."""
+    global tokenizer
+    if tokenizer is not None:
+        # Already loaded by an earlier call
+        return tokenizer
+
+    logger.info(f"Loading tokenizer for {MODEL_NAME}...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    logger.info("Tokenizer loaded successfully")
+    return tokenizer
+
+
+class SGLangClient:
+    """Minimal HTTP client for the SGLang server's /generate endpoint.
+
+    Sampling parameters and the request timeout are fixed per client and are
+    sent with every request through one requests.Session.
+    """
+
+    def __init__(self,
+                 server_url: str = "http://localhost:30000",
+                 temperature: float = 0.001,
+                 top_k: int = 1,
+                 top_p: float = 1.0,
+                 timeout: int = 1200
+                 ):
+        """Store the server URL and sampling settings and open a session.
+
+        Args:
+            server_url: Base URL of the server; "/generate" is appended to it
+            temperature: Sampling temperature sent with each request
+            top_k: Top-k value sent with each request
+            top_p: Top-p value sent with each request
+            timeout: Value passed to requests as the per-request timeout
+        """
+        self.base_url = server_url
+        self.session = requests.Session()
+        self.temperature = temperature
+        self.top_k = top_k
+        self.top_p = top_p
+        self.timeout = timeout
+
+    def send_request(
+            self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]:
+        """Send a single request to the SGLang server.
+
+        Args:
+            input_ids: Token IDs of the prompt
+            max_tokens: Sent as "max_new_tokens" in the sampling parameters
+
+        Returns:
+            The decoded JSON body on HTTP 200; otherwise a dict with a single
+            "error" key describing the failure. Transport errors, non-200
+            statuses and undecodable bodies are all reported this way rather
+            than raised.
+        """
+        # SGLang format with input_ids
+        payload = {
+            "input_ids": input_ids,
+            "sampling_params": {
+                "max_new_tokens": max_tokens,
+                "temperature": self.temperature,
+                "top_k": self.top_k,
+                "top_p": self.top_p,
+            }
+        }
+
+        try:
+            response = self.session.post(
+                f"{self.base_url}/generate",
+                json=payload,
+                timeout=self.timeout,
+            )
+            if response.status_code != 200:
+                logger.error(
+                    f"Request failed with status {response.status_code}: {response.text}")
+                return {"error": f"HTTP {response.status_code}: {response.text}"}
+            return response.json()
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError covers a 200 reply whose body is not valid JSON on
+            # requests versions where that error is not a RequestException
+            # subclass; without it the decode error would escape to the caller.
+            logger.error(f"Request failed: {e}")
+            return {"error": str(e)}
+
+
+def load_tokenized_data(data_file: str) -> pd.DataFrame:
+    """Load pre-tokenized data from pickle file produced by harmony-tokens.py.
+
+    When a 'tok_input' column is present it is checked for missing values;
+    when it is absent only a warning is logged.
+
+    Args:
+        data_file: Path to the pickled DataFrame
+
+    Returns:
+        The unpickled DataFrame, unmodified
+
+    Raises:
+        AssertionError: If any 'tok_input' entry is missing (NaN/None)
+    """
+    logger.info(f"Loading tokenized data from {data_file}")
+
+    # Load DataFrame from pickle
+    # NOTE(review): unpickling can execute arbitrary code; only load files
+    # from a trusted source.
+    df = pd.read_pickle(data_file)
+    logger.info(f"Loaded DataFrame with shape: {df.shape}")
+
+    # Check if tok_input column exists and has valid data
+    if 'tok_input' not in df.columns:
+        logger.warning("No 'tok_input' column found in DataFrame")
+        return df
+
+    # Check for any None values in tok_input (indicating failed tokenization)
+    failed_mask = df['tok_input'].isna()
+    failed_count = failed_mask.sum()
+
+    if failed_count > 0:
+        failed_indices = df[failed_mask].index.unique()
+        error_msg = f"Found {failed_count} failed tokenized samples at indices: {failed_indices.tolist()}"
+        logger.error(error_msg)
+        raise AssertionError(error_msg)
+
+    # Check first sample. An empty DataFrame has none, and df.iloc[0] would
+    # raise IndexError, so that case is reported instead of crashing.
+    if df.empty:
+        logger.warning("tok_input column exists but DataFrame has no rows")
+    else:
+        first_tokens = df.iloc[0]['tok_input']
+        if isinstance(first_tokens, list):
+            logger.info(f"First sample token length: {len(first_tokens)}")
+        else:
+            logger.warning(
+                "tok_input column exists but first sample is not a list")
+
+    logger.info(f"All {len(df)} samples were successfully tokenized")
+    return df
+
+
+def send_single_request(args_tuple):
+    """Send a single request - used by multiprocessing pool.
+
+    Args:
+        args_tuple: (input_ids, max_tokens, server_url, sample_id, pass_num,
+            temperature, top_k, top_p, timeout)
+
+    Returns:
+        tuple: (sample_id, pass_num, response, latency). latency is the
+        elapsed seconds around the request, or None if an unexpected
+        exception was raised.
+    """
+    input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, top_p, timeout = args_tuple
+
+    # Create a new client for this request
+    client = SGLangClient(
+        server_url=server_url,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        timeout=timeout)
+
+    try:
+        # Track latency: time from request sent to response received.
+        # perf_counter() is monotonic, so clock adjustments during a long
+        # request cannot distort the measurement.
+        start_time = time.perf_counter()
+        response = client.send_request(input_ids, max_tokens=max_tokens)
+        latency = time.perf_counter() - start_time
+        return sample_id, pass_num, response, latency
+    except Exception as e:
+        logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}")
+        # Return None for latency on error
+        return sample_id, pass_num, {"error": str(e)}, None
+    finally:
+        # Each call opens its own session; release its pooled connections
+        # instead of leaving them for garbage collection.
+        client.session.close()
+
+
+def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str,
+                           max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200,
+                           pass_k: int = 1):
+    """Send all requests to SGLang server in parallel using multiprocessing.
+
+    Args:
+        tokenized_df: DataFrame whose 'tok_input' column holds the prompts
+        server_url: Base URL of the SGLang server
+        max_tokens: Maximum tokens to generate per request
+        max_concurrency: Upper bound on the number of worker processes
+        temperature: Sampling temperature
+        top_k: Top-k sampling value
+        top_p: Top-p sampling value
+        timeout: Per-request timeout
+        pass_k: Number of inference passes per sample for pass@k strategy
+
+    Returns:
+        tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency.
+        Both dicts are empty when there is nothing to send.
+    """
+    num_samples = len(tokenized_df)
+    total_requests = num_samples * pass_k
+    logger.info(
+        f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers...")
+
+    # Prepare arguments for multiprocessing - create pass_k requests per sample
+    args_list = []
+    for idx, row in tokenized_df.iterrows():
+        for pass_num in range(pass_k):
+            args_list.append((
+                row['tok_input'], max_tokens, server_url,
+                idx, pass_num, temperature, top_k, top_p, timeout
+            ))
+
+    # Pool(processes=0) raises ValueError, so an empty DataFrame or a
+    # non-positive pass_k must not reach the pool at all.
+    if not args_list:
+        logger.warning("No requests to send")
+        return {}, {}
+
+    start_time = time.time()
+
+    # Never start more workers than there are requests
+    with Pool(processes=min(max_concurrency, len(args_list))) as pool:
+        results = list(tqdm(
+            pool.imap_unordered(send_single_request, args_list),
+            total=len(args_list),
+            desc="Sending requests",
+            unit="request"
+        ))
+
+    # Group results by sample_id and pass_num (results arrive unordered)
+    responses_by_pass = {}
+    latencies_by_pass = {}
+    for sample_id, pass_num, response, latency in results:
+        responses_by_pass[(sample_id, pass_num)] = response
+        latencies_by_pass[(sample_id, pass_num)] = latency
+
+    total_time = time.time() - start_time
+    logger.info(
+        f"Completed {total_requests} requests in {total_time:.2f} seconds")
+    logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec")
+
+    # Log latency statistics (failed requests report a latency of None)
+    valid_latencies = [
+        lat for lat in latencies_by_pass.values() if lat is not None]
+    if valid_latencies:
+        avg_latency = sum(valid_latencies) / len(valid_latencies)
+        min_latency = min(valid_latencies)
+        max_latency = max(valid_latencies)
+        logger.info(
+            f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s")
+
+    return responses_by_pass, latencies_by_pass
+
+
+def extract_response_ids(
+        responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]:
+    """Collect the generated token IDs of every (sample, pass) pair.
+
+    Args:
+        responses_by_pass: Dict mapping (sample_id, pass_num) to response
+        tokenized_df: DataFrame whose index supplies the sample ids
+        pass_k: Number of passes per sample
+
+    Returns:
+        Dict mapping (sample_id, pass_num) to the response's 'output_ids';
+        an empty list when the response is missing, carries an "error" key,
+        or has no 'output_ids' field
+    """
+    logger.info("Extracting response output_ids...")
+
+    ids_by_key = {}
+    expected = len(tokenized_df) * pass_k
+
+    with tqdm(total=expected, desc="Extracting responses", unit="response") as progress:
+        for sample_idx in tokenized_df.index:
+            for pass_num in range(pass_k):
+                key = (sample_idx, pass_num)
+                reply = responses_by_pass.get(key, {})
+                token_ids = []
+                usable = not ("error" in reply or "output_ids" not in reply)
+                if usable:
+                    try:
+                        # The generated token IDs live in the reply's
+                        # 'output_ids' field
+                        token_ids = reply["output_ids"]
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to extract response for sample {sample_idx}, pass {pass_num}: {e}")
+                ids_by_key[key] = token_ids
+                progress.update(1)
+
+    logger.info("Response output_ids extraction complete")
+    return ids_by_key
+
+
+def detokenize_output_ids(
+        response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]:
+    """Decode each pass's output token IDs into text with the shared tokenizer.
+
+    Args:
+        response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids
+        pass_k: Number of passes per sample (not used by this function)
+
+    Returns:
+        Dict mapping (sample_id, pass_num) to detokenized text; entries whose
+        decoding raised are mapped to an empty string
+    """
+    logger.info("Detokenizing output_ids to text...")
+
+    decoder = get_tokenizer()
+    texts_by_key = {}
+
+    progress = tqdm(
+        response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output")
+    for (sample_id, pass_num), token_ids in progress:
+        key = (sample_id, pass_num)
+        try:
+            # Special tokens are dropped from the decoded text
+            decoded = decoder.decode(token_ids, skip_special_tokens=True)
+        except Exception as e:
+            logger.warning(
+                f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}")
+            decoded = ""
+        texts_by_key[key] = decoded
+
+    logger.info("Output detokenization complete")
+    return texts_by_key
+
+
+def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]],
+                   response_ids_by_pass: Dict[tuple, List[int]],
+                   detokenized_texts_by_pass: Dict[tuple, str],
+                   latencies_by_pass: Dict[tuple, float],
+                   tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame:
+    """Attach per-pass results to a copy of the input DataFrame and optionally pickle it.
+
+    Args:
+        responses_by_pass: Dict mapping (sample_id, pass_num) to response
+        response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids
+        detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text
+        latencies_by_pass: Dict mapping (sample_id, pass_num) to latency
+        tokenized_df: Original DataFrame with samples (left unmodified)
+        pass_k: Number of passes per sample
+        output_file: Optional output pickle file
+
+    Returns:
+        Copy of tokenized_df with, for each pass i, the columns
+        model_output_i, tok_model_output_i, tok_model_output_len_i and
+        infer_time_i
+    """
+    logger.info("Processing responses and updating DataFrame...")
+
+    # Never mutate the caller's DataFrame
+    result_df = tokenized_df.copy()
+    sample_ids = list(tokenized_df.index)
+
+    # One group of suffixed columns (_0, _1, _2, ...) per pass; missing keys
+    # fall back to "", [] and None respectively
+    for pass_num in range(pass_k):
+        texts = [detokenized_texts_by_pass.get((sid, pass_num), "")
+                 for sid in sample_ids]
+        token_ids = [response_ids_by_pass.get((sid, pass_num), [])
+                     for sid in sample_ids]
+        lengths = [len(ids) for ids in token_ids]
+        timings = [latencies_by_pass.get((sid, pass_num), None)
+                   for sid in sample_ids]
+
+        result_df[f'model_output_{pass_num}'] = texts
+        result_df[f'tok_model_output_{pass_num}'] = token_ids
+        result_df[f'tok_model_output_len_{pass_num}'] = lengths
+        result_df[f'infer_time_{pass_num}'] = timings
+
+    # Output lengths for the log line only: use the response's
+    # meta_info.completion_tokens when present, else the number of token IDs
+    reported_lengths = []
+    for sid in sample_ids:
+        for pass_num in range(pass_k):
+            reply = responses_by_pass.get((sid, pass_num), {})
+            ids = response_ids_by_pass.get((sid, pass_num), [])
+            try:
+                meta = reply.get("meta_info", {})
+                reported_lengths.append(
+                    meta.get("completion_tokens", len(ids)))
+            except Exception as e:
+                logger.warning(
+                    f"Failed to calculate output tokens for sample {sid}, pass {pass_num}: {e}")
+                reported_lengths.append(len(ids))
+
+    logger.info(f"Updated DataFrame with shape: {result_df.shape}")
+    added = []
+    for i in range(pass_k):
+        added.append(
+            f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}')
+    logger.info(f"Added columns for {pass_k} passes: {', '.join(added)}")
+    if reported_lengths:
+        logger.info(
+            f"Average output token length: {sum(reported_lengths)/len(reported_lengths):.1f}")
+
+    # Persist only when a destination was given
+    if output_file:
+        logger.info(f"Saving responses to {output_file}...")
+        result_df.to_pickle(output_file)
+        logger.info(f"Responses saved to {output_file}")
+
+    return result_df
+
+
+def process_requests(tokenized_df: pd.DataFrame, server_url: str,
+                     max_samples: int = None, max_tokens: int = 100,
+                     max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0,
+                     timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame:
+    """Run the full pipeline: send requests, extract IDs, detokenize, save.
+
+    Args:
+        tokenized_df: DataFrame with the pre-tokenized samples
+        server_url: Base URL of the SGLang server
+        max_samples: If not None, only the first max_samples rows are used
+        max_tokens: Maximum tokens to generate per request
+        max_concurrency: Upper bound on concurrent worker processes
+        output_file: Optional pickle path for the result DataFrame
+        temperature: Sampling temperature
+        top_k: Top-k sampling value
+        top_p: Top-p sampling value
+        timeout: Per-request timeout
+        pass_k: Number of inference passes per sample for pass@k strategy
+
+    Returns:
+        DataFrame produced by save_responses()
+    """
+    # Step 1: optionally restrict the run to the leading rows
+    if max_samples is not None:
+        tokenized_df = tokenized_df.head(max_samples)
+        logger.info(f"Limited to first {max_samples} samples")
+
+    # Step 2: fire every request (pass_k per sample) in parallel
+    responses_by_pass, latencies_by_pass = send_requests_parallel(
+        tokenized_df=tokenized_df,
+        server_url=server_url,
+        max_tokens=max_tokens,
+        max_concurrency=max_concurrency,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        timeout=timeout,
+        pass_k=pass_k)
+
+    # Step 3: pull the output token IDs out of each response
+    response_ids_by_pass = extract_response_ids(
+        responses_by_pass=responses_by_pass,
+        tokenized_df=tokenized_df,
+        pass_k=pass_k)
+
+    # Step 4: turn the token IDs back into text
+    detokenized_texts_by_pass = detokenize_output_ids(
+        response_ids_by_pass=response_ids_by_pass,
+        pass_k=pass_k)
+
+    # Step 5: merge everything into a DataFrame (and pickle it if requested)
+    return save_responses(
+        responses_by_pass=responses_by_pass,
+        response_ids_by_pass=response_ids_by_pass,
+        detokenized_texts_by_pass=detokenized_texts_by_pass,
+        latencies_by_pass=latencies_by_pass,
+        tokenized_df=tokenized_df,
+        pass_k=pass_k,
+        output_file=output_file)
+
+
+def main():
+    """Parse CLI arguments, check the server, run inference and log a summary.
+
+    Raises:
+        SystemExit: With status 1 when the initial test request to the
+            server fails, so that calling scripts can detect the failure.
+    """
+    parser = argparse.ArgumentParser(
+        description="Send pre-tokenized requests to SGLang server")
+    parser.add_argument("--input-tokens", required=True,
+                        help="Path to pickle file containing pre-tokenized data from harmony-tokens.py")
+    parser.add_argument("--server-url", default="http://localhost:30000",
+                        help="SGLang server URL (default: http://localhost:30000)")
+    parser.add_argument("--max-samples", type=int, default=None,
+                        help="Maximum number of samples to process (default: all)")
+    parser.add_argument("--max-tokens", type=int, default=100,
+                        help="Maximum tokens to generate per request")
+    parser.add_argument("--max-concurrency", type=int, default=256,
+                        help="Maximum number of concurrent requests (default: 256)")
+    parser.add_argument("--output", default=None,
+                        help="Output pickle file for responses (optional)")
+    parser.add_argument("--pass-k", type=int, default=1,
+                        help="Number of inference passes per sample for pass@k strategy (default: 1)")
+    parser.add_argument("--temperature", type=float, default=0.001,
+                        help="Temperature for sampling (default: 0.001)")
+    parser.add_argument("--top-k", type=int, default=1,
+                        help="Top-k for sampling (default: 1)")
+    parser.add_argument("--top-p", type=float, default=1.0,
+                        help="Top-p for sampling (default: 1.0)")
+    parser.add_argument("--timeout", type=int, default=1200,
+                        help="Timeout for requests (default: 1200)")
+
+    args = parser.parse_args()
+
+    # Test connection with a tiny dummy request before loading any data
+    logger.info(f"Testing server connection to {args.server_url}...")
+    test_client = SGLangClient(
+        server_url=args.server_url,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        timeout=args.timeout)
+
+    test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5)
+    if "error" in test_response:
+        logger.error(f"Server connection failed: {test_response['error']}")
+        logger.error("Make sure your SGLang server is running. Try:")
+        logger.error(
+            "  python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8")
+        # Exit non-zero: a plain return would report success (status 0)
+        # even though nothing was processed.
+        raise SystemExit(1)
+    logger.info("Server connection successful")
+
+    # Load pre-tokenized data
+    tokenized_df = load_tokenized_data(args.input_tokens)
+
+    # Process requests and get result DataFrame
+    result_df = process_requests(tokenized_df, args.server_url,
+                                 max_samples=args.max_samples,
+                                 max_tokens=args.max_tokens,
+                                 max_concurrency=args.max_concurrency,
+                                 output_file=args.output,
+                                 temperature=args.temperature,
+                                 top_k=args.top_k,
+                                 top_p=args.top_p,
+                                 timeout=args.timeout,
+                                 pass_k=args.pass_k)
+
+    # Print summary
+    logger.info(f"\nProcessing completed:")
+    logger.info(f"  - Total samples processed: {len(result_df)}")
+    logger.info(f"  - Number of passes per sample: {args.pass_k}")
+    # The input is not guaranteed to carry 'tok_input_len'; a KeyError here
+    # would abort the summary after all inference work has already finished.
+    if 'tok_input_len' in result_df.columns:
+        logger.info(
+            f"  - Average input token length: {result_df['tok_input_len'].mean():.1f}")
+    else:
+        logger.info(
+            "  - Average input token length: n/a (no 'tok_input_len' column)")
+
+    # Calculate average output length across all passes
+    if args.pass_k == 1:
+        avg_output_len = result_df['tok_model_output_len_0'].mean()
+        logger.info(f"  - Average output token length: {avg_output_len:.1f}")
+    else:
+        all_output_lens = []
+        for i in range(args.pass_k):
+            all_output_lens.extend(
+                result_df[f'tok_model_output_len_{i}'].tolist())
+        # Guard against an empty list (no rows or no passes)
+        avg_output_len = sum(all_output_lens) / \
+            len(all_output_lens) if all_output_lens else 0
+        logger.info(
+            f"  - Average output token length (across all passes): {avg_output_len:.1f}")
+
+    if args.output:
+        logger.info(f"  - Results saved to: {args.output}")
+    else:
+        logger.info("  - Results returned as DataFrame (not saved to file)")
+
+
+# Run the CLI entry point only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/archive/summarize_eval.py b/language/gpt-oss/archive/summarize_eval.py
new file mode 100644
index 0000000000..739097c68c
--- /dev/null
+++ b/language/gpt-oss/archive/summarize_eval.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""
+Summarize evaluation results from eval_accuracy.py output.
+
+Reads an evaluated pickle file and prints a summary of results by dataset,
+including per-pass statistics and aggregated pass@k results.
+"""
+
+import argparse
+import pickle
+import sys
+import json
+import glob
+from pathlib import Path
+from typing import Dict, Any, List
+import pandas as pd
+
+
+def expand_glob_patterns(patterns: List[str]) -> List[str]:
+    """Resolve a mix of literal paths and glob patterns into file paths.
+
+    A pattern without wildcard characters that names an existing path is
+    kept verbatim; anything else is handed to ``glob.glob``. Inputs that
+    yield nothing produce a warning on stderr and are otherwise skipped.
+
+    Args:
+        patterns: Literal file paths and/or glob patterns
+            (e.g., '*.pkl', 'data/*_evaluated.pkl')
+
+    Returns:
+        Sorted list of the unique resolved paths
+    """
+    wildcard_chars = ['*', '?', '[', ']']
+    resolved = set()
+
+    for pattern in patterns:
+        has_wildcard = any(ch in pattern for ch in wildcard_chars)
+        # Existing literal path: take it as-is, no globbing needed
+        if Path(pattern).exists() and not has_wildcard:
+            resolved.add(pattern)
+            continue
+        hits = glob.glob(pattern)
+        if hits:
+            resolved.update(hits)
+            continue
+        # Nothing matched: word the warning according to the input kind
+        if has_wildcard:
+            message = f"Warning: No files matched pattern: {pattern}"
+        else:
+            message = f"Warning: File not found: {pattern}"
+        print(message, file=sys.stderr)
+
+    return sorted(resolved)
+
+
+def detect_pass_k(df: pd.DataFrame) -> int:
+    """Count the per-pass output columns present in a results DataFrame.
+
+    Returns:
+        k when columns model_output_0 .. model_output_{k-1} exist, 1 when
+        only a plain model_output column exists, and 0 when neither does
+    """
+    columns = df.columns
+    # Walk the consecutive model_output_<i> suffixes starting from 0
+    num_passes = 0
+    while f'model_output_{num_passes}' in columns:
+        num_passes += 1
+
+    if num_passes:
+        return num_passes
+    return 1 if 'model_output' in columns else 0
+
+
+def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str,
+                            pass_num: int = None, pass_k: int = 1) -> Dict[str, Any]:
+    """Calculate statistics for a specific dataset and pass.
+
+    Args:
+        df: DataFrame with evaluation results (needs a 'dataset' column)
+        dataset_name: Name of the dataset to filter
+        pass_num: Pass number (None for aggregated results)
+        pass_k: Total number of passes (for aggregated results)
+
+    Returns:
+        Dictionary with keys 'dataset', 'total', 'answered', 'correct',
+        'pct_correct' (correct / total * 100) and 'mean_accuracy'
+
+    Raises:
+        KeyError: If a column required for the requested pass is missing
+    """
+    dataset_df = df[df['dataset'] == dataset_name]
+    columns = dataset_df.columns
+
+    if pass_num is not None:
+        # Specific pass: both columns carry the pass index as suffix
+        answered_mask = dataset_df[f'extracted_answer_{pass_num}'].notna()
+        accuracy = dataset_df[f'prompt_accuracy_{pass_num}']
+    else:
+        # Aggregated: prefer the pre-aggregated column. Without it, take the
+        # best score over the per-pass columns so a sample counts as correct
+        # when ANY pass was correct, instead of reporting pass 0 only.
+        per_pass_cols = [f'prompt_accuracy_{i}' for i in range(pass_k)
+                         if f'prompt_accuracy_{i}' in columns]
+        if 'prompt_accuracy' in columns:
+            accuracy = dataset_df['prompt_accuracy']
+        elif pass_k > 1 and per_pass_cols:
+            accuracy = dataset_df[per_pass_cols].max(axis=1)
+        else:
+            accuracy = dataset_df['prompt_accuracy_0']
+
+        if pass_k > 1:
+            # Answered when at least one pass has an extracted answer
+            answered_mask = pd.Series(False, index=dataset_df.index, dtype=bool)
+            for i in range(pass_k):
+                col = f'extracted_answer_{i}'
+                if col in columns:
+                    answered_mask |= dataset_df[col].notna()
+        else:
+            extracted_col = 'extracted_answer' if 'extracted_answer' in columns else 'extracted_answer_0'
+            answered_mask = dataset_df[extracted_col].notna()
+
+    total = len(dataset_df)
+    correct = (accuracy > 0).sum()
+    pct_correct = (correct / total) * 100 if total > 0 else 0.0
+    # Mean accuracy handles HealthBench partial scores. An empty or all-NaN
+    # selection has no mean: report 0.0 so the value stays valid JSON.
+    mean_accuracy = accuracy.mean()
+    if pd.isna(mean_accuracy):
+        mean_accuracy = 0.0
+    return {
+        'dataset': dataset_name,
+        'total': int(total),
+        'answered': int(answered_mask.sum()),
+        'correct': int(correct),
+        'pct_correct': float(pct_correct),
+        'mean_accuracy': float(mean_accuracy),
+    }
+
+
+def print_summary_table(
+        stats_list: List[Dict[str, Any]], title: str = "Summary"):
+    """Render per-dataset statistics as a fixed-width text table on stdout.
+
+    When more than one dataset is listed, an OVERALL row with the summed
+    counts and the pooled accuracy is appended.
+
+    Args:
+        stats_list: Per-dataset dictionaries providing the 'dataset',
+            'total', 'answered', 'correct' and 'pct_correct' entries
+        title: Heading printed above the table
+    """
+    heavy_rule = '=' * 85
+    light_rule = '-' * 85
+
+    def format_row(label, total, answered, correct, accuracy):
+        # One layout for dataset rows and the OVERALL row alike
+        return f"{label:<20} {total:>8} {answered:>10} {correct:>10} {accuracy:>11.2f}%"
+
+    print(f"\n{heavy_rule}")
+    print(f"{title}")
+    print(heavy_rule)
+    print(f"{'Dataset':<20} {'Total':>8} {'Answered':>10} {'Correct':>10} {'Accuracy':>12}")
+    print(light_rule)
+
+    for entry in stats_list:
+        print(format_row(entry['dataset'], entry['total'], entry['answered'],
+                         entry['correct'], entry['pct_correct']))
+
+    # Totals only make sense when several datasets are listed
+    if len(stats_list) > 1:
+        sum_total = sum(entry['total'] for entry in stats_list)
+        sum_answered = sum(entry['answered'] for entry in stats_list)
+        sum_correct = sum(entry['correct'] for entry in stats_list)
+        pooled_pct = sum_correct / sum_total * 100 if sum_total > 0 else 0.0
+        print(light_rule)
+        print(format_row('OVERALL', sum_total, sum_answered, sum_correct, pooled_pct))
+
+    print(heavy_rule)
+
+
+def _stats_per_dataset(df: pd.DataFrame, datasets: List[str],
+                       pass_num, pass_k: int) -> List[Dict[str, Any]]:
+    """Run calculate_dataset_stats for every dataset, keeping their order.
+
+    pass_num=None requests the aggregated (all passes) statistics.
+    """
+    return [calculate_dataset_stats(df, dataset, pass_num=pass_num, pass_k=pass_k)
+            for dataset in datasets]
+
+
+def _overall_stats(stats_list: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Sum per-dataset counts into the 'overall' entry of a summary.
+
+    Returns:
+        Dictionary with 'total', 'answered', 'correct' and 'accuracy', the
+        latter being correct / total * 100 (0.0 when there are no samples)
+    """
+    total = sum(s['total'] for s in stats_list)
+    correct = sum(s['correct'] for s in stats_list)
+    return {
+        'total': total,
+        'answered': sum(s['answered'] for s in stats_list),
+        'correct': correct,
+        'accuracy': (correct / total * 100) if total > 0 else 0.0,
+    }
+
+
+def summarize_evaluation(pickle_path: str, json_output: bool = False) -> str:
+    """Load and summarize evaluation results.
+
+    Summary tables are always printed: one per pass plus an aggregated one
+    for pass@k input, a single one otherwise. With json_output the same
+    numbers are also written to '<input stem>_summarize.json' next to the
+    input file.
+
+    Args:
+        pickle_path: Path to evaluated pickle file
+        json_output: If True, additionally save the results to a JSON file
+
+    Returns:
+        Path to JSON file if json_output=True, otherwise empty string
+    """
+    # Load the pickle file
+    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
+    print(f"Loading evaluation results from: {pickle_path}")
+    with open(pickle_path, 'rb') as f:
+        df = pickle.load(f)
+
+    print(f"Loaded {len(df)} samples")
+
+    # Detect pass@k format
+    pass_k = detect_pass_k(df)
+    print(f"Detected format: pass@{pass_k}" if pass_k >
+          1 else "Detected format: single-pass")
+
+    # Get list of datasets
+    datasets = sorted(df['dataset'].unique())
+    print(f"Datasets found: {', '.join(datasets)}")
+
+    # Structure to hold all results
+    results_data = {
+        'input_file': pickle_path,
+        'total_samples': len(df),
+        'pass_k': pass_k,
+        'datasets': list(datasets),
+    }
+
+    # Calculate statistics for each dataset
+    if pass_k > 1:
+        # Collect per-pass statistics
+        per_pass_results = []
+        for pass_num in range(pass_k):
+            stats_list = _stats_per_dataset(df, datasets, pass_num, pass_k)
+            print_summary_table(stats_list, title=f"Pass {pass_num} Results")
+            per_pass_results.append({
+                'pass_number': pass_num,
+                'datasets': stats_list,
+                'overall': _overall_stats(stats_list),
+            })
+
+        results_data['per_pass_results'] = per_pass_results
+
+        # Show aggregated (pass@k) statistics
+        print("\n")
+        stats_list = _stats_per_dataset(df, datasets, None, pass_k)
+        results_data['aggregated_results'] = {
+            'datasets': stats_list,
+            'overall': _overall_stats(stats_list),
+        }
+
+        # Always print summary table
+        print_summary_table(
+            stats_list,
+            title=f"Aggregated Pass@{pass_k} Results (Max Across Passes)")
+    else:
+        # Single pass - just show the results
+        stats_list = _stats_per_dataset(df, datasets, None, pass_k)
+        results_data['results'] = {
+            'datasets': stats_list,
+            'overall': _overall_stats(stats_list),
+        }
+
+        # Always print summary table
+        print_summary_table(stats_list, title="Evaluation Results")
+
+    # Print column information for reference
+    print("\nColumn Information:")
+    print("  - Total: Total number of samples in the dataset")
+    if pass_k > 1:
+        print("  - Answered: Number of samples with at least one extracted answer across all passes")
+    else:
+        print("  - Answered: Number of samples with extracted answers")
+    print("  - Correct: Number of correct answers (accuracy > 0)")
+    print("  - Accuracy: Percentage of total samples that were correct (correct / total)")
+
+    if pass_k > 1:
+        print(f"\nPass@{pass_k} Note:")
+        print("  - Per-pass results show individual pass performance")
+        print(
+            f"  - Aggregated results show the maximum accuracy achieved across all {pass_k} passes")
+        print(
+            f"  - A sample is considered correct if ANY of the {pass_k} attempts were correct")
+        print(
+            f"  - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer")
+
+    # Save to JSON if requested
+    if json_output:
+        # Generate output filename: input_file_summarize.json
+        input_path = Path(pickle_path)
+        output_filename = input_path.stem + "_summarize.json"
+        output_path = input_path.parent / output_filename
+
+        with open(output_path, 'w') as f:
+            json.dump(results_data, f, indent=2)
+
+        print(f"\nSummary saved to: {output_path}")
+        return str(output_path)
+
+    return ""
+
+
+def main():
+    """Command-line entry point: summarize every matching evaluated pickle.
+
+    Exits with status 1 when no input file can be resolved. A failure in one
+    file is reported on stderr with its traceback and does not stop the rest.
+    """
+    import traceback
+
+    usage_examples = "\n".join([
+        "Examples:",
+        "  %(prog)s results_evaluated.pkl",
+        "  %(prog)s data/*_evaluated.pkl",
+        "  %(prog)s --json data/accuracy_eval_*_evaluated.pkl",
+    ])
+    parser = argparse.ArgumentParser(
+        description="Summarize evaluation results by dataset. Supports glob patterns.",
+        epilog=usage_examples,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("input_files", nargs='+',
+                        help="One or more paths to evaluated pickle files or glob patterns (e.g., '*.pkl', 'data/*_evaluated.pkl')")
+    parser.add_argument("--json", action="store_true",
+                        help="Output results in JSON format (for programmatic use)")
+    options = parser.parse_args()
+
+    targets = expand_glob_patterns(options.input_files)
+    if not targets:
+        print("Error: No files found matching the provided patterns",
+              file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Found {len(targets)} file(s) to process:")
+    for path in targets:
+        print(f"  - {path}")
+    print()
+
+    for path in targets:
+        if "_evaluated" not in path:
+            print("Warning: Input file does not contain '_evaluated' suffix. "
+                  "Make sure this is an evaluated pickle file from eval_accuracy.py",
+                  file=sys.stderr)
+        try:
+            summarize_evaluation(path, json_output=options.json)
+            print()  # Add spacing between files
+        except Exception as err:
+            print(f"Error processing file {path}: {err}", file=sys.stderr)
+            traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/backends/__init__.py b/language/gpt-oss/backends/__init__.py
new file mode 100644
index 0000000000..3f68dc171c
--- /dev/null
+++ b/language/gpt-oss/backends/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""Backend implementations for gpt-oss inference."""
+
+from .base_backend import BaseBackend
+from .sglang_backend import SGLangBackend
+
+__all__ = [
+    "BaseBackend",
+    "SGLangBackend",
+]
diff --git a/language/gpt-oss/backends/base_backend.py b/language/gpt-oss/backends/base_backend.py
new file mode 100644
index 0000000000..228de1ced8
--- /dev/null
+++ b/language/gpt-oss/backends/base_backend.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Base backend class for gpt-oss inference."""
+
+import abc
+import logging
+from typing import List, Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class BaseBackend(abc.ABC):
+    """Abstract base class for inference backends.
+
+    Implement this interface to work with the MLPerf SUT; usable in `with`.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Store the configuration and mark the backend as not initialized.
+
+        Args:
+            config: Optional configuration dictionary (falsy becomes {})
+        """
+        self.initialized = False
+        self.config = config if config else {}
+        logger.info(f"Initializing {self.__class__.__name__}")
+
+    @abc.abstractmethod
+    def initialize(self) -> None:
+        """Prepare the backend for use (load model, connect to server, etc.)."""
+        raise NotImplementedError("Subclasses must implement initialize()")
+
+    @abc.abstractmethod
+    def generate(
+        self,
+        prompts: List[List[int]],
+        max_tokens: int = 100,
+        temperature: float = 0.001,
+        top_k: int = 1,
+        top_p: float = 1.0,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Produce one response per prompt in the batch.
+
+        Args:
+            prompts: Batch of prompts, each a sequence of token IDs
+            max_tokens: Upper bound on generated tokens per prompt
+            temperature: Sampling temperature
+            top_k: Top-k sampling parameter
+            top_p: Top-p (nucleus) sampling parameter
+            **kwargs: Additional backend-specific parameters
+
+        Returns:
+            List of response dictionaries with keys:
+                - output_ids: List of generated token IDs
+                - output_text: Generated text (optional)
+                - metadata: Additional metadata (latencies, etc.)
+        """
+        raise NotImplementedError("Subclasses must implement generate()")
+
+    @abc.abstractmethod
+    def cleanup(self) -> None:
+        """Release whatever resources the backend holds."""
+        raise NotImplementedError("Subclasses must implement cleanup()")
+
+    def __enter__(self):
+        """Initialize the backend and hand it to the with-block."""
+        self.initialize()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Clean up on leaving the with-block; exceptions are not suppressed."""
+        self.cleanup()
+
+    @property
+    def is_initialized(self) -> bool:
+        """Whether the backend is currently marked as initialized."""
+        return self.initialized
diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py
new file mode 100644
index 0000000000..c17ab37451
--- /dev/null
+++ b/language/gpt-oss/backends/sglang_backend.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+"""SGLang backend implementation for gpt-oss."""
+
+import asyncio
+import json
+import logging
+import requests
+import time
+from typing import List, Dict, Any, Optional, AsyncIterator
+import aiohttp
+from .base_backend import BaseBackend
+
+logger = logging.getLogger(__name__)
+
+
+class SGLangBackend(BaseBackend):
+    """SGLang inference backend using HTTP API.
+
+    Connects to an SGLang server running the gpt-oss model.
+    """
+
+    def __init__(
+        self,
+        server_url: str = "http://localhost:30000",
+        timeout: int = 1200,
+        max_pool_size: int = 2000,  # Default pool size for high concurrency
+        **kwargs
+    ):
+        """Record the connection settings; the session is created later.
+
+        Args:
+            server_url: URL of the SGLang server
+            timeout: Request timeout in seconds
+            max_pool_size: Maximum connection pool size (should be >= max_concurrency)
+            **kwargs: Extra settings, stored in the config next to the above
+        """
+        settings = dict(
+            server_url=server_url,
+            timeout=timeout,
+            max_pool_size=max_pool_size,
+        )
+        settings.update(kwargs)
+        super().__init__(settings)
+        self.session = None  # set by initialize()
+        self.max_pool_size = max_pool_size
+        self.timeout = timeout
+        self.server_url = server_url
+
+    def initialize(self) -> None:
+        """Open a pooled HTTP session and verify that the server answers.
+
+        Sends one tiny generation request as a probe; the backend is only
+        marked initialized when that request succeeds.
+
+        Raises:
+            ConnectionError: If the probe request reports an error
+        """
+        if self.initialized:
+            logger.warning("Backend already initialized")
+            return
+
+        logger.info(f"Connecting to SGLang server at {self.server_url}")
+        logger.info(f"Configuring connection pool with max_pool_size={self.max_pool_size}")
+        # Default pool size is 10; we may have 100s-1000s of concurrent requests
+        adapter = requests.adapters.HTTPAdapter(
+            # Clamp to >= 1: max_pool_size < 10 would otherwise give 0 pools
+            pool_connections=max(1, min(100, self.max_pool_size // 10)),
+            pool_maxsize=self.max_pool_size,  # should be >= max_concurrency
+            max_retries=3,                    # Retry failed requests
+            pool_block=False                  # Don't block when pool is full
+        )
+        self.session = requests.Session()
+        self.session.mount('http://', adapter)
+        self.session.mount('https://', adapter)
+
+        try:
+            test_response = self._send_request(
+                input_ids=[1, 2, 3], max_tokens=5,
+                temperature=0.001, top_k=1, top_p=1.0)
+            if "error" in test_response:
+                raise ConnectionError(
+                    f"Failed to connect to SGLang server: {test_response['error']}")
+        except Exception as e:
+            logger.error(f"Failed to initialize SGLang backend: {e}")
+            # Do not leave a half-configured session behind on failure
+            self.session.close()
+            self.session = None
+            raise
+        logger.info("Successfully connected to SGLang server")
+        self.initialized = True
+
+    def _send_request(
+        self,
+        input_ids: List[int],
+        max_tokens: int,
+        temperature: float,
+        top_k: int,
+        top_p: float
+    ) -> Dict[str, Any]:
+        """Send a single request to the SGLang server.
+
+        Transport failures, non-200 responses and undecodable bodies are
+        logged and returned as {"error": <message>} instead of being raised.
+
+        Args:
+            input_ids: Token IDs for the prompt
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_k: Top-k parameter
+            top_p: Top-p parameter
+
+        Returns:
+            Response dictionary from the server, or an "error" dictionary
+        """
+        payload = {
+            "input_ids": input_ids,
+            "sampling_params": {
+                "max_new_tokens": max_tokens,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+            }
+        }
+
+        try:
+            response = self.session.post(
+                f"{self.server_url}/generate", json=payload, timeout=self.timeout)
+            if response.status_code != 200:
+                logger.error(
+                    f"Request failed with status {response.status_code}: {response.text}")
+                return {"error": f"HTTP {response.status_code}: {response.text}"}
+            return response.json()
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError: a 200 response whose body is not valid JSON; not
+            # every requests release wraps that in a RequestException
+            logger.error(f"Request failed: {e}")
+            return {"error": str(e)}
+
+    def generate(
+        self,
+        prompts: List[List[int]],
+        max_tokens: int = 100,
+        temperature: float = 0.001,
+        top_k: int = 1,
+        top_p: float = 1.0,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Run the prompts through the server one after another.
+
+        Each prompt is sent as its own blocking request. A failed request
+        does not abort the batch: its entry has empty outputs and the error
+        message under metadata["error"].
+
+        Args:
+            prompts: List of token ID sequences
+            max_tokens: Maximum tokens to generate per prompt
+            temperature: Sampling temperature
+            top_k: Top-k sampling parameter
+            top_p: Top-p (nucleus) sampling parameter
+            **kwargs: Additional parameters (ignored)
+
+        Returns:
+            One dictionary per prompt, in input order, with keys:
+                - output_ids: List of generated token IDs
+                - output_text: Generated text (if available)
+                - metadata: latency (seconds), completion_tokens, error
+
+        Raises:
+            RuntimeError: If the backend is not initialized
+        """
+        if not self.initialized:
+            raise RuntimeError(
+                "Backend not initialized. Call initialize() first.")
+
+        sampling = dict(max_tokens=max_tokens, temperature=temperature,
+                        top_k=top_k, top_p=top_p)
+        results = []
+        for prompt_ids in prompts:
+            started = time.time()
+            response = self._send_request(input_ids=prompt_ids, **sampling)
+            latency = time.time() - started
+
+            # Outputs are only read when the response carries no error
+            failed = "error" in response
+            output_ids = [] if failed else response.get("output_ids", [])
+            output_text = "" if failed else response.get("text", "")
+            meta_info = response.get("meta_info", {})
+
+            results.append({
+                "output_ids": output_ids,
+                "output_text": output_text,
+                "metadata": {
+                    "latency": latency,
+                    # Server-side count, else the number of returned IDs
+                    "completion_tokens": meta_info.get(
+                        "completion_tokens", len(output_ids)),
+                    "error": response.get("error"),
+                },
+            })
+
+        return results
+
+    async def generate_stream(
+        self,
+        input_ids: List[int],
+        max_tokens: int = 100,
+        temperature: float = 0.001,
+        top_k: int = 1,
+        top_p: float = 1.0,
+        **kwargs
+    ) -> AsyncIterator[Dict[str, Any]]:
+        """Generate response with streaming support.
+
+        Yields incremental responses as tokens are generated. Every call
+        opens its own HTTP session. Request failures are not raised: they
+        end the stream with one chunk that has is_finished=True and an
+        "error" entry.
+
+        Args:
+            input_ids: Token IDs for the prompt
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_k: Top-k parameter
+            top_p: Top-p parameter
+            **kwargs: Not used by this method
+
+        Yields:
+            Dict with:
+                - delta_token_ids: List of new token IDs in this chunk
+                - delta_text: The "text" field of this chunk
+                - is_first_token: True for the chunk carrying the first output
+                - is_finished: True if generation is complete
+                - accumulated_token_ids: All tokens generated so far
+                - accumulated_text: All chunk texts so far, concatenated
+                - metadata: ttft_ms and latency_ms plus the chunk's meta_info
+                - error: Error message (failure chunks only)
+
+        Raises:
+            RuntimeError: On first iteration if the backend is not initialized
+
+        Note:
+            SGLang's streaming API behavior:
+            - Returns 'output_ids', 'text', and 'meta_info' in each chunk
+            - 'output_ids' can have retractions (length can decrease between chunks)
+            - 'meta_info.completion_tokens' is the RELIABLE cumulative token count
+            - 'finish_reason' in meta_info indicates completion (not a 'finished' flag)
+            - We use completion_tokens for accurate LoadGen token/sec metrics
+            - Token IDs that cannot be taken from the chunk are replaced by
+              placeholders, so only the token COUNT is dependable
+        """
+        if not self.initialized:
+            raise RuntimeError(
+                "Backend not initialized. Call initialize() first.")
+
+        payload = {
+            "input_ids": input_ids,
+            "sampling_params": {
+                "max_new_tokens": max_tokens,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+            },
+            "stream": True  # Enable streaming
+        }
+
+        start_time = time.time()
+        first_token_time = None
+        accumulated_token_ids = []
+        accumulated_text = ""
+
+        def error_chunk(message: str) -> Dict[str, Any]:
+            # Terminal chunk: same keys as a data chunk plus "error", built
+            # from whatever had been accumulated when the failure occurred
+            return {
+                "delta_token_ids": [],
+                "delta_text": "",
+                "is_first_token": False,
+                "is_finished": True,
+                "accumulated_token_ids": accumulated_token_ids.copy(),
+                "accumulated_text": accumulated_text,
+                "error": message,
+                "metadata": {}
+            }
+
+        # Streaming uses a per-call aiohttp session; self.session (requests)
+        # is only used by the blocking request path
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"{self.server_url}/generate",
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=self.timeout)
+                ) as response:
+                    if response.status != 200:
+                        error_text = await response.text()
+                        logger.error(
+                            f"Streaming request failed: {response.status} - {error_text}")
+                        yield error_chunk(f"HTTP {response.status}: {error_text}")
+                        return
+
+                    # Read streaming response
+                    async for line in response.content:
+                        if not line:
+                            continue
+
+                        # SGLang sends data as "data: {...}\n\n"
+                        line_str = line.decode('utf-8').strip()
+                        if not line_str.startswith('data:'):
+                            continue
+
+                        # Remove "data:" prefix
+                        json_str = line_str[5:].strip()
+                        if json_str == '[DONE]':
+                            break
+
+                        try:
+                            chunk = json.loads(json_str)
+                        except json.JSONDecodeError as e:
+                            # Skip the malformed event, keep the stream alive
+                            logger.warning(
+                                f"Failed to parse streaming chunk: {e}")
+                            continue
+
+                        # Extract text delta
+                        delta_text = chunk.get("text", "")
+
+                        # Check if this is the final chunk
+                        # SGLang uses 'finish_reason' in meta_info, not 'finished' flag
+                        meta_info = chunk.get("meta_info", {})
+                        finish_reason = meta_info.get("finish_reason")
+                        is_finished = (
+                            (finish_reason is not None and finish_reason != "null")
+                            or chunk.get("finished", False))
+
+                        # Extract token information from chunk
+                        # SGLang's output_ids can have retractions, so use meta_info.completion_tokens
+                        # which is the reliable cumulative count
+                        chunk_output_ids = chunk.get("output_ids", [])
+                        completion_tokens = meta_info.get("completion_tokens", 0)
+                        delta_token_ids = []
+
+                        if completion_tokens > 0:
+                            # Use completion_tokens as the authoritative count
+                            previous_count = len(accumulated_token_ids)
+                            num_new_tokens = completion_tokens - previous_count
+                            if num_new_tokens > 0:
+                                if chunk_output_ids and len(chunk_output_ids) >= num_new_tokens:
+                                    # Use actual token IDs from chunk
+                                    delta_token_ids = chunk_output_ids[-num_new_tokens:]
+                                else:
+                                    # Fallback: create placeholder tokens for counting
+                                    delta_token_ids = list(
+                                        range(previous_count, completion_tokens))
+                        elif chunk_output_ids:
+                            # No completion_tokens - fall back to output_ids
+                            delta_token_ids = chunk_output_ids
+                        elif delta_text:
+                            # Last resort: estimate the count from text length
+                            # (one placeholder per 4 characters, at least one)
+                            delta_token_ids = [0] * max(1, len(delta_text) // 4)
+
+                        accumulated_token_ids.extend(delta_token_ids)
+
+                        # Accumulate text
+                        if delta_text:
+                            accumulated_text += delta_text
+
+                        # The first chunk that carries any output fixes the TTFT
+                        is_first_token = first_token_time is None and bool(
+                            delta_token_ids or delta_text)
+                        if is_first_token:
+                            first_token_time = time.time()
+
+                        yield {
+                            "delta_token_ids": delta_token_ids,
+                            "delta_text": delta_text,
+                            "is_first_token": is_first_token,
+                            "is_finished": is_finished,
+                            "accumulated_token_ids": accumulated_token_ids.copy(),
+                            "accumulated_text": accumulated_text,
+                            "metadata": {
+                                "ttft_ms": (first_token_time - start_time) * 1000 if first_token_time else None,
+                                "latency_ms": (time.time() - start_time) * 1000,
+                                **meta_info
+                            }
+                        }
+
+                        # Stop reading once the server signalled completion
+                        if is_finished:
+                            break
+
+        except asyncio.TimeoutError:
+            # The request exceeded the ClientTimeout configured above
+            logger.error(f"Streaming request timed out after {self.timeout}s")
+            yield error_chunk("Timeout")
+        except Exception as e:
+            # Anything else (connection problems, unexpected chunk contents)
+            # also ends the stream with an error chunk instead of raising
+            logger.error(f"Streaming request failed: {e}", exc_info=True)
+            yield error_chunk(str(e))
+
+    def cleanup(self) -> None:
+        """Close the HTTP session, if one is open, and mark the backend unready."""
+        if self.session is not None:
+            self.session.close()
+            self.session = None
+        self.initialized = False
+        logger.info("SGLang backend cleaned up")
diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py
new file mode 100644
index 0000000000..2aa2aa0eae
--- /dev/null
+++ b/language/gpt-oss/eval_accuracy.py
@@ -0,0 +1,998 @@
+#!/usr/bin/env python3
+"""
+Standalone evaluation script for mlperf-inference deepseek-r1 dataset.
+
+Expected input format (pickle file with DataFrame):
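+- model_output: The model's response text (model_output_0, model_output_1, ... for pass@k)
+- tok_model_output_len: The length of the model's response tokens (suffixed _0, _1, ... for pass@k)
+- ground_truth: The expected answer (the question id for livecodebench); a 'rubrics' column is accepted instead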
+- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 'math500', 'livecodebench', 'aime')
+- question: The question text
+
+Output adds columns:
+- extracted_answer: Parsed answer from model output
+- prompt_accuracy: 100.0 if correct, 0.0 if incorrect
+- evaluation_details: Detailed evaluation explanation
+"""
+
+import sys
+import os
+import argparse
+import logging
+import pickle
+import re
+import shutil
+import time
+from functools import lru_cache
+from typing import Dict, Any, Optional, Tuple, Union
+import pandas as pd
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
+import multiprocessing
+from pathlib import Path
+from contextlib import redirect_stdout, redirect_stderr
+
+# MLPerf log processing imports
+import numpy as np
+from transformers import AutoTokenizer
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# =============================================================================
+# Input Validation
+# =============================================================================
+
+
+def detect_pass_k(df: pd.DataFrame) -> int:
+    """Count the model output passes present in a DataFrame.
+
+    Returns:
+        k when columns model_output_0 .. model_output_{k-1} exist, 1 when only
+        a plain 'model_output' column exists, and 0 when neither is present.
+    """
+    columns = df.columns
+    suffixed = 0
+    # Suffixed columns must be contiguous from 0; the first gap ends the count.
+    while f'model_output_{suffixed}' in columns:
+        suffixed += 1
+
+    if suffixed:
+        return suffixed
+    return 1 if 'model_output' in columns else 0
+
+
+def validate_dataframe(df: pd.DataFrame) -> None:
+    """Check that the input is a DataFrame carrying every required column.
+
+    Raises:
+        ValueError: if df is not a DataFrame, or if it lacks model output
+            columns, 'dataset', the matching tok_model_output_len column(s),
+            or both of 'ground_truth' and 'rubrics'.
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("Input must be a pandas DataFrame")
+
+    columns = df.columns
+    pass_k = detect_pass_k(df)
+    if pass_k == 0:
+        raise ValueError(
+            "No model_output columns found (expected 'model_output' or 'model_output_0', 'model_output_1', etc.)")
+
+    if 'dataset' not in columns:
+        raise ValueError("Missing required column: 'dataset'")
+
+    # Single-pass data uses the unsuffixed length column; pass@k data needs
+    # one suffixed length column per pass.
+    if pass_k == 1:
+        required_len_cols = ['tok_model_output_len']
+    else:
+        required_len_cols = [
+            f'tok_model_output_len_{i}' for i in range(pass_k)]
+    if any(col not in columns for col in required_len_cols):
+        raise ValueError("Missing required tok_model_output_len column(s)")
+
+    # Either a reference answer or a rubric column must be available.
+    if 'ground_truth' not in columns and 'rubrics' not in columns:
+        raise ValueError(
+            "DataFrame must have either 'ground_truth' or 'rubrics' column")
+
+
+def validate_text_input(text: Any) -> str:
+    """Return text as a stripped str; None and scalar NaN/NA become ""."""
+    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
+        return ""
+    return str(text).strip()
+
+
+def validate_dataset_name(dataset: Any) -> str:
+    """Return the lower-cased dataset name, rejecting NaN and empty values."""
+    if not pd.isna(dataset) and dataset:
+        return str(dataset).lower()
+    raise ValueError("Dataset name cannot be empty")
+
+
+# =============================================================================
+# Answer Parsing Functions
+# =============================================================================
+
+def parse_multiple_choice(text: str, max_option: str = 'D') -> Optional[str]:
+    """Extract the last multiple-choice letter (A..max_option) from text.
+
+    The last "ANSWER"/"FINAL ANSWER" marker wins. Only when max_option is 'J'
+    does the parser fall back to the last standalone A-J letter.
+    """
+    cleaned = validate_text_input(text)
+    if not cleaned:
+        return None
+
+    # Unwrap a stringified one-element list such as "['...']" or '["..."]'.
+    wrapped = cleaned.startswith(("['", '["')) and cleaned.endswith(("']", '"]'))
+    if wrapped:
+        cleaned = cleaned[2:-2].strip()
+
+    # Turn literal backslash escapes into the characters they stand for.
+    cleaned = cleaned.replace(r'\n', '\n').replace(r'\'', "'")
+
+    pattern = rf"\b(?:ANSWER|FINAL\s*ANSWER)\b\s*[:=]?\s*(?:\(?\s*([A-{max_option}])\s*\)?)(?:\s*$|[^A-Za-z])"
+    letters = re.findall(pattern, cleaned, re.IGNORECASE)
+    if not letters and max_option == 'J':
+        # MMLU-Pro fallback: standalone letter
+        letters = re.findall(r"\b([A-J])\b", cleaned, re.IGNORECASE)
+
+    if letters:
+        return letters[-1].upper()
+    return None
+
+
+def parse_boxed_math(text: str) -> Optional[str]:
+    """Return the contents of the last \\boxed{...} group in text.
+
+    Nested braces inside the group are balanced; None is returned when there
+    is no \\boxed{ marker or its closing brace is missing.
+    """
+    cleaned = validate_text_input(text)
+    marker = r"\boxed{"
+    start = cleaned.rfind(marker) if cleaned else -1
+    if start == -1:
+        return None
+
+    body_start = start + len(marker)
+    open_braces = 0
+    for offset, char in enumerate(cleaned[body_start:]):
+        if char == '{':
+            open_braces += 1
+        elif char == '}':
+            if open_braces == 0:
+                return cleaned[body_start:body_start + offset].strip()
+            open_braces -= 1
+    return None
+
+
+def parse_aime_answer(text: str) -> Optional[int]:
+    """Extract an AIME answer, an integer in the range 0-999, from text.
+
+    The last \\boxed{digits} occurrence takes priority; otherwise the last
+    "Answer: digits" occurrence is used. Out-of-range values yield None.
+    """
+    cleaned = validate_text_input(text)
+    if not cleaned:
+        return None
+
+    # Priority 1: \boxed{digits}
+    candidates = re.findall(r"\\boxed{\s*(\d+)\s*}", cleaned)
+    if not candidates:
+        # Priority 2: Answer: <digits>
+        candidates = re.findall(
+            r"Answer:\s*(\d+)(?!\.)\b", cleaned, re.IGNORECASE | re.MULTILINE)
+    if not candidates:
+        return None
+
+    try:
+        value = int(candidates[-1])
+    except ValueError:
+        return None
+
+    in_range = 0 <= value <= 999
+    return value if in_range else None
+
+
+def parse_code(text: str) -> Optional[str]:
+    """Extract the last fenced code block from a model response.
+
+    Lookup order:
+    1. Last ```python fenced block
+    2. Last plain ``` fenced block, with a leading python/py tag line removed
+    Returns None when the text is empty or holds no fenced block.
+    """
+    cleaned = validate_text_input(text)
+    if not cleaned:
+        return None
+
+    # Most specific first: explicitly tagged Python fences.
+    tagged_blocks = re.findall(r"```python(.*?)```", cleaned, re.DOTALL)
+    if tagged_blocks:
+        return tagged_blocks[-1].strip()
+
+    # Otherwise accept any fenced block.
+    plain_blocks = re.findall(r"```(.*?)```", cleaned, re.DOTALL)
+    if not plain_blocks:
+        return None
+
+    # Remove language tag if present (e.g., ```python\n or ```py\n)
+    last_block = plain_blocks[-1].strip()
+    return re.sub(
+        r'^(?:python|py)\s*\n', '', last_block, flags=re.IGNORECASE)
+
+
+# =============================================================================
+# Answer Evaluation Functions
+# =============================================================================
+
+def evaluate_multiple_choice(
+        parsed: Optional[str], ground_truth: str, valid_options: str) -> bool:
+    """Return True when parsed is a valid option equal to ground_truth.
+
+    Both values are compared case-insensitively (upper-cased first).
+    """
+    if not (parsed and ground_truth):
+        return False
+    answer, expected = parsed.upper(), ground_truth.upper()
+    return answer in valid_options and answer == expected
+
+
+def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool:
+    """Grade a MATH-500 answer with the PRM800K grader (False on empty input).
+
+    Raises FileNotFoundError when the prm800k submodule directory is missing
+    and ImportError (chained to the original) when its grader cannot be imported.
+    """
+    if not parsed or not ground_truth:
+        return False
+
+    parsed = str(parsed).strip()
+    ground_truth = str(ground_truth)
+    if not parsed:
+        return False
+
+    # Use sys.path approach for proper module importing
+    workspace_path = os.path.dirname(os.path.abspath(__file__))
+    prm800k_module_path = os.path.join(
+        workspace_path, "submodules", "prm800k", "prm800k")
+
+    if not os.path.exists(prm800k_module_path):
+        raise FileNotFoundError(
+            f"PRM800K module not found at: {prm800k_module_path}")
+
+    # Save current directory and sys.path
+    original_cwd = os.getcwd()
+    original_syspath = sys.path.copy()
+
+    try:
+        if prm800k_module_path not in sys.path:
+            sys.path.insert(0, prm800k_module_path)
+
+        # Change directory as some imports might use relative paths
+        os.chdir(prm800k_module_path)
+        from grading.grader import grade_answer
+        result = grade_answer(given_answer=parsed, ground_truth=ground_truth)
+    except ImportError as e:
+        raise ImportError(f"Failed to import PRM800K grader: {e}") from e
+    finally:
+        # Always restore original directory and sys.path
+        os.chdir(original_cwd)
+        sys.path[:] = original_syspath
+
+    return result
+
+
+def evaluate_aime(parsed: Optional[int], ground_truth: Any) -> bool:
+    """Return True when parsed equals ground_truth after int() conversion.
+
+    A None answer or a value that int() rejects yields False.
+    """
+    try:
+        # The None check short-circuits before any conversion is attempted.
+        return parsed is not None and int(ground_truth) == int(parsed)
+    except (ValueError, TypeError):
+        return False
+
+
+@lru_cache(maxsize=1)
+def load_lcb_benchmark() -> Dict[str, Any]:
+    """Load LiveCodeBench once (cached) as a question_id -> instance dict."""
+    lcb_dir = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), "submodules", "LiveCodeBench"))
+
+    if not os.path.isdir(lcb_dir):
+        raise FileNotFoundError(
+            f"LiveCodeBench submodule required at: {lcb_dir}")
+    # Load from inside the checkout; the original cwd is restored in finally.
+    original_cwd = os.getcwd()
+    os.chdir(lcb_dir)
+    # Make lcb_runner importable; this sys.path entry is not removed afterwards.
+    if lcb_dir not in sys.path:
+        sys.path.insert(0, lcb_dir)
+
+    try:
+        os.environ['TQDM_DISABLE'] = '1'
+
+        from lcb_runner.utils.scenarios import Scenario
+        from lcb_runner.runner.scenario_router import build_prompt_benchmark
+        # Stand-in for lcb_runner's parsed CLI arguments — TODO confirm which fields are read.
+        mock_args = argparse.Namespace(
+            scenario=Scenario.codegeneration, release_version="release_v6",
+            subset="code_generation", language="python", not_fast=False,
+            start_date=None, end_date=None, k=[1], num_samples=1,
+            timeout=60, num_workers=1, num_process_evaluate=1,
+            model_name="standalone_eval", output_dir="/tmp",
+            prompt_type="custom", continue_existing=False, evaluate=True
+        )
+        # Key the instances by question_id for direct lookup.
+        full_benchmark, _ = build_prompt_benchmark(mock_args)
+        return {inst.question_id: inst for inst in full_benchmark}
+
+    finally:
+        os.chdir(original_cwd)
+        os.environ.pop('TQDM_DISABLE', None)
+
+
+def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool:
+    """Run the LiveCodeBench tests for code and report only pass/fail.
+
+    Returns:
+        bool: the pass flag from evaluate_livecodebench_detailed; its
+            explanatory message is dropped
+    """
+    return evaluate_livecodebench_detailed(code, question_id)[0]
+
+
+def evaluate_livecodebench_detailed(
+        code: Optional[str], question_id: str) -> Tuple[bool, str]:
+    """Evaluate LiveCodeBench code generation with detailed results.
+
+    Each call runs in its own freshly created temporary output directory.
+
+    Returns:
+        Tuple[bool, str]: (passed, detailed_reason)
+            - passed: True if all tests passed, False otherwise
+            - detailed_reason: Description of test results or error
+    """
+    if not code or not question_id:
+        return False, "No code or question_id provided"
+
+    lcb_dir = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), "submodules", "LiveCodeBench"))
+
+    try:
+        benchmark_map = load_lcb_benchmark()
+    except Exception as e:
+        return False, f"Failed to load benchmark: {type(e).__name__}: {e}"
+
+    instance = benchmark_map.get(question_id)
+    if not instance:
+        return False, f"Question ID '{question_id}' not found in benchmark"
+
+    import tempfile  # function-scope import keeps this block self-contained
+    original_cwd = os.getcwd()
+    # Unique per call: a question_id + timestamp name can collide across workers.
+    temp_dir = tempfile.mkdtemp(prefix="temp_lcb_eval_")
+
+    try:
+        os.chdir(lcb_dir)
+        os.environ['TQDM_DISABLE'] = '1'
+
+        from lcb_runner.utils.scenarios import Scenario
+        from lcb_runner.evaluation import extract_instance_results
+        from lcb_runner.runner.scenario_router import sort_and_extract_save_results, get_metrics
+
+        mock_args = argparse.Namespace(
+            scenario=Scenario.codegeneration, release_version="release_v6",
+            subset="code_generation", language="python", not_fast=False,
+            start_date=None, end_date=None, k=[1], num_samples=1,
+            timeout=60, num_workers=1, num_process_evaluate=1,
+            model_name="inline_handler_eval", output_dir=temp_dir,
+            prompt_type="custom", continue_existing=False, evaluate=True,
+        )
+
+        batch_benchmark = [instance]
+        batch_custom_outputs = [[code]]
+
+        save_results = [inst.insert_output(output, output)
+                        for inst, output in zip(batch_benchmark, batch_custom_outputs)]
+
+        _, combined_results = sort_and_extract_save_results(
+            mock_args.scenario, save_results)
+        _, instance_results, _ = get_metrics(
+            mock_args.scenario, mock_args, batch_benchmark, combined_results
+        )
+
+        graded = extract_instance_results(instance_results)
+        # Coerce to a real bool: the and-chain can yield e.g. an empty list.
+        passed = bool(graded and graded[0] and graded[0][0])
+
+        # Try to extract detailed results
+        detailed_reason = ""
+        try:
+            if combined_results and len(combined_results) > 0:
+                result_info = combined_results[0]
+                if hasattr(result_info, 'result') and result_info.result:
+                    test_results = result_info.result
+                    if isinstance(test_results, dict):
+                        detailed_reason = f"Test results: {test_results}"
+                    elif isinstance(test_results, list):
+                        num_passed = sum(1 for r in test_results if r)
+                        num_total = len(test_results)
+                        detailed_reason = f"Passed {num_passed}/{num_total} test cases"
+                    else:
+                        detailed_reason = f"Result: {test_results}"
+                elif hasattr(result_info, 'status'):
+                    detailed_reason = f"Status: {result_info.status}"
+        except Exception:
+            pass  # best effort: the generic message below is used instead
+
+        if not detailed_reason:
+            detailed_reason = "All tests passed" if passed else "Failed one or more test cases"
+
+        return passed, detailed_reason
+
+    except Exception as e:
+        return False, f"Evaluation error: {type(e).__name__}: {str(e)[:200]}"
+    finally:
+        os.chdir(original_cwd)
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        os.environ.pop('TQDM_DISABLE', None)
+
+
+def evaluate_livecodebench_worker(
+        args: Tuple[str, str]) -> Tuple[str, bool, str]:
+    """Evaluate one (code, question_id) work item inside a pool worker.
+
+    Output written to stdout/stderr during the evaluation is discarded, and
+    any exception is converted into a failed result instead of propagating.
+
+    Returns:
+        Tuple[str, bool, str]: (question_id, passed, detailed_reason)
+    """
+    code, question_id = args
+
+    try:
+        # Silence the evaluation so worker output cannot pollute the parent's.
+        with open(os.devnull, 'w') as sink, \
+                redirect_stdout(sink), redirect_stderr(sink):
+            os.environ['TQDM_DISABLE'] = '1'
+            passed, reason = evaluate_livecodebench_detailed(code, question_id)
+    except Exception as e:
+        # Don't use logger here as it might output to stdout in worker process
+        return question_id, False, f"Error evaluating {question_id}: {type(e).__name__}: {e}"
+    return question_id, passed, reason
+
+
+# =============================================================================
+# Dataset Configuration
+# =============================================================================
+
+DATASET_EVALUATORS = {
+    'gpqa': {
+        'parse': lambda text: parse_multiple_choice(text, 'D'),
+        'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCD')
+    },
+    'mmlu_pro': {
+        'parse': lambda text: parse_multiple_choice(text, 'J'),
+        'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ')
+    },
+    'math500': {
+        'parse': parse_boxed_math,
+        'evaluate': evaluate_math500
+    },
+    'aime': {
+        'parse': parse_aime_answer,
+        'evaluate': evaluate_aime
+    },
+    'livecodebench': {
+        'parse': parse_code,
+        'evaluate': evaluate_livecodebench
+    },
+    'mmlu': {
+        'parse': lambda text: parse_multiple_choice(text, 'J'),
+        'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ')
+    },
+    # NOTE: get_evaluator() takes the first key found as a substring, so 'mmlu_pro' must stay ahead of 'mmlu'.
+}
+
+
+def get_evaluator(dataset_name: str) -> Dict[str, Any]:
+    """Return the first registered evaluator whose key occurs in the name."""
+    normalized = validate_dataset_name(dataset_name)
+    # Substring match in registration order, e.g. 'mmlu_pro' wins over 'mmlu'.
+    found = next((handlers for key, handlers in DATASET_EVALUATORS.items()
+                  if key in normalized), None)
+    if found is None:
+        raise ValueError(f"No evaluator found for dataset: {dataset_name}")
+    return found
+
+
+# =============================================================================
+# Main Processing Functions
+# =============================================================================
+
+def process_row(row: pd.Series) -> Dict[str, Any]:
+    """Parse and grade the single-pass 'model_output' of one row.
+
+    Returns:
+        Dict with 'extracted_answer' and 'prompt_accuracy' (100.0 or 0.0).
+    """
+    dataset = validate_dataset_name(row['dataset'])
+    response = validate_text_input(row['model_output'])
+    expected = row['ground_truth']
+    handlers = get_evaluator(dataset)
+    answer = handlers['parse'](response)
+    gradable = answer is not None and not pd.isna(expected)
+    score = 100.0 if gradable and handlers['evaluate'](answer, expected) else 0.0
+    return {
+        'extracted_answer': answer,
+        'prompt_accuracy': score
+    }
+
+
+def process_dataframe(df: pd.DataFrame,
+                      num_lcb_workers: int = 64) -> pd.DataFrame:
+    """Parse and grade every row, returning an annotated copy of df.
+
+    Args:
+        df: Input DataFrame to evaluate (validated first; never modified in place)
+        num_lcb_workers: Maximum number of parallel workers for LiveCodeBench evaluation
+
+    Supports both single-pass and pass@k formats:
+    - Single-pass: model_output -> extracted_answer, prompt_accuracy, evaluation_details
+    - Pass@k: model_output_0, model_output_1, ... -> extracted_answer_0, prompt_accuracy_0, ...
+              and aggregated prompt_accuracy = max(prompt_accuracy_0, prompt_accuracy_1, ...)
+    """
+    validate_dataframe(df)
+
+    df_output = df.copy()
+
+    # Detect pass@k
+    pass_k = detect_pass_k(df)
+    logger.info(f"Detected pass@k format with k={pass_k}")
+
+    # Initialize columns for each pass
+    for pass_num in range(pass_k):
+        suffix = f'_{pass_num}' if pass_k > 1 else ''
+        df_output[f'extracted_answer{suffix}'] = None
+        df_output[f'prompt_accuracy{suffix}'] = 0.0
+        df_output[f'evaluation_details{suffix}'] = None
+
+    # Add aggregated columns for pass@k
+    if pass_k > 1:
+        df_output['prompt_accuracy'] = 0.0  # Will be max of all passes
+        df_output['evaluation_details'] = None  # Will aggregate details
+
+    # Check if we have LiveCodeBench datasets to evaluate
+    has_livecodebench = any('livecodebench' in str(ds).lower()
+                            for ds in df_output['dataset'].unique())
+
+    # Pre-load LiveCodeBench benchmark and create shared process pool for all
+    # LCB evaluations
+    lcb_executor = None
+    if has_livecodebench:
+        try:
+            logger.info(
+                "Pre-loading LiveCodeBench benchmark for worker processes...")
+            # Load benchmark in the main process before workers start; presumably
+            # they inherit it via fork — TODO confirm the start method in use
+            _ = load_lcb_benchmark()
+            logger.info("LiveCodeBench benchmark loaded successfully")
+
+            # Create a single process pool for all LCB evaluations
+            max_workers = min(multiprocessing.cpu_count(), num_lcb_workers)
+            lcb_executor = ProcessPoolExecutor(max_workers=max_workers)
+            logger.info(
+                f"Created shared ProcessPoolExecutor with {max_workers} workers for LiveCodeBench")
+        except Exception as e:
+            logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}")
+            logger.warning("Will fall back to per-evaluation loading")
+
+    try:
+        # Process by dataset
+        for dataset_name, group_indices in tqdm(df_output.groupby('dataset').groups.items(),
+                                                desc="Processing datasets"):
+            evaluator = get_evaluator(dataset_name)
+
+            # For LiveCodeBench, always use batched evaluation across all
+            # passes
+            is_livecodebench = 'livecodebench' in dataset_name.lower()
+            if is_livecodebench:
+                # Validate prerequisites for batched LCB evaluation
+                if lcb_executor is None:
+                    raise RuntimeError(
+                        "LiveCodeBench evaluation requires a shared executor, but it was not initialized. "
+                        "This may indicate the LiveCodeBench benchmark failed to load.")
+
+                # Parse all passes first
+                logger.info(
+                    f"Parsing {len(group_indices)} rows for dataset '{dataset_name}' across {pass_k} passes")
+                for pass_num in range(pass_k):
+                    suffix = f'_{pass_num}' if pass_k > 1 else ''
+                    model_output_col = f'model_output{suffix}'
+                    extracted_answer_col = f'extracted_answer{suffix}'
+                    evaluation_details_col = f'evaluation_details{suffix}'
+
+                    for idx in group_indices:
+                        row = df_output.loc[idx]
+                        raw_output = validate_text_input(row[model_output_col])
+                        extracted = evaluator['parse'](raw_output)
+                        df_output.at[idx, extracted_answer_col] = extracted
+
+                        if extracted is None or pd.isna(extracted):
+                            df_output.at[idx,
+                                         evaluation_details_col] = "No answer extracted from model output"
+
+                # Collect all work items from all passes
+                all_work_items = []
+                work_item_metadata = []  # (idx, pass_num)
+                for pass_num in range(pass_k):
+                    suffix = f'_{pass_num}' if pass_k > 1 else ''
+                    extracted_answer_col = f'extracted_answer{suffix}'
+                    for idx in group_indices:
+                        row = df_output.loc[idx]
+                        extracted = row.get(extracted_answer_col)
+                        ground_truth = row.get('ground_truth')
+
+                        if extracted is not None and not pd.isna(ground_truth):
+                            all_work_items.append((extracted, ground_truth))
+                            work_item_metadata.append((idx, pass_num))
+
+                if all_work_items:
+                    # Submit all work at once for maximum parallelism
+                    max_workers = min(
+                        multiprocessing.cpu_count(), len(all_work_items), num_lcb_workers)
+                    logger.info(
+                        f"Evaluating {len(all_work_items)} LiveCodeBench items across {pass_k} passes with {max_workers} workers")
+
+                    future_to_metadata = {
+                        lcb_executor.submit(evaluate_livecodebench_worker, work_item): metadata
+                        for work_item, metadata in zip(all_work_items, work_item_metadata)
+                    }
+                    # Collect results per pass. NOTE(review): a 1200s as_completed()
+                    # timeout is raised by the loop itself, outside the inner try.
+                    pass_results = {i: {'correct': 0, 'total': 0}
+                                    for i in range(pass_k)}
+
+                    for future in tqdm(as_completed(future_to_metadata, timeout=1200),
+                                       total=len(future_to_metadata),
+                                       desc=f"Evaluating LiveCodeBench (all passes)"):
+                        idx, pass_num = future_to_metadata[future]
+                        suffix = f'_{pass_num}' if pass_k > 1 else ''
+                        prompt_accuracy_col = f'prompt_accuracy{suffix}'
+                        evaluation_details_col = f'evaluation_details{suffix}'
+
+                        try:
+                            question_id, is_correct, detailed_reason = future.result(
+                                timeout=80)
+                            df_output.at[idx,
+                                         prompt_accuracy_col] = 100.0 if is_correct else 0.0
+                            df_output.at[idx,
+                                         evaluation_details_col] = detailed_reason
+                            pass_results[pass_num]['total'] += 1
+                            if is_correct:
+                                pass_results[pass_num]['correct'] += 1
+                        except TimeoutError:
+                            logger.warning(
+                                f"Timeout evaluating row {idx} pass {pass_num}: Test execution exceeded 80s timeout")
+                            df_output.at[idx, prompt_accuracy_col] = 0.0
+                            df_output.at[idx,
+                                         evaluation_details_col] = "Timeout: Test execution exceeded time limit"
+                            pass_results[pass_num]['total'] += 1
+                        except Exception as e:
+                            logger.error(
+                                f"Error evaluating row {idx} pass {pass_num}: {e}")
+                            df_output.at[idx, prompt_accuracy_col] = 0.0
+                            df_output.at[idx,
+                                         evaluation_details_col] = f"Error: {e}"
+                            pass_results[pass_num]['total'] += 1
+
+                    # Log results for each pass
+                    for pass_num in range(pass_k):
+                        if pass_results[pass_num]['total'] > 0:
+                            correct = pass_results[pass_num]['correct']
+                            total = pass_results[pass_num]['total']
+                            accuracy = correct / total * 100
+                            logger.info(
+                                f"{dataset_name} pass {pass_num} results: {correct}/{total} correct ({accuracy:.1f}% accuracy)")
+
+            else:
+                # Sequential pass processing for non-LCB datasets
+                for pass_num in range(pass_k):
+                    suffix = f'_{pass_num}' if pass_k > 1 else ''
+                    model_output_col = f'model_output{suffix}'
+                    extracted_answer_col = f'extracted_answer{suffix}'
+                    prompt_accuracy_col = f'prompt_accuracy{suffix}'
+                    evaluation_details_col = f'evaluation_details{suffix}'
+
+                    logger.info(
+                        f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}")
+
+                    # Parse answers for all rows in this dataset for this pass
+                    for idx in group_indices:
+                        row = df_output.loc[idx]
+                        raw_output = validate_text_input(row[model_output_col])
+                        extracted = evaluator['parse'](raw_output)
+                        df_output.at[idx, extracted_answer_col] = extracted
+
+                        # Set initial evaluation details for rows without extracted
+                        # answers
+                        if extracted is None or pd.isna(extracted):
+                            df_output.at[idx,
+                                         evaluation_details_col] = "No answer extracted from model output"
+
+                    # Evaluate answers for this pass
+                    # Sequential evaluation for all non-LCB datasets
+                    correct_count = 0
+                    total_evaluated = 0
+
+                    for idx in group_indices:
+                        row = df_output.loc[idx]
+                        extracted = row[extracted_answer_col]
+                        ground_truth = row.get('ground_truth')
+
+                        if extracted is not None and not pd.isna(ground_truth):
+                            is_correct = evaluator['evaluate'](
+                                extracted, ground_truth)
+                            df_output.at[idx,
+                                         prompt_accuracy_col] = 100.0 if is_correct else 0.0
+                            total_evaluated += 1
+                            if is_correct:
+                                correct_count += 1
+
+                    # Log results for this pass
+                    if total_evaluated > 0:
+                        accuracy = correct_count / total_evaluated * 100
+                        logger.info(
+                            f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)")
+
+            # Aggregate results across all passes (take max)
+            if pass_k > 1:
+                logger.info(
+                    f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'")
+                for idx in group_indices:
+                    # Get all accuracy values for this row
+                    accuracies = []
+                    for pass_num in range(pass_k):
+                        acc = df_output.at[idx, f'prompt_accuracy_{pass_num}']
+                        accuracies.append(acc if not pd.isna(acc) else 0.0)
+
+                    # Set aggregated accuracy as max
+                    max_accuracy = max(accuracies)
+                    df_output.at[idx, 'prompt_accuracy'] = max_accuracy
+
+                    # Find which pass achieved max accuracy
+                    max_pass = accuracies.index(max_accuracy)
+                    df_output.at[idx,
+                                 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)"
+
+        return df_output
+    finally:
+        # Clean up shared LiveCodeBench executor
+        if lcb_executor is not None:
+            logger.info(
+                "Shutting down shared LiveCodeBench ProcessPoolExecutor")
+            lcb_executor.shutdown(wait=True)
+
+
+# =============================================================================
+# Unified Evaluation Utilities
+# =============================================================================
+
+def print_evaluation_results(df_evaluated: pd.DataFrame,
+                             logger: Optional[logging.Logger] = None) -> Dict[str, Any]:
+    """Print evaluation results in a unified format.
+
+    Args:
+        df_evaluated: DataFrame with evaluated results
+        logger: Optional logger instance (uses module logger if not provided);
+            the summary itself is written with print()
+
+    Returns:
+        Dictionary with evaluation statistics: 'exact_match' (mean
+        prompt_accuracy), 'tokens_per_sample', 'num-samples' and, for pass@k,
+        'pass_k' plus one 'exact_match_pass_<i>' entry per pass
+    """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+
+    # Detect pass@k
+    pass_k = detect_pass_k(df_evaluated)
+
+    # Calculate statistics
+    if pass_k > 1:
+        # For pass@k, use the aggregated prompt_accuracy (max across passes)
+        accuracy = df_evaluated['prompt_accuracy'].mean()
+
+        # Calculate average token length across all passes
+        all_output_lens = []
+        for i in range(pass_k):
+            all_output_lens.extend(
+                df_evaluated[f'tok_model_output_len_{i}'].tolist())
+        mean_output_len = float(
+            sum(all_output_lens) /
+            len(all_output_lens)) if all_output_lens else 0.0
+    else:
+        # Single pass format
+        suffix = '' if 'extracted_answer' in df_evaluated.columns else '_0'
+        accuracy = df_evaluated[f'prompt_accuracy{suffix}'].mean()
+
+        # tok_model_output_len is now a required column
+        tok_len_col = 'tok_model_output_len' if 'tok_model_output_len' in df_evaluated.columns else 'tok_model_output_len_0'
+        mean_output_len = float(df_evaluated[tok_len_col].mean())
+
+    # Use exact_match as the metric key
+    metric_key = 'exact_match'
+
+    results = {
+        metric_key: float(accuracy),
+        'tokens_per_sample': mean_output_len,
+        'num-samples': len(df_evaluated),
+    }
+
+    if pass_k > 1:
+        results['pass_k'] = pass_k
+        # Also report individual pass accuracies
+        for i in range(pass_k):
+            pass_acc = df_evaluated[f'prompt_accuracy_{i}'].mean()
+            results[f'{metric_key}_pass_{i}'] = float(pass_acc)
+
+    print("\nResults\n")
+    print(results)
+
+    # The signature and docstring promise the statistics dictionary, so hand
+    # it back instead of implicitly returning None.
+    return results
+
+
+def process_and_save_dataframe(df: pd.DataFrame,
+                               output_dir: Optional[Union[str, Path]] = None,
+                               base_filename: Optional[str] = None,
+                               num_lcb_workers: int = 64) -> Tuple[pd.DataFrame, str]:
+    """Evaluate a dataframe and pickle the evaluated copy to disk.
+
+    Args:
+        df: Input DataFrame to evaluate
+        output_dir: Directory that receives the pickle (current working
+            directory when omitted); created if it does not exist
+        base_filename: Output filename; a timestamped name is generated when
+            omitted, otherwise it is normalised to end in ``_evaluated.pkl``
+        num_lcb_workers: Maximum number of parallel workers for LiveCodeBench evaluation
+
+    Returns:
+        Tuple of (evaluated_dataframe, saved_file_path)
+    """
+    # Evaluation runs first; path handling only starts once it has succeeded
+    df_evaluated = process_dataframe(df, num_lcb_workers=num_lcb_workers)
+
+    # Resolve the target directory and make sure it exists
+    target_dir = Path.cwd() if output_dir is None else Path(output_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    # Settle on a filename that always ends with _evaluated.pkl
+    evaluated_suffix = '_evaluated.pkl'
+    if base_filename is None:
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        filename = f"results_evaluated_{timestamp}.pkl"
+    elif base_filename.endswith(evaluated_suffix):
+        filename = base_filename
+    else:
+        # Strip a plain .pkl extension before appending the evaluated suffix
+        stem = base_filename[:-4] if base_filename.endswith('.pkl') else base_filename
+        filename = stem + evaluated_suffix
+
+    output_path = target_dir / filename
+
+    # Persist the evaluated dataframe
+    with open(output_path, 'wb') as sink:
+        pickle.dump(df_evaluated, sink)
+
+    logger.info(f"Evaluated results saved to: {output_path}")
+
+    return df_evaluated, str(output_path)
+
+
+# =============================================================================
+# Main Function
+# =============================================================================
+
+def detect_file_type(file_path: Union[str, Path]) -> str:
+    """Detect whether file is MLPerf JSON or pickle format.
+
+    The extension decides when it is one of ``.json``, ``.pkl`` or
+    ``.pickle`` (case-insensitive). Otherwise the start of the file is
+    inspected: content whose first non-whitespace byte is ``[`` or ``{`` is
+    treated as JSON. Anything else, including files that cannot be opened,
+    falls back to pickle.
+
+    Args:
+        file_path: Path of the file to classify
+
+    Returns:
+        "mlperf_json" or "pickle"
+    """
+    file_path = Path(file_path)
+
+    # Check by extension first
+    suffix = file_path.suffix.lower()
+    if suffix == '.json':
+        return "mlperf_json"
+    if suffix in ('.pkl', '.pickle'):
+        return "pickle"
+
+    # Try to detect by content. Reading bytes avoids depending on a decode
+    # error to recognise binary pickles, and only I/O failures are swallowed
+    # so KeyboardInterrupt/SystemExit still propagate.
+    try:
+        with open(file_path, 'rb') as f:
+            head = f.read(64).lstrip()
+    except OSError:
+        return "pickle"
+
+    if head[:1] in (b'[', b'{'):
+        # Likely JSON
+        return "mlperf_json"
+
+    # Default to pickle
+    return "pickle"
+
+
+def main():
+    """Command-line entry point: evaluate a pickled DataFrame of model outputs.
+
+    Parses the arguments, loads the input with ``pickle``, runs
+    ``process_and_save_dataframe`` on it and prints the summary via
+    ``print_evaluation_results``. When ``--output-file`` is omitted the
+    result is written next to the input as ``<input-stem>_evaluated.pkl``.
+
+    Raises:
+        FileNotFoundError: If ``--input-file`` does not exist.
+    """
+    parser = argparse.ArgumentParser(
+        description="Evaluate model outputs - supports both pickle DataFrames and MLPerf JSON logs")
+    parser.add_argument("--input-file", required=True,
+                        help="Input file (pickle DataFrame or MLPerf JSON log)")
+    parser.add_argument(
+        "--output-file", help="Output pickle file (defaults to <input-file>_evaluated.pkl)")
+    parser.add_argument("--num-lcb-workers", type=int, default=64,
+                        help="Maximum number of parallel workers for LiveCodeBench evaluation (default: 64)")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Verbose logging")
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    if not os.path.exists(args.input_file):
+        raise FileNotFoundError(f"Input file not found: {args.input_file}")
+
+    input_path = Path(args.input_file)
+
+    # Detect file type
+    file_type = detect_file_type(input_path)
+    logger.info(f"Detected input file type: {file_type}")
+
+    # Only the pickle path is implemented below. Say so up front instead of
+    # letting a JSON input surface as an opaque unpickling error.
+    if file_type != "pickle":
+        logger.warning(
+            f"Input looks like '{file_type}', but this script only loads "
+            "pickle DataFrames; attempting to unpickle it anyway")
+
+    # Determine output file path
+    if args.output_file:
+        output_path = Path(args.output_file)
+        output_dir = output_path.parent
+        output_filename = output_path.name
+    else:
+        output_dir = input_path.parent
+        output_filename = input_path.stem + "_evaluated.pkl"
+
+    logger.info(f"Processing: {args.input_file}")
+
+    # Handle pickle DataFrame format
+    logger.info("Processing pickle DataFrame file")
+
+    # Load and process data
+    # NOTE: pickle.load executes arbitrary code embedded in the file; only
+    # run this on inputs from a trusted source.
+    with open(args.input_file, 'rb') as f:
+        df = pickle.load(f)
+
+    logger.info(f"Loaded {len(df)} rows")
+
+    # Process and save with unified function
+    df_evaluated, saved_file_path = process_and_save_dataframe(
+        df,
+        output_dir=output_dir,
+        base_filename=output_filename,
+        num_lcb_workers=args.num_lcb_workers
+    )
+
+    # Print evaluation results with unified function
+    print_evaluation_results(df_evaluated, logger)
+
+    logger.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py
new file mode 100644
index 0000000000..eeadcde15d
--- /dev/null
+++ b/language/gpt-oss/eval_mlperf_accuracy.py
@@ -0,0 +1,578 @@
+#!/usr/bin/env python3
+"""
+Evaluate MLPerf accuracy logs for gpt-oss-120b.
+
+This script takes MLPerf accuracy JSON logs and a reference pickle file,
+evaluates the outputs, and generates accuracy scores by dataset and overall.
+
+Usage:
+    python eval_mlperf_accuracy.py \
+        --mlperf-log mlperf_logs_offline_x8_acc/offline/accuracy/mlperf_log_accuracy.json \
+        --reference-data data/accuracy_eval_tokenized_filtered.pkl \
+        --output-file accuracy_results.json
+"""
+
+import argparse
+import json
+import logging
+import pickle
+import struct
+import multiprocessing
+import os
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
+
+import pandas as pd
+from transformers import AutoTokenizer
+from tqdm import tqdm
+
+# Import evaluation functions from the existing script
+import sys
+sys.path.insert(0, str(Path(__file__).parent))
+from eval_accuracy import (
+    get_evaluator, validate_dataset_name, validate_text_input, DATASET_EVALUATORS,
+    evaluate_livecodebench_worker, load_lcb_benchmark
+)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def load_mlperf_log(log_path: str) -> List[Dict[str, Any]]:
+    """Read an MLPerf accuracy log from disk and parse it as JSON.
+
+    Args:
+        log_path: Path to mlperf_log_accuracy.json
+
+    Returns:
+        List of log entries with seq_id, qsl_idx, data (hex), token_count
+    """
+    logger.info(f"Loading MLPerf log from {log_path}")
+
+    # Slurp the whole file as text, then parse it in one go
+    raw_text = Path(log_path).read_text()
+    entries = json.loads(raw_text)
+
+    logger.info(f"Loaded {len(entries)} log entries")
+    return entries
+
+
+def decode_hex_to_tokens(hex_data: str) -> List[int]:
+    """Turn a hex-encoded MLPerf payload into token IDs.
+
+    The payload is read as a packed sequence of little-endian signed
+    32-bit integers, one per token.
+
+    Args:
+        hex_data: Hex string like "450D0300..."
+
+    Returns:
+        List of token IDs
+    """
+    raw = bytes.fromhex(hex_data)
+
+    # Walk the buffer four bytes at a time; each chunk is one '<i' value
+    return [token for (token,) in struct.iter_unpack('<i', raw)]
+
+
+def detokenize(token_ids: List[int], tokenizer) -> str:
+    """Decode a sequence of token IDs back into text.
+
+    Special tokens are kept in the decoded string.
+
+    Args:
+        token_ids: List of integer token IDs
+        tokenizer: HuggingFace tokenizer
+
+    Returns:
+        Decoded text string
+    """
+    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=False)
+    return decoded_text
+
+
+def process_livecodebench_batch(
+    entries: List[Dict[str, Any]],
+    reference_df: pd.DataFrame,
+    tokenizer,
+    evaluator: Dict[str, Any],
+    lcb_executor: ProcessPoolExecutor,
+    dataset_name: str,
+    args
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Process a batch of LiveCodeBench entries in parallel.
+
+    Works in three passes: decode every entry and extract its code, submit
+    the extractable samples to ``lcb_executor``, then assemble one result
+    record per entry in the original entry order.
+
+    Samples whose evaluation has not finished when the overall wait of
+    1200 seconds elapses are recorded as incorrect with a timeout message
+    instead of aborting the whole batch.
+
+    Args:
+        entries: List of MLPerf log entries for this dataset
+        reference_df: Reference DataFrame, indexed positionally by ``qsl_idx``
+        tokenizer: HuggingFace tokenizer
+        evaluator: Evaluator functions dict (its ``'parse'`` entry is used)
+        lcb_executor: ProcessPoolExecutor for parallel evaluation
+        dataset_name: Dataset name
+        args: Command line arguments (``verbose`` and ``save_outputs`` are read)
+
+    Returns:
+        Tuple of (results_list, outputs_list); ``outputs_list`` stays empty
+        unless ``args.save_outputs`` is set
+    """
+    timeout_result = (False, "Timeout: Test execution exceeded time limit")
+
+    # First pass: decode and parse all entries
+    work_items = []
+    entry_metadata = []
+
+    logger.info(f"Parsing {len(entries)} {dataset_name} samples...")
+    for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="sample"):
+        qsl_idx = entry['qsl_idx']
+
+        ref_row = reference_df.iloc[qsl_idx]
+        ground_truth = ref_row.get('ground_truth', None)
+
+        # Decode tokens to text
+        token_ids = decode_hex_to_tokens(entry['data'])
+        model_output = detokenize(token_ids, tokenizer)
+
+        # Parse code from model output
+        extracted_code = evaluator['parse'](model_output)
+
+        entry_metadata.append({
+            'entry': entry,
+            'qsl_idx': qsl_idx,
+            'ref_row': ref_row,
+            'token_ids': token_ids,
+            'model_output': model_output,
+            'extracted_code': extracted_code,
+            'ground_truth': ground_truth
+        })
+
+        # Keep work_items index-aligned with entry_metadata: skipped samples
+        # get a None placeholder
+        if extracted_code is not None and not pd.isna(ground_truth):
+            work_items.append((extracted_code, ground_truth))
+        else:
+            work_items.append(None)
+
+    # Second pass: batch evaluate code in parallel
+    num_to_evaluate = sum(1 for item in work_items if item is not None)
+    logger.info(f"Evaluating {num_to_evaluate} {dataset_name} code samples with parallel workers...")
+
+    # Submit all work items
+    future_to_idx = {}
+    for idx, work_item in enumerate(work_items):
+        if work_item is not None:
+            future = lcb_executor.submit(evaluate_livecodebench_worker, work_item)
+            future_to_idx[future] = idx
+
+    # Collect results with progress bar
+    eval_results = [None] * len(work_items)
+
+    try:
+        for future in tqdm(as_completed(future_to_idx.keys(), timeout=1200),
+                          total=len(future_to_idx),
+                          desc=f"Evaluating {dataset_name}",
+                          unit="sample"):
+            idx = future_to_idx[future]
+            try:
+                _, is_correct, detailed_reason = future.result(timeout=80)
+                eval_results[idx] = (is_correct, detailed_reason)
+            except TimeoutError:
+                logger.warning(f"Timeout evaluating sample {idx}: Test execution exceeded 80s timeout")
+                eval_results[idx] = timeout_result
+            except Exception as e:
+                logger.error(f"Error evaluating sample {idx}: {e}")
+                eval_results[idx] = (False, f"Error: {e}")
+    except TimeoutError:
+        # as_completed() raises from the iterator itself once the overall
+        # deadline passes; keep what finished and give up on the rest.
+        unfinished = [f for f in future_to_idx if not f.done()]
+        logger.warning(
+            f"Timed out waiting for {len(unfinished)} {dataset_name} evaluations; "
+            "marking them as incorrect")
+        for future in unfinished:
+            # Only futures that have not started yet can actually be cancelled
+            future.cancel()
+
+    # Every submitted sample must end up with a verdict before compilation
+    for idx in future_to_idx.values():
+        if eval_results[idx] is None:
+            eval_results[idx] = timeout_result
+
+    # Third pass: compile final results
+    results_list = []
+    outputs_list = []
+
+    for idx, metadata in enumerate(entry_metadata):
+        entry = metadata['entry']
+        qsl_idx = metadata['qsl_idx']
+        token_ids = metadata['token_ids']
+        model_output = metadata['model_output']
+        extracted_code = metadata['extracted_code']
+        ground_truth = metadata['ground_truth']
+
+        # Get evaluation result
+        if extracted_code is None or pd.isna(ground_truth):
+            is_correct = False
+            eval_details = "No code extracted from model output" if extracted_code is None else "No ground truth available"
+        else:
+            is_correct, eval_details = eval_results[idx]
+
+        # Record result (extracted code is truncated to keep the record small)
+        result = {
+            'seq_id': entry['seq_id'],
+            'qsl_idx': qsl_idx,
+            'dataset': dataset_name,
+            'is_correct': is_correct,
+            'extracted_answer': str(extracted_code)[:200] if extracted_code is not None else None,
+            'ground_truth': str(ground_truth) if not pd.isna(ground_truth) else None,
+            'evaluation_details': eval_details,
+            'token_count': len(token_ids),
+            'model_output_preview': model_output[:200] if args.verbose else None
+        }
+        results_list.append(result)
+
+        # Store output data if requested
+        if args.save_outputs:
+            output_record = {
+                'qsl_idx': qsl_idx,
+                'seq_id': entry['seq_id'],
+                'dataset': dataset_name,
+                'ground_truth': ground_truth,
+                'model_output': model_output,
+                'output_token_ids': token_ids,
+                'extracted_answer': extracted_code,
+                'is_correct': is_correct,
+                'evaluation_details': eval_details
+            }
+            outputs_list.append(output_record)
+
+    return results_list, outputs_list
+
+
+def evaluate_single_entry(
+    model_output: str,
+    ground_truth: str,
+    dataset_name: str
+) -> Tuple[bool, Any, str]:
+    """Score one generated answer against its reference.
+
+    Args:
+        model_output: Generated text from model
+        ground_truth: Expected answer
+        dataset_name: Dataset name (e.g., 'gpqa', 'math500')
+
+    Returns:
+        Tuple of (is_correct, extracted_answer, evaluation_details)
+    """
+    evaluator = get_evaluator(dataset_name)
+
+    # Pull the candidate answer out of the raw generation
+    extracted = evaluator['parse'](model_output)
+
+    # Nothing to score without an extracted answer
+    if extracted is None or pd.isna(extracted):
+        return False, extracted, "No answer extracted from model output"
+
+    # Nothing to score against without a reference
+    if pd.isna(ground_truth):
+        return False, extracted, "No ground truth available"
+
+    verdict = False
+    try:
+        verdict = evaluator['evaluate'](extracted, ground_truth)
+        details = (
+            "Correct" if verdict
+            else f"Incorrect (extracted: {extracted}, ground_truth: {ground_truth})"
+        )
+    except Exception as err:
+        # An evaluator failure counts as a miss but never aborts the run
+        details = f"Evaluation error: {err}"
+        logger.warning(f"Error evaluating: {err}")
+
+    return verdict, extracted, details
+
+
+def main():
+    """Command-line entry point: score an MLPerf accuracy log.
+
+    Steps, in order:
+      1. Parse arguments and load the MLPerf JSON log, the reference pickle
+         and the tokenizer.
+      2. Group log entries by the dataset of the reference row found at
+         position ``qsl_idx``.
+      3. Evaluate each dataset: LiveCodeBench entries go through
+         ``process_livecodebench_batch`` when the process pool could be
+         created, everything else is evaluated sequentially with
+         ``evaluate_single_entry``.
+      4. Print overall and per-dataset accuracy, and optionally write the
+         detokenized outputs (``--save-outputs``) and a JSON summary
+         (``--output-file``).
+    """
+    parser = argparse.ArgumentParser(
+        description="Evaluate MLPerf accuracy logs for gpt-oss-120b"
+    )
+    parser.add_argument(
+        "--mlperf-log",
+        type=str,
+        required=True,
+        help="Path to mlperf_log_accuracy.json"
+    )
+    parser.add_argument(
+        "--reference-data",
+        type=str,
+        required=True,
+        help="Path to reference pickle file (DataFrame with dataset, ground_truth, etc.)"
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default="openai/gpt-oss-120b",
+        help="HuggingFace tokenizer name or path"
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        default=None,
+        help="Output JSON file for results (optional)"
+    )
+    # --save-outputs is a path, not a flag: any non-empty value enables saving
+    parser.add_argument(
+        "--save-outputs",
+        type=str,
+        default=None,
+        help="Save detokenized outputs to pickle file (ordered by qsl_idx) for debugging"
+    )
+    parser.add_argument(
+        "--num-lcb-workers",
+        type=int,
+        default=64,
+        help="Number of parallel workers for LiveCodeBench evaluation (default: 64)"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Verbose logging"
+    )
+    
+    args = parser.parse_args()
+    
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
+    # Load MLPerf log
+    mlperf_log = load_mlperf_log(args.mlperf_log)
+    
+    # Load reference data
+    # NOTE: pickle.load executes arbitrary code embedded in the file; only
+    # use reference data from a trusted source.
+    logger.info(f"Loading reference data from {args.reference_data}")
+    with open(args.reference_data, 'rb') as f:
+        reference_df = pickle.load(f)
+    
+    logger.info(f"Reference data shape: {reference_df.shape}")
+    logger.info(f"Reference columns: {list(reference_df.columns)}")
+    
+    # Log unique datasets in reference data
+    # (purely informational; nothing computed in this block is reused below)
+    if 'dataset' in reference_df.columns:
+        unique_datasets = reference_df['dataset'].unique()
+        dataset_counts = reference_df['dataset'].value_counts()
+        logger.info(f"Unique datasets in reference data ({len(unique_datasets)} total):")
+        for ds in sorted(unique_datasets):
+            logger.info(f"  '{ds}' ({dataset_counts[ds]} samples)")
+        
+        logger.info("\nSample rows from reference data:")
+        for idx in [0, 1, 2]:
+            if idx < len(reference_df):
+                logger.info(f"  Row {idx}: dataset='{reference_df.iloc[idx]['dataset']}'")
+        
+        # Show how each will be mapped to evaluators
+        logger.info("\nExpected Dataset → Evaluator mapping:")
+        for ds in sorted(unique_datasets):
+            try:
+                ds_lower = validate_dataset_name(ds)
+                # Find which evaluator key matches
+                # (first key that is a substring of the normalized name)
+                matched_key = None
+                for key in DATASET_EVALUATORS.keys():
+                    if key in ds_lower:
+                        matched_key = key
+                        break
+                logger.info(f"  '{ds}' (normalized: '{ds_lower}') → '{matched_key}'")
+            except Exception as e:
+                logger.warning(f"  '{ds}' → ERROR: {e}")
+    
+    # Load tokenizer
+    logger.info(f"Loading tokenizer: {args.tokenizer}")
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+    
+    # Group MLPerf log entries by dataset
+    logger.info("Grouping MLPerf log entries by dataset...")
+    dataset_entries = defaultdict(list)
+    
+    for entry in mlperf_log:
+        qsl_idx = entry['qsl_idx']
+        # Entries pointing past the end of the reference data are skipped.
+        # NOTE(review): negative qsl_idx values are not rejected here.
+        if qsl_idx >= len(reference_df):
+            logger.warning(f"qsl_idx {qsl_idx} out of range (max: {len(reference_df)-1})")
+            continue
+        
+        ref_row = reference_df.iloc[qsl_idx]
+        dataset_name = validate_dataset_name(ref_row['dataset'])
+        dataset_entries[dataset_name].append(entry)
+    
+    logger.info(f"Grouped entries by dataset:")
+    for ds_name, entries in sorted(dataset_entries.items()):
+        logger.info(f"  {ds_name}: {len(entries)} samples")
+    
+    # Pre-load LiveCodeBench benchmark if needed
+    # lcb_executor stays None when no LiveCodeBench entries exist or when the
+    # setup below fails; such datasets then use the sequential path.
+    lcb_executor = None
+    if any('livecodebench' in ds for ds in dataset_entries.keys()):
+        try:
+            logger.info("Pre-loading LiveCodeBench benchmark for parallel evaluation...")
+            os.environ['TQDM_DISABLE'] = '1'  # Disable tqdm in workers
+            _ = load_lcb_benchmark()
+            logger.info("LiveCodeBench benchmark loaded successfully")
+            
+            # Create shared ProcessPoolExecutor for all LCB evaluations
+            max_workers = min(multiprocessing.cpu_count(), args.num_lcb_workers)
+            lcb_executor = ProcessPoolExecutor(max_workers=max_workers)
+            logger.info(f"Created ProcessPoolExecutor with {max_workers} workers for LiveCodeBench")
+        except Exception as e:
+            logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}")
+            logger.warning("LiveCodeBench evaluation may be slower")
+    
+    # Process each dataset separately with its own progress bar
+    logger.info("\nProcessing MLPerf log entries by dataset...")
+    
+    results = []
+    dataset_stats = defaultdict(lambda: {"correct": 0, "total": 0})
+    outputs_data = []  # For saving detokenized outputs
+    
+    try:
+        for dataset_name in sorted(dataset_entries.keys()):
+            entries = dataset_entries[dataset_name]
+            logger.info(f"\n{'=' * 80}")
+            logger.info(f"Processing {dataset_name}: {len(entries)} samples")
+            logger.info(f"{'=' * 80}")
+            
+            evaluator = get_evaluator(dataset_name)
+            is_livecodebench = 'livecodebench' in dataset_name.lower()
+            
+            if is_livecodebench and lcb_executor is not None:
+                # Batched LiveCodeBench evaluation
+                results_batch, outputs_batch = process_livecodebench_batch(
+                    entries, reference_df, tokenizer, evaluator,
+                    lcb_executor, dataset_name, args
+                )
+                results.extend(results_batch)
+                if args.save_outputs:
+                    outputs_data.extend(outputs_batch)
+                
+                # Update stats
+                for res in results_batch:
+                    dataset_stats[dataset_name]["total"] += 1
+                    if res['is_correct']:
+                        dataset_stats[dataset_name]["correct"] += 1
+            else:
+                # Sequential evaluation for non-LCB datasets
+                # (also reached for LiveCodeBench when no executor was created)
+                for entry in tqdm(entries, desc=f"Evaluating {dataset_name}", unit="sample"):
+                    seq_id = entry['seq_id']
+                    qsl_idx = entry['qsl_idx']
+                    hex_data = entry['data']
+                    
+                    ref_row = reference_df.iloc[qsl_idx]
+                    ground_truth = ref_row.get('ground_truth', None)
+                    
+                    # Decode tokens to text
+                    token_ids = decode_hex_to_tokens(hex_data)
+                    model_output = detokenize(token_ids, tokenizer)
+                    
+                    # Evaluate
+                    # Any failure is recorded as an incorrect sample so one bad
+                    # entry cannot abort the whole run
+                    try:
+                        is_correct, extracted, eval_details = evaluate_single_entry(
+                            model_output, ground_truth, dataset_name
+                        )
+                    except Exception as e:
+                        logger.warning(f"Evaluation error for qsl_idx={qsl_idx}, dataset={dataset_name}: {e}")
+                        is_correct = False
+                        extracted = None
+                        eval_details = f"Evaluation error: {e}"
+                    
+                    # Record result
+                    result = {
+                        'seq_id': seq_id,
+                        'qsl_idx': qsl_idx,
+                        'dataset': dataset_name,
+                        'is_correct': is_correct,
+                        'extracted_answer': str(extracted) if extracted is not None else None,
+                        'ground_truth': str(ground_truth) if not pd.isna(ground_truth) else None,
+                        'evaluation_details': eval_details,
+                        'token_count': len(token_ids),
+                        'model_output_preview': model_output[:200] if args.verbose else None
+                    }
+                    results.append(result)
+                    
+                    # Store output data for pickle export
+                    if args.save_outputs:
+                        output_record = {
+                            'qsl_idx': qsl_idx,
+                            'seq_id': seq_id,
+                            'dataset': dataset_name,
+                            'ground_truth': ground_truth,
+                            'model_output': model_output,
+                            'output_token_ids': token_ids,
+                            'extracted_answer': extracted,
+                            'is_correct': is_correct,
+                            'evaluation_details': eval_details
+                        }
+                        outputs_data.append(output_record)
+                    
+                    # Update stats
+                    dataset_stats[dataset_name]["total"] += 1
+                    if is_correct:
+                        dataset_stats[dataset_name]["correct"] += 1
+    
+    finally:
+        # Clean up LiveCodeBench executor
+        # NOTE(review): TQDM_DISABLE is only removed when an executor exists,
+        # so it stays set if the pre-load step above failed.
+        if lcb_executor is not None:
+            logger.info("Shutting down LiveCodeBench ProcessPoolExecutor")
+            lcb_executor.shutdown(wait=True)
+            os.environ.pop('TQDM_DISABLE', None)
+    
+    # Calculate overall stats
+    # Accuracies are percentages; an empty run reports 0.0 rather than dividing by zero
+    total_correct = sum(stats["correct"] for stats in dataset_stats.values())
+    total_samples = sum(stats["total"] for stats in dataset_stats.values())
+    overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0
+    
+    # Print results
+    print("\n" + "=" * 80)
+    print("MLPerf Accuracy Evaluation Results")
+    print("=" * 80)
+    print(f"Total samples: {total_samples}")
+    print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})")
+    print("=" * 80)
+    print("\nPer-Dataset Breakdown:")
+    print("-" * 80)
+    
+    for dataset_name in sorted(dataset_stats.keys()):
+        stats = dataset_stats[dataset_name]
+        accuracy = (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0
+        print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})")
+    
+    print("=" * 80)
+    
+    # Save detokenized outputs to pickle if requested
+    if args.save_outputs:
+        logger.info(f"Saving detokenized outputs to {args.save_outputs}...")
+        
+        # Sort by qsl_idx for ordered output
+        outputs_data_sorted = sorted(outputs_data, key=lambda x: x['qsl_idx'])
+        
+        # Convert to DataFrame for easier inspection
+        outputs_df = pd.DataFrame(outputs_data_sorted)
+        
+        output_path = Path(args.save_outputs)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        
+        with open(output_path, 'wb') as f:
+            pickle.dump(outputs_df, f)
+        
+        logger.info(f"Saved {len(outputs_df)} detokenized outputs (ordered by qsl_idx) to: {output_path}")
+        logger.info(f"Columns: {list(outputs_df.columns)}")
+    
+    # Save detailed results if requested
+    # The per-sample list is only included with --verbose; otherwise the
+    # "detailed_results" key is written as null.
+    if args.output_file:
+        output_data = {
+            "summary": {
+                "total_samples": total_samples,
+                "total_correct": total_correct,
+                "overall_accuracy": overall_accuracy,
+                "per_dataset": {
+                    dataset: {
+                        "correct": stats["correct"],
+                        "total": stats["total"],
+                        "accuracy": (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0
+                    }
+                    for dataset, stats in dataset_stats.items()
+                }
+            },
+            "detailed_results": results if args.verbose else None
+        }
+        
+        output_path = Path(args.output_file)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        
+        with open(output_path, 'w') as f:
+            json.dump(output_data, f, indent=2)
+        
+        logger.info(f"Results saved to: {output_path}")
+    
+    logger.info("Evaluation complete!")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss/eval_mlperf_performance.py
new file mode 100755
index 0000000000..5ef2b90afe
--- /dev/null
+++ b/language/gpt-oss/eval_mlperf_performance.py
@@ -0,0 +1,630 @@
+#!/usr/bin/env python3
+"""Evaluate MLPerf performance logs and analyze output token lengths.
+
+This script reads MLPerf accuracy logs (mlperf_log_accuracy.json) and
+detokenizes the hex-encoded token IDs to produce human-readable text output.
+Optionally includes input prompts and reference data from a pickle file,
+and generates histogram plots for token length analysis.
+
+Usage:
+    # Basic usage (outputs only)
+    python eval_mlperf_performance.py \
+        --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \
+        --output-file detokenized_outputs.json \
+        --tokenizer openai/gpt-oss-120b
+
+    # With reference data (includes inputs and metadata)
+    python eval_mlperf_performance.py \
+        --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \
+        --output-file detokenized_outputs.json \
+        --reference-data data/accuracy_eval_tokenized_filtered.pkl \
+        --tokenizer openai/gpt-oss-120b
+
+    # With histogram plots (enables plotting when --plot-dir is specified)
+    python eval_mlperf_performance.py \
+        --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \
+        --output-file detokenized_outputs.json \
+        --reference-data data/accuracy_eval_tokenized_filtered.pkl \
+        --plot-dir plots
+
+The output JSON format (with reference data):
+    [
+        {
+            "qsl_idx": 0,
+            "token_ids": [1, 2, 3, ...],
+            "text": "detokenized response text",
+            "num_tokens": 150,
+            "dataset": "gpqa",
+            "input_prompt": "Question: ...",
+            "input_token_ids": [...],
+            "num_input_tokens": 1024,
+            "ground_truth": "Answer"
+        },
+        ...
+    ]
+"""
+
+from tqdm import tqdm
+from transformers import AutoTokenizer
+import matplotlib.pyplot as plt
+import argparse
+import json
+import logging
+import pickle
+import sys
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')  # Non-interactive backend for server environments
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line interface and parse ``sys.argv``.
+
+    Returns:
+        Namespace exposing ``mlperf_log``, ``output_file``,
+        ``reference_data``, ``tokenizer``, ``pretty`` and ``plot_dir``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Parse MLPerf accuracy JSON and detokenize responses")
+
+    # Mandatory paths: the log to read and the JSON file to write.
+    required_paths = (
+        ("--mlperf-log", "Path to mlperf_log_accuracy.json file"),
+        ("--output-file",
+         "Path to output JSON file with detokenized responses"),
+    )
+    for flag, help_text in required_paths:
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+
+    # Optional string options; registration order fixes the --help order.
+    cli.add_argument(
+        "--reference-data",
+        default=None,
+        type=str,
+        help="Path to reference pickle file (DataFrame with prompts, dataset, etc.) - optional",
+    )
+    cli.add_argument(
+        "--tokenizer",
+        default="openai/gpt-oss-120b",
+        type=str,
+        help="Tokenizer to use for detokenization (default: openai/gpt-oss-120b)",
+    )
+
+    # Boolean switch: absent means compact JSON.
+    cli.add_argument(
+        "--pretty",
+        action="store_true",
+        help="Pretty-print the output JSON with indentation",
+    )
+
+    cli.add_argument(
+        "--plot-dir",
+        default=None,
+        type=str,
+        help="Directory to save histogram plots (enables plotting if specified)",
+    )
+
+    return cli.parse_args()
+
+
+def decode_hex_to_tokens(hex_string: str) -> List[int]:
+    """Turn a hex-encoded MLPerf payload into a list of token IDs.
+
+    The payload is read as consecutive 4-byte little-endian signed
+    integers, one per token. Surrounding whitespace is ignored, and any
+    trailing bytes that do not fill a whole 4-byte word are dropped.
+
+    Args:
+        hex_string: Hex-encoded string from MLPerf log
+
+    Returns:
+        List of token IDs
+
+    Raises:
+        ValueError: If the string is not valid hexadecimal.
+    """
+    word_size = 4  # bytes per int32 token
+    raw = bytes.fromhex(hex_string.strip())
+
+    # Only whole words are decoded; an incomplete tail is ignored.
+    usable = len(raw) - len(raw) % word_size
+
+    tokens = [
+        int.from_bytes(
+            raw[start:start + word_size], byteorder='little', signed=True)
+        for start in range(0, usable, word_size)
+    ]
+    return tokens
+
+
+def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]:
+    """Parse MLPerf accuracy log file.
+
+    Handles multiple formats:
+    - JSON array: [{"qsl_idx": 0, ...}, ...]
+    - Single JSON object, possibly spanning several lines
+    - JSONL: one JSON object per line
+    - Concatenated JSON: multiple JSON objects on same line
+
+    Text that cannot be parsed is skipped with a warning naming the line.
+
+    Args:
+        log_path: Path to mlperf_log_accuracy.json
+
+    Returns:
+        List of entries with qsl_idx and hex-encoded data
+    """
+    logger.info(f"Reading MLPerf log: {log_path}")
+
+    # First try to load the whole file as one JSON document
+    try:
+        with open(log_path, 'r') as f:
+            log_data = json.load(f)
+        if isinstance(log_data, list):
+            logger.info(f"Loaded {len(log_data)} entries as JSON array")
+            return log_data
+        if isinstance(log_data, dict):
+            # A lone object is a one-entry log, even when pretty-printed
+            logger.info("Loaded 1 entry as single JSON object")
+            return [log_data]
+    except json.JSONDecodeError:
+        pass  # Not a single JSON document, try line-by-line parsing
+
+    # Parse line by line (JSONL or concatenated JSON)
+    entries = []
+    decoder = json.JSONDecoder()
+    with open(log_path, 'r') as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Peel JSON values off the line one at a time; a line holding
+            # a single object is just the one-iteration case
+            pos, parsed_count = 0, 0
+            while pos < len(line):
+                try:
+                    obj, pos = decoder.raw_decode(line, pos)
+                except json.JSONDecodeError as e:
+                    # Warn even after partial success so that dropped
+                    # trailing data is never silent
+                    logger.warning(
+                        f"Line {line_num}: Could not parse JSON after {parsed_count} object(s): {e}")
+                    break
+                entries.append(obj)
+                parsed_count += 1
+                # raw_decode does not skip leading whitespace itself
+                while pos < len(line) and line[pos].isspace():
+                    pos += 1
+
+    logger.info(f"Loaded {len(entries)} entries from MLPerf log")
+    return entries
+
+
+def plot_histograms(
+    results: List[Dict[str, Any]],
+    output_dir: str,
+    has_reference: bool = False
+) -> None:
+    """Generate histogram plots for output token lengths and differences.
+
+    Writes ``output_token_length_histogram.png`` and, when ``has_reference``
+    is set and results carry length differences, also writes
+    ``token_length_difference_histogram.png`` and
+    ``token_length_comparison.png``. Nothing is plotted for empty results.
+
+    Args:
+        results: List of parsed results with token lengths
+        output_dir: Directory to save plots
+        has_reference: Whether reference data is available for difference plots
+    """
+    # Function-scope import keeps the file's import block untouched
+    from statistics import median
+
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"Generating histogram plots in {output_dir}...")
+
+    # Extract output token lengths
+    output_lengths = [r['num_tokens'] for r in results]
+    if not output_lengths:
+        # The mean/std below would divide by zero on an empty sample
+        logger.warning("No results to plot; skipping histogram generation")
+        return
+
+    # Compute the mean once; re-deriving it for every term of the variance
+    # sum made the standard deviation quadratic in the sample count
+    n_out = len(output_lengths)
+    mean_out = sum(output_lengths) / n_out
+    std_out = (sum((x - mean_out) ** 2 for x in output_lengths) / n_out) ** 0.5
+
+    # Plot 1: Output Sequence Length (OSL) Histogram
+    plt.figure(figsize=(12, 6))
+    plt.hist(output_lengths, bins=50, edgecolor='black', alpha=0.7,
+             color='steelblue')
+    plt.xlabel('Output Token Length (OSL)', fontsize=12)
+    plt.ylabel('Frequency', fontsize=12)
+    plt.title(
+        f'Distribution of Output Token Lengths\n(n={n_out}, mean={mean_out:.1f}, median={median(output_lengths)})',
+        fontsize=14)
+    plt.grid(axis='y', alpha=0.3)
+
+    # Add statistics box
+    stats_text = f'Min: {min(output_lengths)}\nMax: {max(output_lengths)}\nStd: {std_out:.1f}'
+    plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes,
+             fontsize=10, verticalalignment='top', horizontalalignment='right',
+             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
+
+    osl_plot_path = output_path / 'output_token_length_histogram.png'
+    plt.tight_layout()
+    plt.savefig(osl_plot_path, dpi=300, bbox_inches='tight')
+    plt.close()
+    logger.info(f"✓ Saved OSL histogram: {osl_plot_path}")
+
+    # Plot 2: Token Length Difference Histogram (if reference data available)
+    if has_reference:
+        results_with_diff = [
+            r for r in results if 'output_token_len_diff' in r]
+        if results_with_diff:
+            differences = [r['output_token_len_diff']
+                           for r in results_with_diff]
+            n_diff = len(differences)
+            mean_diff = sum(differences) / n_diff
+            std_diff = (sum((x - mean_diff) ** 2 for x in differences) / n_diff) ** 0.5
+
+            plt.figure(figsize=(12, 6))
+            plt.hist(differences, bins=50, edgecolor='black', alpha=0.7,
+                     color='coral')
+            plt.xlabel('Token Length Difference (Actual - Reference)', fontsize=12)
+            plt.ylabel('Frequency', fontsize=12)
+            # statistics.median averages the two middle values for even n
+            plt.title(
+                f'Distribution of Output Token Length Differences\n(n={n_diff}, mean={mean_diff:.1f}, median={median(differences)})',
+                fontsize=14)
+            plt.grid(axis='y', alpha=0.3)
+            plt.axvline(x=0, color='red', linestyle='--', linewidth=2,
+                        label='Zero difference')
+
+            # Add statistics box
+            longer = sum(1 for d in differences if d > 0)
+            shorter = sum(1 for d in differences if d < 0)
+            exact = sum(1 for d in differences if d == 0)
+            stats_text = f'Min: {min(differences)}\nMax: {max(differences)}\nStd: {std_diff:.1f}\n\nLonger: {longer} ({longer/n_diff*100:.1f}%)\nShorter: {shorter} ({shorter/n_diff*100:.1f}%)\nExact: {exact} ({exact/n_diff*100:.1f}%)'
+            plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes,
+                     fontsize=9, verticalalignment='top', horizontalalignment='right',
+                     bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))
+
+            plt.legend()
+
+            diff_plot_path = output_path / 'token_length_difference_histogram.png'
+            plt.tight_layout()
+            plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight')
+            plt.close()
+            logger.info(f"✓ Saved difference histogram: {diff_plot_path}")
+
+            # Plot 3: Combined comparison (side by side)
+            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+
+            # Reference vs Actual
+            ref_lengths = [r['ref_num_output_tokens']
+                           for r in results_with_diff]
+            actual_lengths = [r['actual_num_output_tokens']
+                              for r in results_with_diff]
+
+            ax1.hist([ref_lengths, actual_lengths], bins=50, label=['Reference', 'Actual'],
+                     alpha=0.6, edgecolor='black', color=['steelblue', 'coral'])
+            ax1.set_xlabel('Output Token Length', fontsize=12)
+            ax1.set_ylabel('Frequency', fontsize=12)
+            ax1.set_title(
+                f'Reference vs Actual Output Token Lengths\n(n={len(results_with_diff)})',
+                fontsize=13)
+            ax1.legend()
+            ax1.grid(axis='y', alpha=0.3)
+
+            # Scatter plot: Reference vs Actual
+            ax2.scatter(ref_lengths, actual_lengths, alpha=0.4, s=10,
+                        color='purple')
+            ax2.plot([min(ref_lengths), max(ref_lengths)], [min(ref_lengths), max(ref_lengths)],
+                     'r--', linewidth=2, label='y=x (perfect match)')
+            ax2.set_xlabel('Reference Token Length', fontsize=12)
+            ax2.set_ylabel('Actual Token Length', fontsize=12)
+            ax2.set_title('Reference vs Actual Token Lengths (Scatter)',
+                          fontsize=13)
+            ax2.legend()
+            ax2.grid(alpha=0.3)
+
+            comparison_plot_path = output_path / 'token_length_comparison.png'
+            plt.tight_layout()
+            plt.savefig(comparison_plot_path, dpi=300, bbox_inches='tight')
+            plt.close()
+            logger.info(f"✓ Saved comparison plot: {comparison_plot_path}")
+        else:
+            logger.warning("No samples with token length differences found")
+
+    logger.info(f"✓ All plots saved to {output_dir}/")
+
+
+def _to_jsonable(value: Any) -> Any:
+    """Return ``value.tolist()`` when available (NumPy scalars/arrays), else value."""
+    return value.tolist() if hasattr(value, 'tolist') else value
+
+
+def detokenize_responses(
+    entries: List[Dict[str, Any]],
+    tokenizer: Any,
+    reference_df: Optional[pd.DataFrame] = None
+) -> List[Dict[str, Any]]:
+    """Detokenize responses from MLPerf log entries.
+
+    When reference data is provided, input_prompt is generated by detokenizing
+    input token IDs from the reference data (checks: tok_input, input_token_ids,
+    input_tokens, tokenized_input). This shows exactly what was sent to the model
+    (after tokenization), not the original text prompt.
+
+    Reference rows are looked up by position (``iloc[qsl_idx]``); entries with a
+    missing or out-of-range qsl_idx get no reference fields. Copied reference
+    values go through ``tolist()`` where available so json.dump accepts them.
+
+    Args:
+        entries: List of MLPerf log entries with hex-encoded token IDs
+        tokenizer: HuggingFace tokenizer instance
+        reference_df: Optional reference DataFrame with input prompts and metadata
+
+    Returns:
+        List of dictionaries with qsl_idx, token_ids, and detokenized text
+    """
+    logger.info("Detokenizing responses...")
+    n_refs = len(reference_df) if reference_df is not None else 0
+
+    results = []
+    for entry in tqdm(entries, desc="Detokenizing", unit="response"):
+        qsl_idx = entry.get("qsl_idx")
+        # Decode hex to token IDs
+        try:
+            token_ids = decode_hex_to_tokens(entry.get("data", ""))
+        except Exception as e:
+            logger.error(f"Error decoding tokens for qsl_idx={qsl_idx}: {e}")
+            token_ids = []
+
+        # Detokenize to text
+        try:
+            text = tokenizer.decode(token_ids, skip_special_tokens=True)
+        except Exception as e:
+            logger.error(f"Error detokenizing qsl_idx={qsl_idx}: {e}")
+            text = ""
+
+        result = {
+            "qsl_idx": qsl_idx,
+            "token_ids": token_ids,
+            "text": text,
+            "num_tokens": len(token_ids)
+        }
+
+        # Add reference data if available. A missing qsl_idx (None) cannot be
+        # compared and a negative one would index from the end of the frame.
+        if isinstance(qsl_idx, int) and 0 <= qsl_idx < n_refs:
+            ref_row = reference_df.iloc[qsl_idx]
+
+            if 'dataset' in ref_row:
+                result['dataset'] = _to_jsonable(ref_row['dataset'])
+
+            # Input token IDs may live under any of several column names
+            input_token_ids = None
+            for field in ['tok_input', 'input_token_ids',
+                          'input_tokens', 'tokenized_input']:
+                if field in ref_row:
+                    input_token_ids = _to_jsonable(ref_row[field])
+                    break
+
+            if input_token_ids is not None:
+                result['input_token_ids'] = input_token_ids
+                if isinstance(input_token_ids, list):
+                    result['num_input_tokens'] = len(input_token_ids)
+                    # Detokenize to show what was actually sent to the model
+                    try:
+                        result['input_prompt'] = tokenizer.decode(
+                            input_token_ids, skip_special_tokens=False)
+                    except Exception as e:
+                        logger.warning(
+                            f"Error detokenizing input tokens for qsl_idx={qsl_idx}: {e}")
+                        result['input_prompt'] = None
+                else:
+                    result['num_input_tokens'] = None
+                    result['input_prompt'] = None
+            else:
+                # Fallback to raw prompt field if input token IDs not available
+                for field in ['prompt', 'input_text', 'text']:
+                    if field in ref_row:
+                        result['input_prompt'] = _to_jsonable(ref_row[field])
+                        break
+
+            if 'ground_truth' in ref_row:
+                result['ground_truth'] = _to_jsonable(ref_row['ground_truth'])
+
+            # Reference output length: a token list (use its length) or a count
+            ref_output_len = None
+            for field in ['output_token_ids', 'target_token_ids',
+                          'output_tokens', 'expected_output_token_ids']:
+                if field in ref_row:
+                    ref_tokens = _to_jsonable(ref_row[field])
+                    if isinstance(ref_tokens, list):
+                        ref_output_len = len(ref_tokens)
+                        result['ref_output_token_ids'] = ref_tokens
+                        break
+                    elif isinstance(ref_tokens, (int, float)) and not pd.isna(ref_tokens):
+                        ref_output_len = int(ref_tokens)
+                        break
+
+            # Also check for direct length field
+            if ref_output_len is None:
+                for field in ['output_len', 'output_length',
+                              'num_output_tokens', 'target_len']:
+                    if field in ref_row and not pd.isna(ref_row[field]):
+                        ref_output_len = int(ref_row[field])
+                        break
+
+            if ref_output_len is not None:
+                actual_output_len = len(token_ids)
+                result['ref_num_output_tokens'] = ref_output_len
+                result['actual_num_output_tokens'] = actual_output_len
+                result['output_token_len_diff'] = actual_output_len - ref_output_len
+                result['output_token_len_ratio'] = actual_output_len / \
+                    ref_output_len if ref_output_len > 0 else None
+
+            # Add any other columns that might be useful
+            for col in ['question_id', 'difficulty', 'subject', 'category']:
+                if col in ref_row:
+                    result[col] = _to_jsonable(ref_row[col])
+
+        results.append(result)
+
+    return results
+
+
+def main():
+    """Run the CLI: parse the log, detokenize, write JSON, optionally plot.
+
+    Exits with status 1 when an input cannot be loaded or the log is empty.
+    """
+    args = parse_args()
+
+    # Validate input file exists
+    log_path = Path(args.mlperf_log)
+    if not log_path.exists():
+        logger.error(f"MLPerf log file not found: {args.mlperf_log}")
+        sys.exit(1)
+
+    logger.info("=" * 80)
+    logger.info("MLPerf Accuracy Log Parser")
+    logger.info("=" * 80)
+    logger.info(f"Input log: {args.mlperf_log}")
+    logger.info(f"Output file: {args.output_file}")
+    logger.info(
+        f"Reference data: {args.reference_data if args.reference_data else 'None (outputs only)'}")
+    logger.info(f"Tokenizer: {args.tokenizer}")
+    logger.info("=" * 80)
+
+    # Load reference data if provided
+    reference_df = None
+    if args.reference_data:
+        logger.info(f"Loading reference data from {args.reference_data}")
+        try:
+            # SECURITY: unpickling executes code; only pass trusted files
+            with open(args.reference_data, 'rb') as f:
+                reference_df = pickle.load(f)
+            logger.info(f"✓ Reference data loaded: {reference_df.shape}")
+            logger.info(f"  Columns: {list(reference_df.columns)}")
+        except Exception as e:
+            logger.error(f"Failed to load reference data: {e}")
+            sys.exit(1)
+
+    # Load tokenizer
+    logger.info(f"Loading tokenizer: {args.tokenizer}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+        logger.info("✓ Tokenizer loaded successfully")
+    except Exception as e:
+        logger.error(f"Failed to load tokenizer: {e}")
+        sys.exit(1)
+
+    # Parse MLPerf log
+    try:
+        entries = parse_mlperf_log(args.mlperf_log)
+    except Exception as e:
+        logger.error(f"Failed to parse MLPerf log: {e}")
+        sys.exit(1)
+
+    if not entries:
+        logger.error("No entries found in MLPerf log")
+        sys.exit(1)
+
+    # Detokenize responses
+    try:
+        results = detokenize_responses(entries, tokenizer, reference_df)
+    except Exception as e:
+        logger.error(f"Failed to detokenize responses: {e}")
+        sys.exit(1)
+
+    # Write output JSON
+    logger.info(f"Writing detokenized outputs to: {args.output_file}")
+    output_path = Path(args.output_file)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # ensure_ascii=False emits non-ASCII text: pin UTF-8, not the locale
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(results, f, indent=2 if args.pretty else None,
+                  ensure_ascii=False)
+
+    logger.info("=" * 80)
+    logger.info("✓ Parsing completed successfully")
+    logger.info("=" * 80)
+    logger.info(f"Total responses parsed: {len(results)}")
+
+    # Print statistics
+    total_tokens = sum(r["num_tokens"] for r in results)
+    avg_tokens = total_tokens / len(results) if results else 0
+    logger.info(f"Total output tokens: {total_tokens:,}")
+    logger.info(f"Average tokens per response: {avg_tokens:.1f}")
+
+    # Print token length difference statistics if reference data was provided
+    if reference_df is not None:
+        results_with_diff = [
+            r for r in results if 'output_token_len_diff' in r]
+        if results_with_diff:
+            diffs = [r['output_token_len_diff'] for r in results_with_diff]
+            ratios = [r['output_token_len_ratio']
+                      for r in results_with_diff if r['output_token_len_ratio'] is not None]
+
+            logger.info(
+                f"\nOutput Token Length Analysis ({len(results_with_diff)} samples with reference):")
+            logger.info(
+                f"  Mean difference (actual - ref): {sum(diffs) / len(diffs):.2f} tokens")
+            logger.info(f"  Min difference: {min(diffs)} tokens")
+            logger.info(f"  Max difference: {max(diffs)} tokens")
+            if ratios:
+                logger.info(
+                    f"  Mean ratio (actual / ref): {sum(ratios) / len(ratios):.3f}x")
+
+            # Count samples that are longer/shorter
+            longer = sum(1 for d in diffs if d > 0)
+            shorter = sum(1 for d in diffs if d < 0)
+            exact = sum(1 for d in diffs if d == 0)
+            for label, count in (("Longer than reference", longer),
+                                 ("Shorter than reference", shorter),
+                                 ("Exact match", exact)):
+                logger.info(f"  {label}: {count} ({count/len(diffs)*100:.1f}%)")
+
+    logger.info("=" * 80)
+
+    # Show sample output
+    if results:
+        logger.info("Sample output (first entry):")
+        sample = results[0]
+        logger.info(f"  qsl_idx: {sample['qsl_idx']}")
+        logger.info(f"  num_tokens: {sample['num_tokens']}")
+        logger.info(f"  text preview: {sample['text'][:200]}...")
+        logger.info("=" * 80)
+
+    # Generate histogram plots if plot directory is specified
+    if args.plot_dir:
+        logger.info("\n" + "=" * 80)
+        logger.info("Generating Histogram Plots")
+        logger.info("=" * 80)
+        plot_histograms(
+            results, args.plot_dir, has_reference=reference_df is not None)
+        logger.info("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/mlperf/__init__.py b/language/gpt-oss/mlperf/__init__.py
new file mode 100644
index 0000000000..c5aaa0d243
--- /dev/null
+++ b/language/gpt-oss/mlperf/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""MLPerf inference integration for gpt-oss."""
+
+from .base_sut import BaseSUT
+from .offline_sut import OfflineSUT
+from .server_sut import ServerSUT
+from .qsl import QuerySampleLibrary
+
+__all__ = [
+    "BaseSUT",
+    "OfflineSUT",
+    "ServerSUT",
+    "QuerySampleLibrary",
+]
diff --git a/language/gpt-oss/mlperf/base_sut.py b/language/gpt-oss/mlperf/base_sut.py
new file mode 100644
index 0000000000..f96af57e16
--- /dev/null
+++ b/language/gpt-oss/mlperf/base_sut.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Base System Under Test (SUT) class for MLPerf inference benchmarks."""
+
+import abc
+import logging
+import threading
+from typing import List, Dict, Any, Optional
+import mlperf_loadgen as lg
+
+logger = logging.getLogger(__name__)
+
+
+class BaseSUT(abc.ABC):
+    """Abstract System Under Test (SUT) shared by the MLPerf scenarios.
+
+    Concrete SUTs supply the two LoadGen callbacks:
+    - issue_queries: accept query samples for processing
+    - flush_queries: wait until every issued query has completed
+    The base class wires them into LoadGen and handles teardown.
+    """
+
+    def __init__(
+            self, backend, dataset: List[List[int]], name: str = "BaseSUT", progress_bar=None):
+        """Store the collaborators and reset per-run state.
+
+        Args:
+            backend: Backend instance for inference
+            dataset: List of tokenized prompts
+            name: Name of the SUT for logging purposes
+            progress_bar: Optional tqdm progress bar for real-time updates
+        """
+        self.name = name
+        self.backend = backend
+        self.dataset = dataset
+        self.progress_bar = progress_bar
+        self.results = {}
+        self.sut = None  # LoadGen handle; populated by start()
+
+        # Event that stop() sets to request a graceful shutdown
+        self.should_stop = threading.Event()
+
+        logger.info(f"Initializing {self.name}")
+
+    @abc.abstractmethod
+    def issue_queries(self, query_samples: List[lg.QuerySample]) -> None:
+        """Enqueue query samples for processing.
+
+        Implementations are expected to hand the samples off and return
+        right away rather than block until they finish.
+
+        Args:
+            query_samples: List of MLPerf LoadGen query samples to process
+        """
+        raise NotImplementedError("Subclasses must implement issue_queries")
+
+    @abc.abstractmethod
+    def flush_queries(self) -> None:
+        """Block until all previously issued queries have completed.
+
+        Registered with LoadGen as the flush callback (see start()), so
+        implementations should return only once no work is outstanding.
+        """
+        raise NotImplementedError("Subclasses must implement flush_queries")
+
+    def start(self) -> lg.ConstructSUT:
+        """Register both callbacks with LoadGen.
+
+        Returns:
+            LoadGen SUT handle for use with LoadGen
+        """
+        handle = lg.ConstructSUT(self.issue_queries, self.flush_queries)
+        self.sut = handle
+        logger.info(f"{self.name} started")
+        return handle
+
+    def stop(self) -> None:
+        """Request shutdown and release the LoadGen handle, if any.
+
+        Subclasses that own extra resources (tasks, queues, ...) should
+        override this to clean them up as well.
+        """
+        logger.info(f"Stopping {self.name}...")
+
+        # Tell workers/tasks watching the event to wind down
+        self.should_stop.set()
+
+        if not self.sut:
+            return  # start() was never called or stop() already ran
+        lg.DestroySUT(self.sut)
+        self.sut = None
+        logger.info(f"{self.name} stopped")
+
+    def get_results(self) -> Dict[int, Any]:
+        """Return the results collected so far.
+
+        Returns:
+            Dictionary mapping query IDs to results
+        """
+        return self.results
+
+    def __enter__(self):
+        """Start the SUT on entering a ``with`` block."""
+        return self.start()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Stop the SUT on leaving the ``with`` block."""
+        self.stop()
diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py
new file mode 100644
index 0000000000..53f436ea4c
--- /dev/null
+++ b/language/gpt-oss/mlperf/offline_sut.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Offline scenario SUT implementation for gpt-oss."""
+
+import logging
+import numpy as np
+import time
+from typing import List, Dict, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import mlperf_loadgen as lg
+from tqdm import tqdm
+from .base_sut import BaseSUT
+
+logger = logging.getLogger(__name__)
+
+
+class OfflineSUT(BaseSUT):
+    """Offline scenario System Under Test.
+
+    In the Offline scenario, all queries are issued at once and can be
+    processed in any order. This allows for maximum batching and throughput.
+    """
+
+    def __init__(
+        self,
+        backend,
+        dataset: List[List[int]],
+        max_tokens: int = 32768,
+        temperature: float = 0.001,
+        top_k: int = 1,
+        top_p: float = 1.0,
+        name: str = "OfflineSUT",
+        progress_bar=None,
+        max_concurrency: int = 128
+    ):
+        """Set up sampling parameters and the pending-query buffer.
+
+        Args:
+            backend: Inference backend; ``generate`` is called on it per query
+            dataset: Tokenized prompts, indexed by LoadGen sample index
+            max_tokens: Upper bound on generated tokens per query
+            temperature: Sampling temperature passed to the backend
+            top_k: Top-k sampling parameter passed to the backend
+            top_p: Top-p sampling parameter passed to the backend
+            name: Name of the SUT, forwarded to the base class
+            progress_bar: Optional progress bar, forwarded to the base class
+            max_concurrency: Thread-pool size used when flushing queries
+        """
+        super().__init__(backend, dataset, name, progress_bar)
+        # Generation settings forwarded to the backend for every query.
+        self.max_tokens, self.temperature = max_tokens, temperature
+        self.top_k, self.top_p = top_k, top_p
+
+        # Queries buffered by issue_queries() until flush_queries() runs.
+        self.pending_queries = []
+        self.max_concurrency = max_concurrency
+        logger.info(
+            f"OfflineSUT configured with max_concurrency={max_concurrency} (backend handles batching)")
+
+    def issue_queries(self, query_samples: List[lg.QuerySample]) -> None:
+        """Buffer the incoming samples; nothing is sent to the backend here.
+
+        The buffered samples are processed later by ``flush_queries``.
+
+        Args:
+            query_samples: List of MLPerf LoadGen query samples
+        """
+        n_received = len(query_samples)
+        logger.info(f"Received {n_received} queries")
+
+        bar = self.progress_bar
+        if bar is not None:
+            # The bar's total is overwritten (not accumulated) on each call.
+            bar.total = n_received
+            bar.refresh()
+
+        self.pending_queries.extend(query_samples)
+
+    def flush_queries(self) -> None:
+        """Process all accumulated queries with concurrent requests.
+
+        Every pending query is submitted as its own backend request to a
+        thread pool of ``max_concurrency`` workers, and each result is
+        reported to LoadGen as soon as its request finishes.
+
+        A request that raises is logged, recorded with empty output and
+        reported to LoadGen with an empty response, so LoadGen is not left
+        waiting for it. Queries skipped or cancelled because ``should_stop``
+        was set are not reported. The pending list is cleared on every exit.
+
+        Raises:
+            Exception: Errors outside the per-query handling are logged and
+                re-raised.
+        """
+        if not self.pending_queries:
+            logger.info("No pending queries to flush")
+            return
+
+        total = len(self.pending_queries)
+        logger.info(
+            f"Flushing {total} queries with max_concurrency={self.max_concurrency}")
+        start_time = time.time()
+
+        def process_single_query(query_sample):
+            """Run one backend request; return None when shutting down."""
+            if self.should_stop.is_set():
+                logger.info(f"Skipping query {query_sample.id} due to shutdown")
+                return None
+
+            # One prompt per call: concurrent requests are left to the
+            # backend to batch.
+            responses = self.backend.generate(
+                prompts=[self.dataset[query_sample.index]],
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+                top_k=self.top_k,
+                top_p=self.top_p
+            )
+            return responses[0]
+
+        def report_to_loadgen(query_id, output_ids):
+            """Send one finished sample (possibly empty) to LoadGen."""
+            if output_ids:
+                # LoadGen expects int32 token IDs as a contiguous array;
+                # token_array stays referenced until the call below returns.
+                token_array = np.ascontiguousarray(output_ids, dtype=np.int32)
+                data_ptr = token_array.ctypes.data
+                data_size = token_array.nbytes
+                n_tokens = len(output_ids)
+            else:
+                data_ptr = data_size = n_tokens = 0
+
+            # n_tokens is the output token count used for the tokens/sec metric.
+            lg.QuerySamplesComplete(
+                [lg.QuerySampleResponse(query_id, data_ptr, data_size, n_tokens)])
+
+            if self.progress_bar is not None:
+                self.progress_bar.update(1)
+                self.progress_bar.refresh()
+
+        completed_count = 0
+        failed_count = 0
+        cancelled_count = 0
+        try:
+            logger.info(
+                f"Submitting {total} queries to {self.max_concurrency} concurrent workers...")
+            with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
+                # Keep the sample behind each future so that a failed request
+                # can still be reported under its query id.
+                future_to_sample = {
+                    executor.submit(process_single_query, qs): qs
+                    for qs in self.pending_queries
+                }
+
+                for future in as_completed(future_to_sample):
+                    if self.should_stop.is_set():
+                        logger.info("Shutdown requested, cancelling remaining futures...")
+                        for f in future_to_sample:
+                            f.cancel()
+                        cancelled_count = sum(
+                            1 for f in future_to_sample if f.cancelled())
+                        logger.info(f"Cancelled {cancelled_count} pending futures")
+                        break
+
+                    query_id = future_to_sample[future].id
+                    try:
+                        response = future.result()
+                    except Exception as e:
+                        logger.error(
+                            f"Error processing query {query_id}: {e}", exc_info=True)
+                        # Fall through with an empty response instead of
+                        # dropping the sample.
+                        failed_count += 1
+                        response = {}
+
+                    if response is None:
+                        # Skipped by the worker because shutdown was requested.
+                        continue
+
+                    try:
+                        output_ids = response.get("output_ids", [])
+                        self.results[query_id] = {
+                            "output_ids": output_ids,
+                            "output_text": response.get("output_text", ""),
+                            "metadata": response.get("metadata", {})
+                        }
+                        report_to_loadgen(query_id, output_ids)
+                    except Exception as e:
+                        logger.error(
+                            f"Error reporting query {query_id}: {e}", exc_info=True)
+                        continue
+
+                    completed_count += 1
+                    # Debug-level only; the progress bar shows live progress.
+                    if completed_count % 100 == 0:
+                        logger.debug(f"Completed {completed_count}/{total} queries")
+
+            elapsed = time.time() - start_time
+            if cancelled_count > 0:
+                logger.info(
+                    f"Completed {completed_count} queries, cancelled {cancelled_count} queries "
+                    f"in {elapsed:.2f}s"
+                )
+            else:
+                # Report what actually finished rather than what was issued.
+                qps = completed_count / elapsed if elapsed > 0 else 0.0
+                logger.info(
+                    f"Completed {completed_count}/{total} queries "
+                    f"({failed_count} failed) in {elapsed:.2f}s ({qps:.2f} QPS)"
+                )
+
+        except Exception as e:
+            logger.error(f"Error during concurrent flush: {e}", exc_info=True)
+            raise
+        finally:
+            # Clear pending queries
+            self.pending_queries = []
diff --git a/language/gpt-oss/mlperf/qsl.py b/language/gpt-oss/mlperf/qsl.py
new file mode 100644
index 0000000000..e7b06a1bb8
--- /dev/null
+++ b/language/gpt-oss/mlperf/qsl.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Query Sample Library for gpt-oss MLPerf integration."""
+
+import logging
+from typing import List
+import mlperf_loadgen as lg
+
+logger = logging.getLogger(__name__)
+
+
+class QuerySampleLibrary:
+    """LoadGen Query Sample Library backed by an in-memory dataset.
+
+    The LoadGen handle (``self.qsl``) exists only inside the ``with`` block.
+    """
+
+    def __init__(self, dataset: List[List[int]]):
+        """Keep a reference to the dataset; no LoadGen handle is built yet.
+
+        Args:
+            dataset: List of tokenized prompts (list of token ID lists)
+        """
+        self.qsl = None
+        self.dataset = dataset
+        logger.info(f"Initializing QSL with {len(dataset)} samples")
+
+    def load_query_samples(self, sample_indices: List[int]) -> None:
+        """LoadGen load callback; only logs, the dataset is already in memory.
+
+        Args:
+            sample_indices: List of sample indices to load
+        """
+        count = len(sample_indices)
+        logger.info(f"Loading {count} query samples")
+
+    def unload_query_samples(self, sample_indices: List[int]) -> None:
+        """LoadGen unload callback; only logs, nothing is released.
+
+        Args:
+            sample_indices: List of sample indices to unload
+        """
+        count = len(sample_indices)
+        logger.info(f"Unloading {count} query samples")
+
+    def __len__(self) -> int:
+        """Return how many samples the dataset holds."""
+        return len(self.dataset)
+
+    def __enter__(self):
+        """Construct the LoadGen QSL handle and return this wrapper."""
+        n_samples = len(self.dataset)
+        # Total count and performance sample count are both the dataset size.
+        callbacks = (self.load_query_samples, self.unload_query_samples)
+        self.qsl = lg.ConstructQSL(n_samples, n_samples, *callbacks)
+        logger.info("QSL constructed")
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Destroy the LoadGen QSL handle if one is held."""
+        if not self.qsl:
+            # Never constructed, or already destroyed.
+            return
+        lg.DestroyQSL(self.qsl)
+        self.qsl = None
+        logger.info("QSL destroyed")
diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py
new file mode 100644
index 0000000000..2f9c83532f
--- /dev/null
+++ b/language/gpt-oss/mlperf/server_sut.py
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+"""Server scenario SUT implementation with streaming support for gpt-oss."""
+
+import asyncio
+import logging
+import numpy as np
+import queue
+import sys
+import threading
+import time
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional
+import mlperf_loadgen as lg
+from tqdm import tqdm
+
+from .base_sut import BaseSUT
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StreamingQueryState:
+    """Mutable per-query bookkeeping for one streaming generation."""
+    query_sample: lg.QuerySample  # sample as received from LoadGen
+    query_id: int  # ``query_sample.id``; id used in LoadGen responses
+    input_ids: List[int]  # prompt token IDs looked up from the dataset
+    accumulated_tokens: List[int]  # output token IDs received so far
+    accumulated_text: str  # output text received so far
+    first_token_received: bool  # True once the first-token chunk was handled
+    first_token_time: Optional[float]  # time.time() at first token, else None
+    start_time: float  # time.time() when processing of the query began
+    finished: bool  # True once the final response has been triggered
+
+
+class ServerSUT(BaseSUT):
+    """Server scenario SUT with streaming support.
+
+    Properly reports FirstTokenComplete and QuerySamplesComplete to LoadGen.
+    """
+
+    def __init__(
+        self,
+        backend,
+        dataset: List[List[int]],
+        max_tokens: int = 32768,
+        temperature: float = 0.001,
+        top_k: int = 1,
+        top_p: float = 1.0,
+        num_workers: int = 1,
+        name: str = "ServerSUT",
+        progress_bar=None
+    ):
+        """Store generation settings and create idle runtime state.
+
+        Args:
+            backend: Inference backend; must provide ``generate_stream``
+            dataset: Tokenized prompts, indexed by LoadGen sample index
+            max_tokens: Upper bound on generated tokens per query
+            temperature: Sampling temperature passed to the backend
+            top_k: Top-k sampling parameter passed to the backend
+            top_p: Top-p sampling parameter passed to the backend
+            num_workers: Number of queue-consumer threads started by ``start``
+            name: Name of the SUT, forwarded to the base class
+            progress_bar: Optional progress bar, forwarded to the base class
+        """
+        super().__init__(backend, dataset, name, progress_bar)
+        self.num_workers = num_workers
+
+        # Generation settings forwarded to the backend for every request.
+        self.max_tokens, self.temperature = max_tokens, temperature
+        self.top_k, self.top_p = top_k, top_p
+
+        # Hand-off queue filled by issue_queries() and drained by the workers.
+        self.query_queue = queue.Queue()
+        self.workers = []
+
+        # In-flight streaming state per query id, guarded by its own lock.
+        self.active_streams_lock = threading.Lock()
+        self.active_streams: Dict[int, StreamingQueryState] = {}
+
+        # asyncio tasks currently running, tracked so stop() can cancel them.
+        self.active_tasks_lock = threading.Lock()
+        self.active_tasks = set()
+
+        # Completed-query counter, updated together with the progress bar.
+        self.progress_lock = threading.Lock()
+        self.queries_completed = 0
+
+        # Event loop and its thread; both are created in _start_event_loop().
+        self.loop = None
+        self.loop_thread = None
+
+        logger.info(
+            f"ServerSUT configured with num_workers={num_workers} (streaming enabled)")
+
+    def start(self) -> lg.ConstructSUT:
+        """Bring up the event loop and workers, then build the LoadGen SUT.
+
+        Returns:
+            The handle produced by ``lg.ConstructSUT``.
+        """
+        # Loop first: the workers schedule their coroutines on it.
+        self._start_event_loop()
+        self._start_workers()
+
+        handle = lg.ConstructSUT(self.issue_queries, self.flush_queries)
+        self.sut = handle
+        logger.info(f"{self.name} started with streaming support")
+        return handle
+
+    def _start_event_loop(self):
+        """Run a new asyncio loop in a daemon thread; return once it is running."""
+        loop_running = threading.Event()
+
+        def run_loop():
+            self.loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self.loop)
+            # Runs as the loop's first callback, i.e. once run_forever() is live.
+            self.loop.call_soon(loop_running.set)
+            self.loop.run_forever()
+
+        self.loop_thread = threading.Thread(target=run_loop, daemon=True)
+        self.loop_thread.start()
+        loop_running.wait()  # block on the event instead of polling self.loop
+        logger.info("Async event loop started")
+
+    def _start_workers(self):
+        """Spawn ``num_workers`` daemon threads running ``_worker_thread``."""
+        for idx in range(self.num_workers):
+            thread = threading.Thread(
+                name=f"ServerWorker-{idx}",
+                target=self._worker_thread,
+                daemon=True,
+            )
+            self.workers.append(thread)
+            thread.start()
+        logger.info(f"Started {self.num_workers} worker threads")
+
+    def _worker_thread(self):
+        """Move queued samples onto the event loop until ``should_stop`` is set.
+
+        Scheduling is fire-and-forget: the worker never waits on the coroutine.
+        """
+        try:
+            while not self.should_stop.is_set():
+                try:
+                    # Short timeout so the stop flag is re-checked regularly.
+                    sample = self.query_queue.get(timeout=0.1)
+                except queue.Empty:
+                    continue
+                except KeyboardInterrupt:
+                    logger.info(
+                        "Worker thread interrupted, exiting gracefully...")
+                    break
+                if not self.loop or self.should_stop.is_set():
+                    # Dropped: no loop, or shutdown began after the dequeue.
+                    continue
+                asyncio.run_coroutine_threadsafe(
+                    self._process_streaming_query_tracked(sample), self.loop)
+        except Exception as e:
+            logger.error(f"Worker thread error: {e}", exc_info=True)
+
+    async def _process_streaming_query_tracked(
+            self, query_sample: lg.QuerySample):
+        """Run ``_process_streaming_query`` while registered in ``active_tasks``.
+
+        The registration lets ``stop()`` find and cancel the running task.
+        """
+        me = asyncio.current_task()
+        with self.active_tasks_lock:
+            self.active_tasks.add(me)
+        try:
+            await self._process_streaming_query(query_sample)
+        finally:
+            # Always deregister, including on cancellation or error.
+            with self.active_tasks_lock:
+                self.active_tasks.discard(me)
+
+    async def _process_streaming_query(self, query_sample: lg.QuerySample):
+        """Stream one query from the backend and report it to LoadGen.
+
+        Reporting: on the first-token chunk → lg.FirstTokenComplete([token_0]);
+        at the end → lg.QuerySamplesComplete([token_1, token_2, ..., token_n]).
+
+        Args:
+            query_sample: MLPerf LoadGen query sample
+        """
+        query_id = query_sample.id
+        sample_idx = query_sample.index
+        input_ids = self.dataset[sample_idx]
+
+        # Fresh per-query state; tokens and text are appended chunk by chunk
+        state = StreamingQueryState(
+            query_sample=query_sample,
+            query_id=query_id,
+            input_ids=input_ids,
+            accumulated_tokens=[],
+            accumulated_text="",
+            first_token_received=False,
+            first_token_time=None,
+            start_time=time.time(),
+            finished=False
+        )
+
+        with self.active_streams_lock:
+            self.active_streams[query_id] = state
+
+        try:
+            # Consume chunks as the backend produces them
+            async for chunk in self.backend.generate_stream(
+                input_ids=input_ids,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+                top_k=self.top_k,
+                top_p=self.top_p
+            ):
+                # Accumulate before reporting so the first token is available
+                if chunk.get("delta_token_ids"):
+                    state.accumulated_tokens.extend(chunk["delta_token_ids"])
+                if chunk.get("delta_text"):
+                    state.accumulated_text += chunk["delta_text"]
+
+                # Report the first token at most once per query
+                if chunk.get(
+                        "is_first_token") and not state.first_token_received:
+                    state.first_token_received = True
+                    state.first_token_time = time.time()
+                    await self._send_first_token_complete(state)
+
+                # Explicit end-of-stream marker from the backend
+                if chunk.get("is_finished"):
+                    state.finished = True
+                    await self._send_final_response(state)
+                    break
+
+            # Stream ended without an "is_finished" chunk: still complete it
+            if not state.finished:
+                state.finished = True
+                await self._send_final_response(state)
+
+        except asyncio.CancelledError:
+            # Task was cancelled (e.g., KeyboardInterrupt during graceful
+            # shutdown)
+            logger.info(
+                f"Streaming query {query_id} cancelled during shutdown")
+            # Don't send response to LoadGen - we're shutting down
+            raise  # Re-raise to mark task as cancelled
+        except Exception as e:
+            logger.error(
+                f"Error processing streaming query {query_id}: {e}",
+                exc_info=True)
+            # Complete the sample with whatever was accumulated to unblock LoadGen
+            try:
+                await self._send_final_response(state)
+            except BaseException:  # NOTE(review): also swallows CancelledError
+                pass
+        finally:
+            # Deregister on every exit path (success, error or cancellation)
+            with self.active_streams_lock:
+                self.active_streams.pop(query_id, None)
+
+    async def _send_first_token_complete(self, state: StreamingQueryState):
+        """Report the first generated token of ``state`` to LoadGen.
+
+        Only ``accumulated_tokens[0]`` is sent. Any exception raised here is
+        logged and swallowed, so the caller's stream is not interrupted.
+        """
+        try:
+            logger.debug(
+                f"First token for query {state.query_id} at {state.first_token_time - state.start_time:.3f}s")
+
+            tokens = state.accumulated_tokens
+            if not tokens:
+                # Flagged as first token but nothing accumulated: send empty.
+                logger.warning(
+                    f"FirstTokenComplete called but no tokens accumulated for query {state.query_id}")
+                token_array = np.array([], dtype=np.int32)
+            else:
+                # Only the very first token is reported.
+                token_array = np.ascontiguousarray(tokens[:1], dtype=np.int32)
+
+            # A null pointer is passed when there is no data to point at.
+            data_ptr = token_array.ctypes.data if token_array.size > 0 else 0
+            first_response = lg.QuerySampleResponse(
+                state.query_id,
+                data_ptr,
+                token_array.nbytes,
+                len(token_array)
+            )
+
+            # LoadGen uses this call to measure Time To First Token (TTFT).
+            lg.FirstTokenComplete([first_response])
+            logger.debug(
+                f"Sent FirstTokenComplete for query {state.query_id}: 1 token")
+
+        except Exception as e:
+            logger.error(
+                f"Error sending FirstTokenComplete for query {state.query_id}: {e}",
+                exc_info=True)
+
+    async def _send_final_response(self, state: StreamingQueryState):
+        """Record the finished query and report it to LoadGen.
+
+        ``self.results`` keeps every token, while the LoadGen response carries
+        only the tokens after the first one. Exceptions are logged, not raised.
+        """
+        try:
+            all_tokens = state.accumulated_tokens
+            num_total_tokens = len(all_tokens)
+            logger.debug(
+                f"Final response for query {state.query_id}: {num_total_tokens} total tokens")
+
+            # Keep the full output (all tokens) for internal tracking.
+            latency = time.time() - state.start_time
+            if state.first_token_time:
+                ttft = state.first_token_time - state.start_time
+            else:
+                ttft = None
+            self.results[state.query_id] = {
+                "output_ids": all_tokens,
+                "output_text": state.accumulated_text,
+                "metadata": {"latency": latency, "ttft": ttft},
+            }
+
+            # The first token is left out of the LoadGen payload.
+            if num_total_tokens > 1:
+                token_array = np.ascontiguousarray(all_tokens[1:], dtype=np.int32)
+            else:
+                token_array = np.array([], dtype=np.int32)
+
+            data_ptr = token_array.ctypes.data if token_array.size > 0 else 0
+            final_response = lg.QuerySampleResponse(
+                state.query_id, data_ptr, token_array.nbytes, len(token_array))
+            lg.QuerySamplesComplete([final_response])
+            logger.debug(
+                f"Sent QuerySamplesComplete for query {state.query_id}: "
+                f"{len(token_array)} remaining tokens (total: {num_total_tokens})"
+            )
+
+            if self.progress_bar is None:
+                return
+            with self.progress_lock:
+                self.queries_completed += 1
+                self.progress_bar.update(1)
+                # Explicit redraw and flush: this runs on the event-loop thread.
+                self.progress_bar.refresh()
+                sys.stdout.flush()
+
+        except Exception as e:
+            logger.error(
+                f"Error sending final response for query {state.query_id}: {e}",
+                exc_info=True)
+
+    def issue_queries(self, query_samples: List[lg.QuerySample]) -> None:
+        """Enqueue the samples for the worker threads.
+
+        Also grows the progress bar total by the number of new samples.
+
+        Args:
+            query_samples: List of MLPerf LoadGen query samples
+        """
+        bar = self.progress_bar
+        if bar is not None:
+            with self.progress_lock:
+                # An unset (falsy) total counts as zero.
+                bar.total = (bar.total or 0) + len(query_samples)
+                bar.refresh()
+
+        for sample in query_samples:
+            self.query_queue.put(sample)
+
+    def flush_queries(self) -> None:
+        """Block until the queue is empty and no stream is active.
+
+        Both conditions are polled together every 10 ms; there is no timeout.
+        """
+        logger.info("Flushing server queries...")
+
+        while True:
+            # NOTE(review): a sample already dequeued by a worker but not yet
+            # registered in active_streams looks finished here - confirm.
+            queue_is_empty = self.query_queue.empty()
+            with self.active_streams_lock:
+                streams_idle = not self.active_streams
+
+            if queue_is_empty and streams_idle:
+                break
+
+            time.sleep(0.01)
+
+        logger.info("Server queries flushed")
+
+    def stop(self) -> None:
+        """Stop workers, streaming tasks and the event loop, then the SUT.
+
+        Returns immediately if ``should_stop`` is already set. The LoadGen SUT
+        is destroyed last, after the workers and the event loop have been shut
+        down; task cancellation is requested on the event-loop thread.
+        """
+        if self.should_stop.is_set():
+            logger.info(f"{self.name} already stopping or stopped.")
+            return
+
+        # Only raise the flag here; the base-class stop() call at the end of
+        # this method destroys the SUT once everything else is down.
+        self.should_stop.set()
+
+        logger.info("Cancelling active streaming tasks...")
+        with self.active_tasks_lock:
+            tasks_to_cancel = list(self.active_tasks)
+        if tasks_to_cancel:
+            logger.info(f"Cancelling {len(tasks_to_cancel)} active tasks")
+            for task in tasks_to_cancel:
+                if not task.done():
+                    # asyncio tasks are not thread-safe: cancel on the loop thread.
+                    self.loop.call_soon_threadsafe(task.cancel)
+
+        # Drop queries that no worker has picked up yet.
+        pending_count = 0
+        try:
+            while True:
+                self.query_queue.get_nowait()
+                pending_count += 1
+        except queue.Empty:
+            pass
+        if pending_count > 0:
+            logger.info(f"Cleared {pending_count} pending queries from queue")
+
+        with tqdm(total=len(self.workers), desc="Stopping workers", unit="worker") as pbar:
+            for i, worker in enumerate(self.workers):
+                worker.join(timeout=5)
+                if worker.is_alive():
+                    logger.warning(
+                        f"Worker {i+1} did not terminate gracefully")
+                pbar.update(1)
+
+        if self.loop:
+            self.loop.call_soon_threadsafe(self.loop.stop)
+            if self.loop_thread:
+                self.loop_thread.join(timeout=2)
+        logger.info("All workers stopped")
+        # Destroy the LoadGen SUT last.
+        super().stop()
diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf
new file mode 100644
index 0000000000..a3b21cad1e
--- /dev/null
+++ b/language/gpt-oss/mlperf/user.conf
@@ -0,0 +1,13 @@
+gpt-oss-120b.Offline.target_qps = 10.0
+gpt-oss-120b.Offline.min_duration = 60000
+gpt-oss-120b.Offline.min_query_count = 8036
+# gpt-oss-120b.Offline.min_query_count = 1724
+
+gpt-oss-120b.Server.target_qps = 10
+gpt-oss-120b.Server.min_duration = 60000
+gpt-oss-120b.Server.min_query_count = 8036
+# gpt-oss-120b.Server.min_query_count = 1724
+
+gpt-oss-120b.Server.target_latency = 0
+gpt-oss-120b.Server.ttft_latency = 2000
+gpt-oss-120b.Server.tpot_latency = 20
diff --git a/language/gpt-oss/preprocess/harmonize_inputs.py b/language/gpt-oss/preprocess/harmonize_inputs.py
new file mode 100644
index 0000000000..510ec81195
--- /dev/null
+++ b/language/gpt-oss/preprocess/harmonize_inputs.py
@@ -0,0 +1,548 @@
+#!/usr/bin/env python3
+"""
+Multi-Shot Prompting with OpenAI Harmony Format
+
+This script constructs and tokenizes zero/multi-shot prompts using the openai-harmony Python package
+for math500, aime1983/aime2025, livecodebench, mmlu/mmlu_pro, gpqa, healthbench and arxiv/pubmed summarization rows with gpt-oss model series.
+"""
+
+import argparse
+import json
+import os
+import pandas as pd
+from multiprocessing import Pool, cpu_count
+from tqdm import tqdm
+from openai_harmony import (
+    load_harmony_encoding,
+    HarmonyEncodingName,
+    Role,
+    Message,
+    Conversation,
+    SystemContent,
+    DeveloperContent,
+    ReasoningEffort
+)
+# Brevity directive interpolated into the math500, aime1983, livecodebench, mmlu and gpqa developer instructions
+MOD_PROMPT = "Do not repeat steps and output the final answer immediately once you have it. Once you have a candidate answer, do not spend more than ~100 tokens to verify it - instead, do a quick check and answer immediately. Avoid thinking for a long time. It is important to answer in as few tokens as possible."
+
+
+def _create_base_messages(reasoning_effort, instructions):
+    """
+    Creates system and developer messages for a conversation.
+
+    Args:
+        reasoning_effort: ReasoningEffort enum value
+        instructions: String containing developer instructions
+
+    Returns:
+        list: List containing system and developer messages
+    """
+    # Create the system message with required channels
+    system_message = (
+        SystemContent.new()
+        .with_reasoning_effort(reasoning_effort)
+        .with_conversation_start_date("2025-09-30")
+        .with_required_channels(["analysis", "commentary", "final"])
+    )
+
+    # Create the developer message with instructions
+    developer_message = DeveloperContent.new().with_instructions(instructions)
+
+    return [
+        Message.from_role_and_content(Role.SYSTEM, system_message),
+        Message.from_role_and_content(Role.DEVELOPER, developer_message),
+    ]
+
+
+def _add_multishot_examples(messages, examples):
+    """
+    Adds multi-shot examples to a message list.
+
+    Args:
+        messages: List of messages to append examples to
+        examples: List of tuples (user_content, assistant_content) representing examples
+
+    Returns:
+        list: Updated messages list with examples added
+    """
+    for user_content, assistant_content in examples:
+        messages.append(Message.from_role_and_content(Role.USER, user_content))
+        messages.append(
+            Message.from_role_and_content(
+                Role.ASSISTANT, assistant_content).with_channel("final")
+        )
+    return messages
+
+
+def _finalize_conversation(messages, user_query=None):
+    """
+    Adds the user query, creates the conversation, and renders tokens.
+
+    Args:
+        messages: List of messages (system, developer, and optionally examples)
+        user_query: The actual user query to solve
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    # Load the Harmony encoding for gpt-oss models
+    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+    # Add the actual problem to solve
+    if user_query is not None:
+        messages.append(Message.from_role_and_content(Role.USER, user_query))
+    # Create the conversation
+    convo = Conversation.from_messages(messages)
+
+    # Render the conversation for completion (ready to send to the model)
+    tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT)
+
+    return convo, tokens
+
+
+def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a 5-shot prompt for a math500 problem using Harmony format (user_query is the final user turn).
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are a math expert that solves problems step-by-step. "
+        "Always show your work clearly and put your final answer in \\boxed{answer} format. "
+        f"{MOD_PROMPT} "
+        "Follow the format shown in the examples below. "
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    # Five (question, worked solution) pairs; every solution ends with a \boxed{...} answer
+    examples = [
+        # Example 1: Square areas and side lengths
+        (
+            "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?",
+            "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\n\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$"
+        ),
+        # Example 2: Floor function equation
+        (
+            "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$",
+            "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\n\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\n\\[2n = n + 3,\\]which means $n = 3.$\n\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\n\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$"
+        ),
+        # Example 3: Sequences and differences
+        (
+            "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\n\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\n\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$",
+            "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$"
+        ),
+        # Example 4: Probability and Deal or No Deal
+        (
+            "In the game Deal or No Deal, participants choose a box at random from a set of $26,$ one containing each of the following values: \\begin{tabular}{|c|c|}\\hline\\$.01&\\$1,000\\\\\\hline\\$1&\\$5,000\\\\\\hline\\$5&\\$10,000\\\\\\hline\\$10&\\$25,000\\\\\\hline\\$25&\\$50,000\\\\\\hline\\$50&\\$75,000\\\\\\hline\\$75&\\$100,000\\\\\\hline\\$100&\\$200,000\\\\\\hline\\$200&\\$300,000\\\\\\hline\\$300&\\$400,000\\\\\\hline\\$400&\\$500,000\\\\\\hline\\$500&\\$750,000\\\\\\hline\\$750&\\$1,000,000\\\\\\hline\\end{tabular} After choosing a box, participants eliminate other boxes by opening them, showing the amount of money in the box to the crowd, and then removing that box (and its money!) from the game. What is the minimum number of boxes a participant needs to eliminate in order to have a half chance of holding at least $\\$100,\\!000$ as his or her chosen box?",
+            "Seven of the boxes contain at least this amount. If a participant is going to be holding one of these boxes with a probability of $1/2,$ there can be at most $7$ other boxes left. This means that at least $26-7-7=\\boxed{12}$ boxes must be eliminated."
+        ),
+        # Example 5: Domain of composite function
+        (
+            "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$",
+            "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$  Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$  Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$  This occurs only when $x = 0.$\n\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$"
+        ),
+    ]
+
+    _add_multishot_examples(messages, examples)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a zero-shot prompt for mathematical problem solving using Harmony format.
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are a math expert that solves problems step-by-step. "
+        f"{MOD_PROMPT} "
+        "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_livecodebench_prompt(
+        user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a zero-shot prompt for a livecodebench coding problem using Harmony format.
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are a python coding expert that solves problems step-by-step. "
+        "You must provide the reasoning to arriving at your solution and the code to solve the problem."  # NOTE(review): no trailing space, so this sentence runs straight into MOD_PROMPT - confirm intended
+        f"{MOD_PROMPT} "
+        "The code should be enclosed within ```python delimiters."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a 5-shot prompt for multiple choice question answering using Harmony format (user_query is the final user turn).
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are an expert test-taker that answers multiple choice questions accurately. "
+        f"{MOD_PROMPT} "
+        "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    # Five (question, answer) pairs; each example answer is only the 'Answer: X' line
+    examples = [
+        # Example 1: Abstract Algebra - Ring Theory
+        (
+            "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\nFind the characteristic of the ring 2Z.\nA) 0\nB) 30\nC) 3\nD) 10\nE) 12\nF) 50\nG) 2\nH) 100\nI) 20\nJ) 5",
+            "Answer: A"
+        ),
+        # Example 2: Linear Algebra - Transformations
+        (
+            "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\nA) ST + TS is the identity map of V onto itself.\nB) TS = 0\nC) ST = 1\nD) ST - TS = 0\nE) ST = T\nF) ST = 0\nG) ST = TS\nH) ST - TS is the identity map of V onto itself.\nI) TS = T\nJ) ST = S",
+            "Answer: H"
+        ),
+        # Example 3: Number Theory - Diophantine Equations
+        (
+            "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\nA) -5\nB) 0\nC) -3\nD) -7\nE) -4\nF) -6\nG) -1\nH) -2\nI) -9\nJ) N/A",
+            "Answer: E"
+        ),
+        # Example 4: Differential Equations - Salt Tank Problem
+        (
+            "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\nA) 3 + e^-2\nB) 2 - e^-4\nC) 2 - e^-2\nD) 3 + e^-4\nE) 2 + e^-3\nF) 2 - e^-3\nG) 3 - e^-2\nH) 2 + e^-2\nI) 2 + e^-4\nJ) 2",
+            "Answer: I"
+        ),
+        # Example 5: Basic Arithmetic - Division
+        (
+            "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\nA) Multiply 5 by 5 to find 25 teams.\nB) Divide 30 by 5 to find 6 teams.\nC) Add 5 to 30 to find 35 teams.\nD) Subtract 30 from 5 to find -25 teams.\nE) Divide 5 by 30 to find 0.1667 teams.\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\nG) N/A\nH) N/A\nI) N/A\nJ) N/A",
+            "Answer: B"
+        ),
+    ]
+
+    _add_multishot_examples(messages, examples)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a 2-shot prompt for Biology, Physics and Chemistry question answering using Harmony format.
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are an expert in Biology, Physics and Chemistry who answers scientific questions accurately. "
+        f"{MOD_PROMPT} "
+        "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    # Two (question, answer) pairs; each example answer is a short rationale followed by 'Answer: X'
+    examples = [
+        # Example 1: Molecular Biology - Gene Therapy
+        (
+            "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\nA) polyA tail\nB) lariat\nC) antisense\nD) R-loops",
+            "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\n\nAnswer: D"
+        ),
+        # Example 2: Stereochemistry - Optical Activity
+        (
+            "How many of the following compounds exhibit optical activity?\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\n2,3,3,3-tetrafluoroprop-1-ene\ndi(cyclohex-2-en-1-ylidene)methane\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\n[1,1'-biphenyl]-3,3'-diol\n8,8-dichlorobicyclo[4.2.0]octan-7-one\ncyclopent-2-en-1-one\nA) 6\nB) 5\nC) 4\nD) 3",
+            "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\n\nAnswer: C"
+        ),
+    ]
+
+    _add_multishot_examples(messages, examples)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a zero-shot prompt for healthbench problem solving using Harmony format.
+    prompt: list of messages (message: Dict[str, str] (content, role)) giving conversation history
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "Your task is to answer the following health-related query as a responsible assistant."
+        "Provide responses that are factually accurate, up-to-date, and consistent with current clinical guidelines. Adjust the detail, terminology, and tone based on the user's apparent background (layperson or healthcare professional)."
+        "If the evidence is limited or uncertain, clearly acknowledge this in your answer. When essential information is missing for a safe, thorough response, ask clarifying questions before proceeding."
+        "Do not provide specific medical diagnoses or emergency recommendations unless explicitly appropriate and safe to do so."
+        "All responses should promote user safety, accuracy, completeness, clear communication, and adherence to user instructions while maintaining clinical standards."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    role_map = {
+        "user": Role.USER,
+        "assistant": Role.ASSISTANT
+    }
+
+    for message in prompt:
+        role = message["role"]
+        assert role in role_map, f"Unknown role: {role}"
+        role = role_map[role]
+
+        content = message["content"]
+        messages.append(Message.from_role_and_content(role, content))
+
+    return _finalize_conversation(messages, None)
+
+
+def create_arxiv_summarization_prompt(
+        user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a zero-shot prompt for arXiv paper summarization using Harmony format.
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are an expert at reading and summarizing academic research papers. "
+        "Your task is to provide clear, concise, and accurate summaries of research papers. "
+        "Focus on the key contributions, methodology, results, and implications. "
+        "Structure your summary to be accessible while maintaining technical accuracy."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def create_default_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH):
+    """
+    Creates a default zero-shot prompt for general problem solving using Harmony format.
+    This is used when no specific dataset is specified.
+
+    Returns:
+        tuple: (conversation_object, token_list) ready for model completion
+    """
+    instructions = (
+        "You are a helpful AI assistant that solves user questions. "
+        "Provide a well-structured answer to the user's question."
+    )
+
+    messages = _create_base_messages(reasoning_effort, instructions)
+
+    return _finalize_conversation(messages, user_query)
+
+
+def process_row(args):
+    """
+    Worker function to process a single row from the dataframe.
+
+    Args:
+        args: tuple of (index, row, dataset_function_map, reasoning_effort)
+
+    Returns:
+        tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error)
+    """
+    index, row, dataset_function_map, reasoning_effort = args
+
+    # Check if dataset column exists, use default if not
+    if "dataset" in row:
+        dataset_name = row["dataset"]
+    else:
+        dataset_name = "default"
+
+    if dataset_name == "healthbench":
+        user_query = row["prompt"]
+    else:
+        # Try to get question from common column names
+        if "question" in row:
+            user_query = row["question"]
+        elif "prompt" in row:
+            user_query = row["prompt"]
+        elif "query" in row:
+            user_query = row["query"]
+        else:
+            error_msg = f"No query column found (tried: question, prompt, query) at index {index}"
+            return (index, None, None, dataset_name, error_msg)
+
+    try:
+        # Get the appropriate function based on dataset type
+        if dataset_name in dataset_function_map:
+            create_prompt_func = dataset_function_map[dataset_name]
+            convo, tokens = create_prompt_func(user_query, reasoning_effort)
+            return (index, convo, tokens, dataset_name)
+        else:
+            error_msg = f"Unknown dataset '{dataset_name}' at index {index}"
+            return (index, None, None, dataset_name, error_msg)
+    except Exception as e:
+        error_msg = f"Error processing {dataset_name} at index {index}: {str(e)}"
+        return (index, None, None, dataset_name, error_msg)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data-file",
+        type=str,
+        default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl")
+    parser.add_argument("--num-processes", type=int, default=cpu_count(),
+                        help="Number of processes to use for parallel processing (default: number of CPU cores)")
+    parser.add_argument("--max-rows", type=int, default=None,
+                        help="Maximum number of rows to process (default: process all rows)")
+    parser.add_argument("--output-file", type=str, required=True,
+                        help="Output pickle file path to save the processed data as pandas DataFrame")
+    parser.add_argument("--reasoning-effort", type=str, default="high",
+                        help="Reasoning effort to use for the prompt")
+    parser.add_argument("--dataset", type=str, default=None,
+                        help="Filter to only process rows from this dataset (e.g., 'gpqa', 'mmlu_pro', 'math500', 'aime1983', 'livecodebench')")
+    args = parser.parse_args()
+    df = pd.read_pickle(args.data_file)
+
+    reasoning_effort = {
+        "high": ReasoningEffort.HIGH,
+        "medium": ReasoningEffort.MEDIUM,
+        "low": ReasoningEffort.LOW
+    }[args.reasoning_effort.lower()]
+
+    # Filter by dataset if specified
+    if args.dataset is not None:
+        if 'dataset' not in df.columns:
+            print(
+                f"WARNING: No 'dataset' column found in dataframe. Cannot filter by dataset.")
+            print(f"All rows will be processed using the default prompt function.")
+        else:
+            original_len = len(df)
+            available_datasets = sorted(df['dataset'].unique().tolist())
+            df = df[df['dataset'] == args.dataset].copy()
+            print(
+                f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)")
+            if len(df) == 0:
+                print(f"ERROR: No rows found for dataset '{args.dataset}'")
+                print(f"Available datasets: {available_datasets}")
+                import sys
+                sys.exit(1)
+
+    # Apply row limit if specified
+    if args.max_rows is not None:
+        df = df.head(args.max_rows)
+        print(f"Limited to first {args.max_rows} rows")
+
+    # Create mapping from dataset names to prompt creation functions
+    dataset_function_map = {
+        'aime1983': create_aime1983_prompt,
+        'aime2025': create_aime1983_prompt,
+        'arxiv_summarization': create_arxiv_summarization_prompt,
+        'pubmed_summarization': create_arxiv_summarization_prompt,
+        'gpqa': create_gpqa_prompt,
+        'livecodebench': create_livecodebench_prompt,
+        'math500': create_math500_prompt,
+        'mmlu_pro': create_mmlu_prompt,
+        'mmlu': create_mmlu_prompt,
+        'healthbench': create_healthbench_prompt,
+        'default': create_default_prompt,
+    }
+
+    # Prepare data for parallel processing
+    process_args = [(index, row, dataset_function_map, reasoning_effort)
+                    for index, row in df.iterrows()]
+
+    # Don't use more processes than we have rows
+    num_processes = min(args.num_processes, len(df))
+
+    print(f"Processing {len(df)} queries using {num_processes} processes...")
+
+    # Process rows in parallel with progress bar
+    with Pool(processes=num_processes) as pool:
+        results = list(tqdm(
+            pool.imap(process_row, process_args),
+            total=len(process_args),
+            desc="Processing queries",
+            unit="query"
+        ))
+
+    # Sort results by index to preserve original order
+    results.sort(key=lambda x: x[0])
+
+    # Ensure output directory exists
+    output_dir = os.path.dirname(args.output_file)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Process results and modify original DataFrame
+    successful_count = 0
+    error_count = 0
+
+    # Initialize columns for harmony tokenized input
+    df['tok_input'] = None
+    df['tok_input_len'] = None
+    df['text_input'] = None
+
+    for result in tqdm(results, desc="Processing results", unit="entry"):
+        if len(result) == 5:  # Error case
+            index, convo, tokens, dataset_name, error_msg = result
+            error_count += 1
+
+            # Update the original DataFrame with error data
+            df.at[index, 'tok_input'] = None
+            df.at[index, 'tok_input_len'] = None
+            df.at[index, 'text_input'] = None
+        else:  # Success case
+            index, convo, tokens, dataset_name = result
+            successful_count += 1
+
+            # Convert conversation to string format
+            conversation_parts = []
+            for message in convo.messages:
+                # Get role from message.author.role
+                role = message.author.role.value if hasattr(
+                    message.author.role, 'value') else str(
+                    message.author.role)
+
+                # Get content from message.content (which is a list)
+                content_parts = []
+                for content_item in message.content:
+                    if hasattr(content_item, 'text'):
+                        content_parts.append(content_item.text)
+                    else:
+                        content_parts.append(str(content_item))
+                content = ' '.join(content_parts)
+
+                # Format as "Role: content"
+                conversation_parts.append(f"{role}: {content}")
+
+            conversation_string = '\n'.join(conversation_parts)
+
+            # Update the original DataFrame with successful data
+            df.at[index, 'tok_input'] = tokens.tolist() if hasattr(
+                tokens, 'tolist') else list(tokens)
+            df.at[index, 'tok_input_len'] = len(
+                tokens) if hasattr(tokens, '__len__') else 0
+            df.at[index, 'text_input'] = conversation_string
+
+    # Verify input and output have identical column lists
+    input_columns = list(pd.read_pickle(args.data_file).columns)
+    output_columns = list(df.columns)
+    print(f"Input columns: {input_columns}")
+    print(f"Output columns: {output_columns}")
+
+    # Save the modified DataFrame as pickle
+    print("Saving modified DataFrame to pickle...")
+    df.to_pickle(args.output_file)
+
+    print(f"\nProcessing completed:")
+    print(f"  - Successfully processed: {successful_count} queries")
+    print(f"  - Errors: {error_count} queries")
+    print(f"  - Output saved to: {args.output_file} (pickle format)")
+    print(f"  - Total processed: {successful_count + error_count} queries")
+    print(f"  - Modified DataFrame shape: {df.shape}")
+    print(f"  - Updated columns: tok_input, tok_input_len, text_input")
diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt
new file mode 100644
index 0000000000..aab1573917
--- /dev/null
+++ b/language/gpt-oss/requirements.txt
@@ -0,0 +1,11 @@
+audioread>=2.1.9
+joblib>=0.14
+msgpack>=1.0
+numba>=0.51.0
+pooch>=1.0
+scikit-learn>=0.20.0
+soxr>=0.3.2
+absl-py>=2.3.1
+lazy-loader>=0.1
+datasets>=2.0.0,<3.0.0
+anthropic~=0.72.0
diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py
new file mode 100644
index 0000000000..adac3c6271
--- /dev/null
+++ b/language/gpt-oss/run_infer_trtllm.py
@@ -0,0 +1,642 @@
+#!/usr/bin/env python3
+"""
+Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint.
+Supports round-robin load balancing across multiple server endpoints.
+
+Usage:
+    python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options]
+
+Arguments:
+    --input-tokens     Path to pickle file containing data with text_input column from harmony-tokens.py
+    --server-url       TensorRT-LLM server URL(s) - comma-separated for round-robin (e.g., "localhost:8000,localhost:8001")
+    --max-samples      Maximum number of samples to process (default: all)
+    --max-tokens       Maximum tokens to generate per request (default: 100)
+    --max-concurrency  Maximum number of concurrent requests (default: 256)
+    --output           Output pickle file for responses (optional)
+    --pass-k           Number of inference passes per sample for pass@k strategy (default: 1)
+
+Examples:
+    # Single server
+    python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000
+
+    # Multiple servers with round-robin
+    python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002
+"""
+
+import asyncio
+import argparse
+import time
+import logging
+from typing import List, Dict, Any, Tuple
+import pandas as pd
+from tqdm import tqdm
+from transformers import AutoTokenizer
+import httpx
+from openai import AsyncOpenAI
+
+# Emit timestamped, level-tagged records at INFO and above
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Suppress verbose HTTP logs from httpx and openai (warnings still shown)
+for _noisy_name in ("httpx", "openai"):
+    logging.getLogger(_noisy_name).setLevel(logging.WARNING)
+
+MODEL_NAME = "openai/gpt-oss-120b"
+tokenizer = None  # module-level cache, filled in by get_tokenizer()
+
+
+def get_tokenizer():
+    """Return the shared tokenizer, loading it from MODEL_NAME on first use."""
+    global tokenizer
+    if tokenizer is not None:
+        return tokenizer
+    logger.info(f"Loading tokenizer for {MODEL_NAME}...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    logger.info("Tokenizer loaded successfully")
+    return tokenizer
+
+
+class TRTLLMClient:
+    """Async client for TensorRT-LLM servers exposing OpenAI-style completions.
+
+    One AsyncOpenAI client is built per server URL and requests are handed out
+    round-robin; a single semaphore bounds the number of in-flight requests.
+    """
+
+    def __init__(self,
+                 server_urls: List[str] = None,
+                 temperature: float = 0.001,
+                 top_k: int = 1,
+                 top_p: float = 1.0,
+                 max_concurrency: int = 256,
+                 timeout: int = 1200):
+        """Record settings only; call initialize() to create the HTTP clients.
+
+        Args:
+            server_urls: "host:port" entries (default: ["localhost:8000"]).
+            temperature: Sampling temperature sent with every request.
+            top_k: Top-k sent through extra_body; 0 leaves the field out.
+            top_p: top_p value sent with every request.
+            max_concurrency: Cap on simultaneously in-flight requests.
+            timeout: Timeout given to both httpx and the OpenAI client.
+        """
+        if server_urls is None:
+            server_urls = ["localhost:8000"]
+        self.server_urls = server_urls
+        self.num_servers = len(server_urls)
+        self.current_server_index = 0  # next slot for round-robin
+
+        self.temperature = temperature
+        self.top_k = top_k
+        self.top_p = top_p
+        self.max_concurrency = max_concurrency
+        self.timeout = timeout
+        self.model_name = MODEL_NAME
+
+        # Filled in by initialize(); one entry per server URL.
+        self.http_clients = []
+        self.async_clients = []
+        self.concurrency_semaphore = None
+
+        logger.info(
+            f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}")
+
+    async def initialize(self):
+        """Create the concurrency semaphore and one HTTP/OpenAI client per server."""
+        self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency)
+
+        for server_url in self.server_urls:
+            # Connection pool is sized at twice the concurrency cap, leaving
+            # headroom above the number of requests the semaphore admits.
+            pool_limits = httpx.Limits(
+                max_keepalive_connections=self.max_concurrency * 2,
+                max_connections=self.max_concurrency * 2,
+            )
+            http_client = httpx.AsyncClient(
+                timeout=httpx.Timeout(self.timeout),
+                limits=pool_limits,
+                http2=True
+            )
+            async_client = AsyncOpenAI(
+                api_key='dummy',  # TensorRT-LLM server doesn't require real API key
+                base_url=f"http://{server_url}/v1/",
+                timeout=self.timeout,
+                max_retries=10,
+                http_client=http_client,
+            )
+            self.http_clients.append(http_client)
+            self.async_clients.append(async_client)
+
+        logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)")
+
+    def _get_next_client(self) -> AsyncOpenAI:
+        """Return the current client and advance the round-robin cursor."""
+        client = self.async_clients[self.current_server_index]
+        self.current_server_index = (self.current_server_index + 1) % self.num_servers
+        return client
+
+    async def send_request(
+            self, prompt: str, max_tokens: int = 100,
+            sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]:
+        """Send one completion request to the next server in the rotation.
+
+        Args:
+            prompt: Text prompt to send
+            max_tokens: Maximum tokens to generate
+            sample_id: Sample identifier
+            pass_num: Pass number for pass@k strategy
+
+        Returns:
+            Tuple of (sample_id, pass_num, response, latency). On success the
+            response holds "output_ids", "text" and "meta_info"; on failure it
+            is {"error": <message>} and latency is None. Latency covers only
+            the awaited completions call, not time spent waiting for a
+            concurrency slot.
+        """
+        # TensorRT-LLM specific parameters travel in extra_body.
+        extra_body = {"min_tokens": 1}
+        # Only include top_k if it's not 0 (so it can default to None on server
+        # side)
+        if self.top_k != 0:
+            extra_body["top_k"] = self.top_k
+
+        gen_params = {
+            "model": self.model_name,
+            "prompt": prompt,
+            "max_tokens": max_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "stream": False,
+            "extra_body": extra_body,
+        }
+
+        try:
+            client = self._get_next_client()
+
+            async with self.concurrency_semaphore:
+                # Start the clock only once a slot is held; otherwise queued
+                # requests would report their wait time as request latency.
+                start_time = time.perf_counter()
+                completion = await client.completions.create(**gen_params)
+                latency = time.perf_counter() - start_time
+
+            response_text = completion.choices[0].text
+
+            # Re-tokenize the returned text to obtain output_ids, giving the
+            # response the same shape as the SGLang format.
+            output_ids = get_tokenizer().encode(
+                response_text, add_special_tokens=False)
+            response = {
+                "output_ids": output_ids,
+                "text": response_text,
+                "meta_info": {
+                    "completion_tokens": len(output_ids),
+                }
+            }
+            return sample_id, pass_num, response, latency
+
+        except Exception as e:
+            # Best effort: a failed request is reported in-band so the rest of
+            # the batch keeps running.
+            logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}")
+            return sample_id, pass_num, {"error": str(e)}, None
+
+    async def shutdown(self):
+        """Close every httpx client held in self.http_clients."""
+        for http_client in self.http_clients:
+            if http_client:
+                await http_client.aclose()
+
+
+def load_tokenized_data(data_file: str) -> pd.DataFrame:
+    """Load and validate the pickle file produced by harmony-tokens.py.
+
+    Returns the DataFrame unchanged. Raises ValueError if the 'text_input'
+    column is absent and AssertionError if any 'text_input' entry is missing.
+    NOTE: the file is unpickled, so only load files from a trusted source.
+    """
+    logger.info(f"Loading data from {data_file}")
+    df = pd.read_pickle(data_file)
+    logger.info(f"Loaded DataFrame with shape: {df.shape}")
+
+    if 'text_input' not in df.columns:
+        logger.error("No 'text_input' column found in DataFrame")
+        raise ValueError("DataFrame must contain 'text_input' column")
+
+    failed_mask = df['text_input'].isna()
+    failed_count = failed_mask.sum()
+    if failed_count > 0:
+        failed_indices = df[failed_mask].index.unique()
+        error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}"
+        logger.error(error_msg)
+        raise AssertionError(error_msg)
+
+    if df.empty:
+        # Nothing to inspect; df.iloc[0] would raise IndexError on an empty frame.
+        logger.warning("Input DataFrame contains no samples")
+    elif isinstance(df.iloc[0]['text_input'], str):
+        first_text = df.iloc[0]['text_input']
+        logger.info(f"First sample text length: {len(first_text)} characters")
+    else:
+        logger.warning(
+            "text_input column exists but first sample is not a string")
+
+    logger.info(f"All {len(df)} samples have valid text_input")
+    return df
+
+
+async def send_requests_async(
+        tokenized_df: pd.DataFrame, server_urls: List[str],
+        max_tokens: int = 100, max_concurrency: int = 256,
+        temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0,
+        timeout: int = 1200, pass_k: int = 1):
+    """Send pass_k requests per sample to the TensorRT-LLM server(s) concurrently.
+
+    Args:
+        tokenized_df: DataFrame with a 'text_input' column (index = sample ids)
+        server_urls: List of server URLs for round-robin load balancing
+        max_tokens: Maximum tokens to generate per request
+        max_concurrency: Maximum number of in-flight requests
+        temperature: Sampling temperature
+        top_k: Top-k for sampling
+        top_p: Top-p for sampling
+        timeout: Timeout handed to the client
+        pass_k: Number of inference passes per sample for pass@k strategy
+
+    Returns:
+        tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency
+    """
+    num_samples = len(tokenized_df)
+    total_requests = num_samples * pass_k
+    logger.info(
+        f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...")
+
+    client = TRTLLMClient(
+        server_urls=server_urls,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        max_concurrency=max_concurrency,
+        timeout=timeout
+    )
+    await client.initialize()
+
+    responses_by_pass = {}
+    latencies_by_pass = {}
+    try:
+        # One coroutine per (sample, pass); nothing runs until awaited below.
+        tasks = [
+            client.send_request(
+                row['text_input'],
+                max_tokens=max_tokens,
+                sample_id=idx,
+                pass_num=pass_num)
+            for idx, row in tokenized_df.iterrows()
+            for pass_num in range(pass_k)
+        ]
+        start_time = time.time()
+
+        # Collect results in completion order while driving the progress bar.
+        for coro in tqdm(
+                asyncio.as_completed(tasks),
+                total=len(tasks),
+                desc="Sending requests",
+                unit="request"):
+            sample_id, pass_num, response, latency = await coro
+            responses_by_pass[(sample_id, pass_num)] = response
+            latencies_by_pass[(sample_id, pass_num)] = latency
+
+        total_time = time.time() - start_time
+    finally:
+        # Always release the HTTP connections, even if a request is cancelled
+        # or an unexpected error escapes the loop.
+        await client.shutdown()
+
+    logger.info(
+        f"Completed {total_requests} requests in {total_time:.2f} seconds")
+    rate = total_requests / total_time if total_time > 0 else 0.0
+    logger.info(f"Average rate: {rate:.2f} requests/sec")
+
+    valid_latencies = [
+        lat for lat in latencies_by_pass.values() if lat is not None]
+    if valid_latencies:
+        avg_latency = sum(valid_latencies) / len(valid_latencies)
+        logger.info(
+            f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min(valid_latencies):.3f}s, Max: {max(valid_latencies):.3f}s")
+
+    return responses_by_pass, latencies_by_pass
+
+
+def extract_response_ids(
+        responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]:
+    """Collect the generated token ids for every (sample, pass) pair.
+
+    Args:
+        responses_by_pass: Dict mapping (sample_id, pass_num) to response
+        tokenized_df: DataFrame whose index supplies the sample ids
+        pass_k: Number of passes per sample
+
+    Returns:
+        Dict mapping (sample_id, pass_num) to output_ids list; the list is
+        empty when the response is missing, is an error, or has no
+        'output_ids' entry.
+    """
+    logger.info("Extracting response output_ids...")
+
+    expected = len(tokenized_df) * pass_k
+    ids_by_key = {}
+
+    with tqdm(total=expected, desc="Extracting responses", unit="response") as progress:
+        for sample_idx, _ in tokenized_df.iterrows():
+            for k in range(pass_k):
+                key = (sample_idx, k)
+                reply = responses_by_pass.get(key, {})
+                usable = "error" not in reply and "output_ids" in reply
+                try:
+                    ids_by_key[key] = reply["output_ids"] if usable else []
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract response for sample {sample_idx}, pass {k}: {e}")
+                    ids_by_key[key] = []
+                progress.update(1)
+
+    logger.info("Response output_ids extraction complete")
+    return ids_by_key
+
+
+def detokenize_output_ids(
+        response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]:
+    """Decode each list of output token ids back into text.
+
+    Args:
+        response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids
+        pass_k: Number of passes per sample (not used by the decoding itself)
+
+    Returns:
+        Dict mapping (sample_id, pass_num) to detokenized text; entries that
+        fail to decode map to an empty string.
+    """
+    logger.info("Detokenizing output_ids to text...")
+
+    decoder = get_tokenizer()
+    texts = {}
+    for key, ids in tqdm(
+            response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"):
+        sample_id, pass_num = key
+        try:
+            # Special tokens are skipped so only the plain text remains
+            texts[key] = decoder.decode(ids, skip_special_tokens=True)
+        except Exception as e:
+            logger.warning(
+                f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}")
+            texts[key] = ""
+
+    logger.info("Output detokenization complete")
+    return texts
+
+
+def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]],
+                   response_ids_by_pass: Dict[tuple, List[int]],
+                   detokenized_texts_by_pass: Dict[tuple, str],
+                   latencies_by_pass: Dict[tuple, float],
+                   tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame:
+    """Attach per-pass result columns to a copy of the input DataFrame.
+
+    For every pass i the following columns are added:
+        model_output_i: detokenized text ("" when absent)
+        tok_model_output_i: output token ids ([] when absent)
+        tok_model_output_len_i: number of output token ids
+        infer_time_i: latency recorded for the request (None when absent)
+
+    Args:
+        responses_by_pass: Dict mapping (sample_id, pass_num) to response
+        response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids
+        detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text
+        latencies_by_pass: Dict mapping (sample_id, pass_num) to latency
+        tokenized_df: Original DataFrame with samples
+        pass_k: Number of passes per sample
+        output_file: Optional output pickle file
+
+    Returns:
+        DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...);
+        it is also pickled to output_file when one is given
+    """
+    logger.info("Processing responses and updating DataFrame...")
+
+    # Work on a copy so the caller's DataFrame is left untouched
+    result_df = tokenized_df.copy()
+    sample_ids = list(tokenized_df.index)
+
+    # One group of four columns per pass, suffixed _0, _1, _2, ...
+    for pass_num in range(pass_k):
+        keys = [(idx, pass_num) for idx in sample_ids]
+        # Reused for both the ids column and its length column
+        ids_column = [response_ids_by_pass.get(key, []) for key in keys]
+
+        result_df[f'model_output_{pass_num}'] = [
+            detokenized_texts_by_pass.get(key, "") for key in keys]
+        result_df[f'tok_model_output_{pass_num}'] = ids_column
+        result_df[f'tok_model_output_len_{pass_num}'] = [
+            len(ids) for ids in ids_column]
+        result_df[f'infer_time_{pass_num}'] = [
+            latencies_by_pass.get(key, None) for key in keys]
+
+    # Output lengths are gathered only for the summary log line below; the
+    # response's meta_info completion_tokens wins over len(output_ids)
+    all_output_token_lengths = []
+    for idx in sample_ids:
+        for pass_num in range(pass_k):
+            key = (idx, pass_num)
+            ids = response_ids_by_pass.get(key, [])
+            try:
+                meta = responses_by_pass.get(key, {}).get("meta_info", {})
+                token_count = meta.get("completion_tokens", len(ids))
+            except Exception as e:
+                # Malformed response: fall back to the raw id count
+                logger.warning(
+                    f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}")
+                token_count = len(ids)
+            all_output_token_lengths.append(token_count)
+
+    # Summary logging
+    logger.info(f"Updated DataFrame with shape: {result_df.shape}")
+    new_columns = [
+        f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}'
+        for i in range(pass_k)
+    ]
+    logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}")
+    if all_output_token_lengths:
+        mean_len = sum(all_output_token_lengths) / len(all_output_token_lengths)
+        logger.info(f"Average output token length: {mean_len:.1f}")
+
+    # Persist only when a destination was given
+    if output_file:
+        logger.info(f"Saving responses to {output_file}...")
+        result_df.to_pickle(output_file)
+        logger.info(f"Responses saved to {output_file}")
+
+    return result_df
+
+
+async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[str],
+                                 max_samples: int = None, max_tokens: int = 100,
+                                 max_concurrency: int = 256, output_file: str = None,
+                                 temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0,
+                                 timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame:
+    """Run the full pipeline: send requests, extract ids, detokenize, save.
+
+    Args:
+        tokenized_df: DataFrame with a 'text_input' column
+        server_urls: List of server URLs for round-robin load balancing
+        max_samples: Keep only the first max_samples rows when not None
+        max_tokens: Maximum tokens to generate per request
+        max_concurrency: Maximum number of concurrent requests
+        output_file: Optional pickle path for the result DataFrame
+        temperature: Sampling temperature
+        top_k: Top-k for sampling
+        top_p: Top-p for sampling
+        timeout: Timeout for requests
+        pass_k: Number of inference passes per sample for pass@k strategy
+
+    Returns:
+        DataFrame returned by save_responses
+    """
+    # Step 1: optionally truncate the dataset
+    if max_samples is not None:
+        tokenized_df = tokenized_df.head(max_samples)
+        logger.info(f"Limited to first {max_samples} samples")
+
+    # Step 2: fire all requests (pass_k per sample) and gather raw responses
+    responses_by_pass, latencies_by_pass = await send_requests_async(
+        tokenized_df, server_urls,
+        max_tokens=max_tokens,
+        max_concurrency=max_concurrency,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        timeout=timeout,
+        pass_k=pass_k)
+
+    # Step 3: pull output_ids out of each response
+    ids_by_pass = extract_response_ids(responses_by_pass, tokenized_df, pass_k)
+    # Step 4: decode those ids into text for the model_output columns
+    texts_by_pass = detokenize_output_ids(ids_by_pass, pass_k)
+
+    # Step 5: assemble the result DataFrame (and pickle it if requested)
+    return save_responses(
+        responses_by_pass, ids_by_pass, texts_by_pass, latencies_by_pass,
+        tokenized_df, pass_k, output_file)
+
+
+def main():
+    """CLI entry point: probe the server(s), run inference, log a summary."""
+    parser = argparse.ArgumentParser(
+        description="Send text prompts to TensorRT-LLM server via OpenAI completions endpoint")
+    parser.add_argument("--input-tokens", required=True,
+                        help="Path to pickle file containing data with text_input column from harmony-tokens.py")
+    parser.add_argument("--server-url", default="localhost:8000",
+                        help="TensorRT-LLM server URL(s) - comma-separated for round-robin load balancing (default: localhost:8000)")
+    parser.add_argument("--max-samples", type=int, default=None,
+                        help="Maximum number of samples to process (default: all)")
+    parser.add_argument("--max-tokens", type=int, default=100,
+                        help="Maximum tokens to generate per request")
+    parser.add_argument("--max-concurrency", type=int, default=256,
+                        help="Maximum number of concurrent requests (default: 256)")
+    parser.add_argument("--output", default=None,
+                        help="Output pickle file for responses (optional)")
+    parser.add_argument("--pass-k", type=int, default=1,
+                        help="Number of inference passes per sample for pass@k strategy (default: 1)")
+    parser.add_argument("--temperature", type=float, default=0.001,
+                        help="Temperature for sampling (default: 0.001)")
+    parser.add_argument("--top-k", type=int, default=1,
+                        help="Top-k for sampling (default: 1)")
+    parser.add_argument("--top-p", type=float, default=1.0,
+                        help="Top-p for sampling (default: 1.0)")
+    parser.add_argument("--timeout", type=int, default=1200,
+                        help="Timeout for requests (default: 1200)")
+
+    args = parser.parse_args()
+
+    # Parse comma-separated server URLs
+    server_urls = [url.strip() for url in args.server_url.split(',')]
+    logger.info(
+        f"Configured {len(server_urls)} server(s) for round-robin load balancing")
+
+    async def test_connection():
+        """Send one tiny request to every server; return True if all answer."""
+        logger.info(f"Testing server connection(s)...")
+        client = TRTLLMClient(
+            server_urls=server_urls,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+            max_concurrency=1,
+            timeout=args.timeout
+        )
+        await client.initialize()
+
+        try:
+            # Round-robin: one request per URL reaches each server once
+            for server_url in server_urls:
+                _, _, test_response, _ = await client.send_request(
+                    prompt="Test", max_tokens=5, sample_id=0, pass_num=0)
+                if "error" in test_response:
+                    logger.error(
+                        f"Server connection failed ({server_url}): {test_response['error']}")
+                    logger.error(
+                        "Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.")
+                    return False
+            logger.info("Server connection successful")
+            return True
+        finally:
+            await client.shutdown()
+
+    # A failed connection test is a failed run: exit non-zero so wrappers notice
+    if not asyncio.run(test_connection()):
+        raise SystemExit(1)
+
+    tokenized_df = load_tokenized_data(args.input_tokens)
+
+    result_df = asyncio.run(process_requests_async(
+        tokenized_df, server_urls,
+        max_samples=args.max_samples,
+        max_tokens=args.max_tokens,
+        max_concurrency=args.max_concurrency,
+        output_file=args.output,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        timeout=args.timeout,
+        pass_k=args.pass_k))
+
+    # Print summary
+    logger.info(f"\nProcessing completed:")
+    logger.info(f"  - Total samples processed: {len(result_df)}")
+    logger.info(f"  - Number of passes per sample: {args.pass_k}")
+    # tok_input_len is not validated on load; skip the stat rather than crash
+    if 'tok_input_len' in result_df.columns:
+        logger.info(
+            f"  - Average input token length: {result_df['tok_input_len'].mean():.1f}")
+
+    if args.pass_k == 1:
+        avg_output_len = result_df['tok_model_output_len_0'].mean()
+        logger.info(f"  - Average output token length: {avg_output_len:.1f}")
+    else:
+        all_output_lens = [
+            length for i in range(args.pass_k)
+            for length in result_df[f'tok_model_output_len_{i}'].tolist()]
+        avg_output_len = sum(all_output_lens) / len(all_output_lens) if all_output_lens else 0
+        logger.info(
+            f"  - Average output token length (across all passes): {avg_output_len:.1f}")
+
+    if args.output:
+        logger.info(f"  - Results saved to: {args.output}")
+    else:
+        logger.info("  - Results returned as DataFrame (not saved to file)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py
new file mode 100755
index 0000000000..86e588f2c0
--- /dev/null
+++ b/language/gpt-oss/run_mlperf.py
@@ -0,0 +1,505 @@
+#!/usr/bin/env python3
+"""MLPerf inference benchmark runner for gpt-oss.
+
+This script integrates the gpt-oss model with MLPerf LoadGen for
+performance and accuracy benchmarking.
+
+Usage:
+    # Offline scenario (performance)
+    python run_mlperf.py --scenario offline --input-file data/accuracy_eval_tokenized.pkl
+
+    # Server scenario (performance)
+    python run_mlperf.py --scenario server --input-file data/accuracy_eval_tokenized.pkl
+
+    # Accuracy mode
+    python run_mlperf.py --scenario offline --accuracy --input-file data/accuracy_eval_tokenized.pkl
+"""
+
+import argparse
+import json
+import logging
+import os
+import sys
+import threading
+from pathlib import Path
+from typing import Optional, Dict, Any
+
+import mlperf_loadgen as lg
+import pandas as pd
+from tqdm import tqdm
+
+from backends import SGLangBackend
+from mlperf import OfflineSUT, ServerSUT, QuerySampleLibrary
+from utils import load_tokenized_dataset, StandardTokenizer
+
+# Turn off the tokenizers library's parallelism to avoid forking issues
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Log at INFO level with timestamp, logger name and severity on every record
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def load_generation_config(config_path: str) -> Dict[str, Any]:
+    """Load generation configuration from JSON file.
+
+    Args:
+        config_path: Path to generation_config.json
+
+    Returns:
+        Dictionary with generation parameters; keys starting with '_' are
+        treated as comments and dropped
+    """
+    logger.info(f"Loading generation config from {config_path}")
+
+    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
+    with open(config_path, 'r', encoding='utf-8') as f:
+        config = json.load(f)
+
+    # Filter out comment fields (starting with _)
+    return {k: v for k, v in config.items() if not k.startswith('_')}
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for the MLPerf gpt-oss runner."""
+    parser = argparse.ArgumentParser(
+        description="Run MLPerf inference benchmarks for gpt-oss")
+    add = parser.add_argument
+
+    # Which LoadGen scenario to run
+    add(
+        "--scenario",
+        help="MLPerf scenario (offline or server)",
+        choices=["offline", "server"],
+        default="offline",
+        type=str,
+    )
+
+    # Input data
+    add(
+        "--input-file",
+        help="Path to tokenized dataset (pickle file)",
+        required=True,
+        type=str,
+    )
+
+    add(
+        "--max-samples",
+        help="Maximum number of samples to use (None for all)",
+        default=None,
+        type=int,
+    )
+
+    # LoadGen config files. NOTE(review): --mlperf-conf defaults to a developer-local path
+    add(
+        "--mlperf-conf",
+        help="Path to MLPerf configuration file",
+        default="/home/scratch.shobhitv_coreai/mlcinf-repos/gpt-oss-perf/loadgen/mlperf.conf",
+        type=str,
+    )
+
+    add(
+        "--user-conf",
+        help="Path to user configuration file",
+        default="mlperf/user.conf",
+        type=str,
+    )
+
+    add(
+        "--accuracy",
+        help="Run accuracy mode instead of performance",
+        action="store_true",
+    )
+
+    # Where logs are written
+    add(
+        "--output-dir",
+        help="Directory for MLPerf output logs",
+        default="mlperf_results",
+        type=str,
+    )
+
+    # Inference backend
+    add(
+        "--backend",
+        help="Backend to use for inference",
+        choices=["sglang"],
+        default="sglang",
+        type=str,
+    )
+
+    add(
+        "--server-url",
+        help="Server URL for backend (SGLang)",
+        default="http://localhost:30000",
+        type=str,
+    )
+
+    # Generation parameters
+    add(
+        "--generation-config",
+        help="Path to generation configuration JSON file",
+        default="generation_config.json",
+        type=str,
+    )
+
+    add(
+        "--max-new-tokens",
+        help="Override max_new_tokens from generation config (default: use value from config)",
+        default=None,
+        type=int,
+    )
+
+    # Only relevant to the server scenario
+    add(
+        "--num-workers",
+        help="Number of worker threads (for server scenario)",
+        default=1,
+        type=int,
+    )
+
+    # Cap on requests in flight
+    add(
+        "--max-concurrency",
+        help="Maximum concurrent requests to backend (SGLang handles batching internally)",
+        default=128,
+        type=int,
+    )
+
+    return parser
+
+
+def configure_loadgen(
+    scenario: str,
+    accuracy_mode: bool,
+    mlperf_conf: Optional[str] = None,
+    user_conf: Optional[str] = None,
+    log_dir: Optional[str] = None,
+    model_name: str = "gpt-oss-120b"
+) -> lg.TestSettings:
+    """Build the LoadGen TestSettings for one benchmark run.
+
+    Args:
+        scenario: MLPerf scenario ("offline" or "server", case-insensitive)
+        accuracy_mode: True for AccuracyOnly mode, False for PerformanceOnly
+        mlperf_conf: Path to MLPerf config file (skipped with a warning if absent)
+        user_conf: Path to user config file (skipped with a warning if absent)
+        log_dir: Directory for logs (accepted but not used by this function)
+        model_name: Model name passed to FromConfig
+
+    Returns:
+        LoadGen TestSettings
+
+    Raises:
+        ValueError: If scenario is neither "offline" nor "server"
+    """
+    settings = lg.TestSettings()
+
+    known_scenarios = {
+        "offline": lg.TestScenario.Offline,
+        "server": lg.TestScenario.Server,
+    }
+    scenario_key = scenario.lower()
+    if scenario_key not in known_scenarios:
+        raise ValueError(f"Unknown scenario: {scenario}")
+    settings.scenario = known_scenarios[scenario_key]
+
+    settings.mode = (
+        lg.TestMode.AccuracyOnly if accuracy_mode else lg.TestMode.PerformanceOnly)
+
+    # conf_type: 2 = mlperf.conf, 1 = user.conf. LoadGen tracks config calls
+    # and only allows one user.conf for official submissions
+    section = scenario.capitalize()
+    if not (mlperf_conf and Path(mlperf_conf).exists()):
+        logger.warning(f"MLPerf config not found: {mlperf_conf}")
+    else:
+        logger.debug(f"Loading MLPerf config from {mlperf_conf}")
+        settings.FromConfig(mlperf_conf, model_name, section, 2)
+
+    if not (user_conf and Path(user_conf).exists()):
+        logger.warning(f"User config not found: {user_conf}")
+    else:
+        logger.debug(f"Loading user config from {user_conf}")
+        settings.FromConfig(user_conf, model_name, section, 1)
+
+    return settings
+
+
+def main():
+    """Parse CLI arguments, run the LoadGen benchmark, and clean up."""
+    parser = create_argument_parser()
+    args = parser.parse_args()
+
+    # Track resources for cleanup
+    sut = None
+    qsl = None
+    backend = None
+    pbar = None
+    cleanup_done = False
+
+    def do_cleanup():
+        """Perform cleanup once and only once."""
+        nonlocal cleanup_done, pbar, sut, qsl, backend
+
+        if cleanup_done:
+            return
+        cleanup_done = True
+
+        logger.info("Performing cleanup...")
+
+        # 1. Close progress bar first (before any LoadGen cleanup)
+        try:
+            if pbar is not None:
+                pbar.close()
+                pbar = None
+                logger.debug("  ✓ Progress bar closed")
+        except Exception as e:
+            logger.debug(f"  ! Error closing progress bar: {e}")
+
+        # Small delay to let LoadGen internal threads finish
+        import time
+        time.sleep(0.5)
+
+        # 2. Stop SUT (this will stop worker threads and flush)
+        try:
+            if sut is not None:
+                logger.info("  - Stopping SUT and worker threads...")
+                sut.stop()
+                sut = None
+                logger.info("    ✓ SUT stopped")
+        except Exception as e:
+            logger.warning(f"    ! Error stopping SUT: {e}")
+
+        # 3. Destroy QSL
+        try:
+            if qsl is not None and qsl.qsl is not None:
+                logger.info("  - Destroying Query Sample Library...")
+                lg.DestroyQSL(qsl.qsl)
+                qsl.qsl = None
+                logger.info("    ✓ QSL destroyed")
+        except Exception as e:
+            logger.warning(f"    ! Error destroying QSL: {e}")
+
+        # 4. Cleanup backend last
+        try:
+            if backend is not None and backend.initialized:
+                logger.info("  - Cleaning up backend connection...")
+                backend.cleanup()
+                backend = None
+                logger.info("    ✓ Backend cleaned up")
+        except Exception as e:
+            logger.warning(f"    ! Error cleaning up backend: {e}")
+
+    try:
+        # Create output directories
+        output_dir = Path(args.output_dir)
+        log_dir = output_dir / args.scenario / \
+            ("accuracy" if args.accuracy else "performance")
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.info("=" * 80)
+        logger.info("MLPerf Inference Benchmark Runner for GPT-OSS")
+        logger.info("=" * 80)
+        logger.info(f"Backend: {args.backend}")
+        logger.info(f"Scenario: {args.scenario}")
+        logger.info(f"Accuracy: {args.accuracy}")
+        logger.info(f"Input file: {args.input_file}")
+        logger.info(f"Output directory: {log_dir}")
+        logger.info("=" * 80)
+
+        # Load dataset (own bar name; 'pbar' is the main bar closed in cleanup)
+        logger.debug("Loading tokenized dataset...")
+        with tqdm(total=1, desc="Loading dataset", unit="file") as load_pbar:
+            dataset_info = load_tokenized_dataset(
+                args.input_file,
+                max_samples=args.max_samples
+            )
+            prompts = dataset_info["prompts"]
+            df = dataset_info["dataframe"]
+            load_pbar.update(1)
+
+        logger.info(f"Loaded {len(prompts)} prompts from dataset")
+
+        # Load generation configuration
+        logger.info("Loading generation configuration...")
+        gen_config = load_generation_config(args.generation_config)
+
+        # Extract generation parameters with defaults
+        # CLI override takes precedence over config file
+        if args.max_new_tokens is not None:
+            max_tokens = args.max_new_tokens
+            logger.info(
+                f"Using max_new_tokens from CLI override: {max_tokens}")
+        else:
+            max_tokens = gen_config.get('max_new_tokens', 10240)
+            logger.info(f"Using max_new_tokens from config: {max_tokens}")
+
+        temperature = gen_config.get('temperature', 1.0)
+        top_k = gen_config.get('top_k', -1)
+        top_p = gen_config.get('top_p', 1.0)
+
+        logger.info("Generation parameters:")
+        logger.info(f"  max_new_tokens: {max_tokens}")
+        logger.info(f"  temperature: {temperature}")
+        logger.info(f"  top_k: {top_k}")
+        logger.info(f"  top_p: {top_p}")
+
+        # Initialize backend
+        logger.debug(f"Initializing {args.backend} backend...")
+        if args.backend == "sglang":
+            # Set pool size to match max_concurrency with small safety margin
+            # This prevents "connection pool is full" warnings
+            pool_size = int(args.max_concurrency * 1.1)  # 10% safety margin
+            backend = SGLangBackend(
+                server_url=args.server_url,
+                timeout=1200,
+                max_pool_size=pool_size
+            )
+        else:
+            raise ValueError(f"Unknown backend: {args.backend}")
+
+        # Initialize backend
+        backend.initialize()
+
+        # Create progress bar early so subsequent logs print below it
+        # Total will be dynamically updated by SUT based on actual queries from LoadGen:
+        # - Offline: Set once when all queries arrive
+        # - Server: Incremented as queries arrive
+        pbar = tqdm(
+            total=0,  # Will be updated dynamically by SUT
+            desc=f"MLPerf {args.scenario}",
+            unit="query",
+            leave=True,
+            position=0,
+            mininterval=0.1,
+            smoothing=0.1,
+            dynamic_ncols=True,
+            file=sys.stdout  # Force unbuffered output for async updates
+        )
+
+        # Create SUT with progress bar
+        logger.debug(f"Creating {args.scenario} SUT...")
+        if args.scenario == "offline":
+            sut = OfflineSUT(
+                backend=backend,
+                dataset=prompts,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                name="gpt-oss-120b_offline_sut",
+                progress_bar=pbar,
+                max_concurrency=args.max_concurrency
+            )
+        else:  # server
+            sut = ServerSUT(
+                backend=backend,
+                dataset=prompts,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                num_workers=args.num_workers,
+                name="gpt-oss-120b_server_sut",
+                progress_bar=pbar
+            )
+
+        # Create QSL
+        logger.info("Creating Query Sample Library...")
+        qsl = QuerySampleLibrary(prompts)
+        qsl.qsl = lg.ConstructQSL(
+            len(prompts),
+            len(prompts),
+            qsl.load_query_samples,
+            qsl.unload_query_samples
+        )
+
+        # Configure LoadGen
+        settings = configure_loadgen(
+            scenario=args.scenario,
+            accuracy_mode=args.accuracy,
+            mlperf_conf=args.mlperf_conf,
+            user_conf=args.user_conf,
+            log_dir=str(log_dir)
+        )
+
+        # Configure logging
+        log_settings = lg.LogSettings()
+        log_settings.log_output.outdir = str(log_dir)
+        log_settings.log_output.copy_summary_to_stdout = True
+        log_settings.enable_trace = False
+
+        # Start the SUT and run test
+        logger.info("Running LoadGen test...")
+        sut.start()
+        lg.StartTestWithLogSettings(
+            sut.sut,
+            qsl.qsl,
+            settings,
+            log_settings
+        )
+        logger.info("LoadGen test completed successfully")
+
+        # Give LoadGen a moment to finish internal cleanup
+        import time
+        time.sleep(0.2)
+
+        # Flush queries (separate bar name so the main 'pbar' stays bound)
+        logger.info("Flushing queries...")
+        with tqdm(total=1, desc="Flushing queries", unit="batch") as flush_pbar:
+            sut.flush_queries()
+            flush_pbar.update(1)
+
+        # Get results (separate bar name so the main 'pbar' stays bound)
+        logger.info("Retrieving results...")
+        with tqdm(total=1, desc="Getting results", unit="batch") as results_pbar:
+            results = sut.get_results()
+            results_pbar.update(1)
+        logger.info(f"Retrieved {len(results)} results from SUT")
+
+        logger.info(f"MLPerf results saved to: {log_dir}")
+
+        # If in accuracy mode, prompt user to run evaluation
+        if args.accuracy:
+            logger.info("=" * 80)
+            logger.info("Accuracy mode completed!")
+            logger.info("To evaluate accuracy, run:")
+            logger.info(
+                f"  python eval_accuracy.py --input-file {log_dir}/mlperf_log_accuracy.json")
+            logger.info("=" * 80)
+
+    except KeyboardInterrupt:
+        logger.info("\n" + "=" * 80)
+        logger.info("⚠️  Test interrupted by user (Ctrl+C)")
+        logger.info("=" * 80)
+        do_cleanup()
+        logger.info("=" * 80)
+        logger.info("✓ Cleanup completed successfully")
+        logger.info("=" * 80)
+        # Exit immediately to prevent finally block from running
+        os._exit(130)  # Use os._exit to skip finally block
+
+    except Exception as e:
+        logger.error("\n" + "=" * 80)
+        logger.error(f"❌ Error during test: {e}")
+        logger.error("=" * 80)
+        logger.error("Stack trace:", exc_info=True)
+        do_cleanup()
+        logger.error("=" * 80)
+        # Exit immediately to prevent finally block from running
+        os._exit(1)
+
+    finally:
+        # Only run cleanup if not already done (normal exit path)
+        if not cleanup_done:
+            do_cleanup()
+            logger.info("=" * 80)
+            logger.info("✓ Cleanup completed successfully")
+            logger.info("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/language/gpt-oss/setup.sh b/language/gpt-oss/setup.sh
new file mode 100755
index 0000000000..23188a0cbd
--- /dev/null
+++ b/language/gpt-oss/setup.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+pip install -r requirements.txt  # harness dependencies (path is relative to the CWD)
+pip install "$(git rev-parse --show-toplevel)/loadgen"  # LoadGen from this checkout
\ No newline at end of file
diff --git a/language/gpt-oss/setup_enroot.sh b/language/gpt-oss/setup_enroot.sh
new file mode 100755
index 0000000000..c534ded13e
--- /dev/null
+++ b/language/gpt-oss/setup_enroot.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Import an SGLang docker image with enroot and create a sandbox from it.
+sqsh_location="$(readlink -f "$(dirname "$0")")/sqsh_files"
+sandbox_name=sglang_v0.5.4.post2
+docker_image=lmsysorg/sglang:v0.5.4.post2
+# ${2:?...} aborts on a missing value; a failing 'shift 2' would loop forever.
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --docker_image)
+            docker_image=${2:?"--docker_image requires a value"}
+            shift 2
+            ;;
+        --sandbox_name)
+            sandbox_name=${2:?"--sandbox_name requires a value"}
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            echo "Usage: $0 --docker_image <docker_image> --sandbox_name <sandbox_name>"
+            exit 1
+            ;;
+    esac
+done
+
+mkdir -p "$sqsh_location"
+enroot import -o "$sqsh_location/$sandbox_name.sqsh" "docker://$docker_image"
+enroot create --name "$sandbox_name" "$sqsh_location/$sandbox_name.sqsh"
+# enroot start --mount $(pwd):$(pwd) --root --rw $sandbox_name
diff --git a/language/gpt-oss/sglang/run_server.sh b/language/gpt-oss/sglang/run_server.sh
new file mode 100755
index 0000000000..3fd01b6629
--- /dev/null
+++ b/language/gpt-oss/sglang/run_server.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Launch an SGLang server for gpt-oss-120b (tp=ep=1, dp configurable).
+pip install -r requirements.txt
+
+dp=1
+model_path=openai/gpt-oss-120b
+eagle_path=""
+stream_interval=500
+# ${2:?...} aborts on a missing value; a failing 'shift 2' would loop forever.
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --dp)
+            dp=${2:?"--dp requires a value"}
+            shift 2
+            ;;
+        --model_path)
+            model_path=${2:?"--model_path requires a value"}
+            shift 2
+            ;;
+        --eagle_path)
+            eagle_path=${2?"--eagle_path requires a value"}  # "" is allowed: no EAGLE
+            shift 2
+            ;;
+        --stream_interval)
+            stream_interval=${2:?"--stream_interval requires a value"}
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            exit 1
+            ;;
+    esac
+done
+# Build the argument list as an array so values containing spaces survive.
+args=(--model-path "$model_path"
+    --host 0.0.0.0
+    --port 30000
+    --tp-size=1
+    --data-parallel-size="$dp"
+    --max-running-requests $((dp * 512))
+    --mem-fraction-static 0.85
+    --chunked-prefill-size 16384
+    --ep-size=1
+    --stream-interval "$stream_interval")
+
+if [ -n "$eagle_path" ]; then
+    args+=(--speculative-draft-model-path "$eagle_path"
+        --speculative-algorithm EAGLE3)
+fi
+
+# --speculative-num-steps 1 \
+# --speculative-eagle-topk 1 \
+# --speculative-num-draft-tokens 3 \
+
+
+set -x;
+python3 -m sglang.launch_server "${args[@]}"
diff --git a/language/gpt-oss/submodules/LiveCodeBench b/language/gpt-oss/submodules/LiveCodeBench
new file mode 120000
index 0000000000..d1e5c66592
--- /dev/null
+++ b/language/gpt-oss/submodules/LiveCodeBench
@@ -0,0 +1 @@
+../../deepseek-r1/submodules/LiveCodeBench
\ No newline at end of file
diff --git a/language/gpt-oss/submodules/prm800k b/language/gpt-oss/submodules/prm800k
new file mode 120000
index 0000000000..1b078c3842
--- /dev/null
+++ b/language/gpt-oss/submodules/prm800k
@@ -0,0 +1 @@
+../../deepseek-r1/submodules/prm800k
\ No newline at end of file
diff --git a/language/gpt-oss/utils/__init__.py b/language/gpt-oss/utils/__init__.py
new file mode 100644
index 0000000000..9b3b53963d
--- /dev/null
+++ b/language/gpt-oss/utils/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+"""Utilities for gpt-oss MLPerf integration."""
+
+from .tokenization import StandardTokenizer, load_tokenized_dataset
+
+__all__ = [
+    "StandardTokenizer",
+    "load_tokenized_dataset",
+]
diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py
new file mode 100644
index 0000000000..a64d77a5f9
--- /dev/null
+++ b/language/gpt-oss/utils/tokenization.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Tokenization utilities for gpt-oss."""
+
+import logging
+from typing import List, Dict, Any, Optional
+import pandas as pd
+from transformers import AutoTokenizer
+
+logger = logging.getLogger(__name__)
+
+MODEL_NAME = "openai/gpt-oss-120b"
+
+
+class StandardTokenizer:
+    """Lazily loaded HuggingFace tokenizer for the gpt-oss model."""
+
+    def __init__(self, model_name: str = MODEL_NAME):
+        """Record the model to use; the tokenizer is loaded on first use.
+
+        Args:
+            model_name: HuggingFace model name or local path
+        """
+        logger.info(f"Initializing tokenizer for {model_name}")
+        self.tokenizer = None
+        self.model_name = model_name
+
+    def load(self) -> None:
+        """Load the tokenizer unless one is already loaded."""
+        if self.tokenizer is not None:
+            return
+        logger.info(f"Loading tokenizer from {self.model_name}")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        logger.info("Tokenizer loaded successfully")
+
+    def encode(self, text: str) -> List[int]:
+        """Convert text into token IDs, loading the tokenizer on demand.
+
+        Args:
+            text: Text to tokenize
+
+        Returns:
+            Token IDs produced by the underlying tokenizer
+        """
+        self.load()
+        return self.tokenizer.encode(text)
+
+    def decode(self, token_ids: List[int],
+               skip_special_tokens: bool = True) -> str:
+        """Convert token IDs back into text, loading the tokenizer on demand.
+
+        Args:
+            token_ids: Token IDs to decode
+            skip_special_tokens: Whether to skip special tokens
+
+        Returns:
+            The decoded string
+        """
+        self.load()
+        decoded = self.tokenizer.decode(
+            token_ids, skip_special_tokens=skip_special_tokens)
+        return decoded
+
+    def __call__(self, text: str) -> List[int]:
+        """Make the instance callable as a shorthand for encode().
+
+        Args:
+            text: Text to tokenize
+
+        Returns:
+            Token IDs, exactly as returned by encode()
+        """
+        return self.encode(text)
+
+
+def load_tokenized_dataset(
+    dataset_path: str,
+    max_samples: Optional[int] = None
+) -> Dict[str, Any]:
+    """Load pre-tokenized prompts from a pickled pandas DataFrame.
+
+    Args:
+        dataset_path: Path to the pickle file containing tokenized data
+        max_samples: Maximum number of samples to load (None for all)
+
+    Returns:
+        Dictionary containing:
+            - prompts: List of tokenized prompts (the 'tok_input' column)
+            - dataframe: The (possibly truncated) DataFrame
+            - metadata: num_samples and min/max/mean prompt length
+
+    Raises:
+        ValueError: On a missing/invalid 'tok_input' column or no samples
+    """
+    logger.info(f"Loading tokenized dataset from {dataset_path}")
+
+    # NOTE: unpickling executes arbitrary code; only load trusted datasets.
+    df = pd.read_pickle(dataset_path)
+    logger.info(f"Loaded DataFrame with shape: {df.shape}")
+
+    if 'tok_input' not in df.columns:
+        raise ValueError(
+            "Dataset must have 'tok_input' column with tokenized prompts")
+
+    if max_samples is not None:
+        df = df.head(max_samples)
+        logger.info(f"Limited to {max_samples} samples")
+
+    # Verify tokenization
+    failed_mask = df['tok_input'].isna()
+    if failed_mask.any():
+        failed_count = failed_mask.sum()
+        logger.error(f"Found {failed_count} samples with failed tokenization")
+        raise ValueError(f"{failed_count} samples have invalid tokenization")
+
+    prompts = df['tok_input'].tolist()
+    if not prompts:
+        # Fail with a clear message instead of min()/max() on an empty list.
+        raise ValueError(f"No samples loaded from {dataset_path}")
+    logger.info(f"Loaded {len(prompts)} tokenized prompts")
+
+    # Compute the length statistics once; they are logged and returned.
+    prompt_lengths = [len(p) for p in prompts]
+    metadata = {
+        "num_samples": len(prompts),
+        "min_length": min(prompt_lengths),
+        "max_length": max(prompt_lengths),
+        "mean_length": sum(prompt_lengths) / len(prompt_lengths)
+    }
+    logger.info(
+        f"Prompt length stats - min: {metadata['min_length']}, "
+        f"max: {metadata['max_length']}, "
+        f"mean: {metadata['mean_length']:.1f}"
+    )
+
+    return {"prompts": prompts, "dataframe": df, "metadata": metadata}
diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc
index c1abea9d14..4c5ca66f0c 100644
--- a/loadgen/issue_query_controller.cc
+++ b/loadgen/issue_query_controller.cc
@@ -459,8 +459,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride,
 #if USE_NEW_LOGGING_FORMAT
             std::stringstream ss;
             ss << "IssueQueryThread " << thread_idx
-               << " Ending early: Too many outstanding queries." << " issued "
-               << queries_issued_total << " outstanding "
+               << " Ending early: Too many outstanding queries."
+               << " issued " << queries_issued_total << " outstanding "
                << queries_outstanding;
             MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
 #else
@@ -499,8 +499,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride,
 #if USE_NEW_LOGGING_FORMAT
         std::stringstream ss;
         ss << "IssueQueryThread " << thread_idx
-           << " Ending early: Max query count reached." << " query_count "
-           << queries_issued;
+           << " Ending early: Max query count reached."
+           << " query_count " << queries_issued;
         MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
 #else
         detail.Error("IssueQueryThread ", std::to_string(thread_idx),
@@ -519,8 +519,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride,
 #if USE_NEW_LOGGING_FORMAT
         std::stringstream ss;
         ss << "IssueQueryThread " << thread_idx
-           << " Ending early: Max test duration reached." << " duration_ns "
-           << duration.count();
+           << " Ending early: Max test duration reached."
+           << " duration_ns " << duration.count();
         MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
 #else
         detail.Error("IssueQueryThread ", std::to_string(thread_idx),
diff --git a/loadgen/logging.cc b/loadgen/logging.cc
index 807c1954a8..d7e83e54b9 100644
--- a/loadgen/logging.cc
+++ b/loadgen/logging.cc
@@ -812,7 +812,8 @@ void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) {
   if (max_entry_vector_size > kTlsLogReservedEntryCount) {
 #if USE_NEW_LOGGING_FORMAT
     std::stringstream msg;
-    msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid()
+    msg << "Logging allocation detected:"
+        << " tid: " << tls_logger->Tid()
         << " reserved_entries: " << kTlsLogReservedEntryCount
         << " max_entries: " << max_entry_vector_size;
     MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str());
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf
index 1b825514bd..af5b9f81a7 100644
--- a/loadgen/mlperf.conf
+++ b/loadgen/mlperf.conf
@@ -25,6 +25,7 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000
 rgat.*.performance_sample_count_override = 788379
 pointpainting.*.performance_sample_count_override = 1024
 deepseek-r1.*.performance_sample_count_override = 4388
+gpt-oss-120b.*.performance_sample_count_override = 8036
 whisper.*.performance_sample_count_override = 1633
 # set to 0 to let entire sample set to be performance sample
 3d-unet.*.performance_sample_count_override = 0
@@ -66,6 +67,7 @@ llama3_1-8b.*.sample_concatenate_permutation = 1
 llama3_1-8b-edge.*.sample_concatenate_permutation = 1
 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
 deepseek-r1.*.sample_concatenate_permutation = 1
+gpt-oss-120b.*.sample_concatenate_permutation = 1
 whisper.*.sample_concatenate_permutation = 1
 
 *.Server.target_latency = 10
@@ -90,6 +92,7 @@ llama3_1-8b.*.use_token_latencies = 1
 llama3_1-8b-edge.*.use_token_latencies = 1
 llama3_1-8b-interactive.*.use_token_latencies = 1
 deepseek-r1.*.use_token_latencies = 1
+gpt-oss-120b.*.use_token_latencies = 1
 whisper.*.use_token_latencies = 1
 
 # gptj benchmark infers token latencies
@@ -132,6 +135,10 @@ deepseek-r1.Server.target_latency = 0
 deepseek-r1.Server.ttft_latency = 2000
 deepseek-r1.Server.tpot_latency = 80
 
+gpt-oss-120b.Server.target_latency = 0
+gpt-oss-120b.Server.ttft_latency = 2000
+gpt-oss-120b.Server.tpot_latency = 20
+
 *.Offline.target_latency_percentile = 90
 *.Offline.min_duration = 600000
 
@@ -155,10 +162,11 @@ llama3_1-8b-edge.Offline.min_query_count = 5000
 mixtral-8x7b.Offline.min_query_count = 15000
 rgat.Offline.min_query_count = 788379
 deepseek-r1.Offline.min_query_count = 4388
+gpt-oss-120b.Offline.min_query_count = 8036
 whisper.Offline.min_query_count = 1633
 
 # These fields should be defined and overridden by user.conf.
 *.SingleStream.target_latency = 10
 *.MultiStream.target_latency = 80
-*.Server.target_qps = 1.0
+*.Server.target_qps = 8.0
 *.Offline.target_qps = 1.0