vllm-project · WoosukKwon · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
@@ -197,6 +197,7 @@ def check_available_online(
                                        {"6b": "EleutherAI/gpt-j-6b"}),
     "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
                                           {"1b": "EleutherAI/pythia-1.4b"}),
+    "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
     "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501

diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
@@ -247,6 +247,34 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
             config.max_model_len)
 
 
+class GptOssConfig(VerifyAndUpdateConfig):
+    """Config hook that adjusts decoding and scheduler defaults in place."""
+
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        """Default the reasoning backend and widen CUDA graph capture."""
+        decoding_config = vllm_config.decoding_config
+        if decoding_config.reasoning_backend == "":
+            # Only an empty backend string is replaced.
+            decoding_config.reasoning_backend = "openai"
+
+        # Increase the max capture size from 512 to 1024 for performance.
+        # NOTE(woosuk): This will increase the number of CUDA graphs
+        # from 67 to 83.
+        scheduler_config = vllm_config.scheduler_config
+        if len(scheduler_config.cuda_graph_sizes) == 1:
+            max_capture_size = scheduler_config.cuda_graph_sizes[0]
+            # FIXME(woosuk): When using full cuda graph with FA3, the max
+            # supported size is 992.
+            if max_capture_size < 1024:
+                # 1, 2, 4, then step 8 below 256, then step 16 up to 1024.
+                scheduler_config.cuda_graph_sizes = [
+                    1, 2, 4, *range(8, 256, 8), *range(256, 1025, 16)]
+                logger.info(
+                    "Overriding max cuda graph capture size to "
+                    "%d for performance.", 1024)
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod
@@ -345,4 +373,5 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
     "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
     "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
     "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig,
+    "GptOssForCausalLM": GptOssConfig,
 }