4 changes: 2 additions & 2 deletions pyproject.toml
@@ -71,15 +71,15 @@ exclude = [
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"]
"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
# Python 3.8 typing - skip utils for ROCm
"vllm/utils.py" = ["UP006", "UP035"]

[tool.ruff.lint]
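
For context, the two rules excluded here are ruff's pyupgrade checks for PEP 585: UP035 flags deprecated imports from `typing`, and UP006 flags the old-style generic annotations themselves. A minimal illustration (not taken from this diff) of what they catch and the modernized form adopted elsewhere in the PR:

```python
# Flagged by the excluded rules (pre-PEP 585 style, required on Python 3.8):
from typing import Dict, List  # UP035: deprecated typing imports

def count_lengths(names: List[str]) -> Dict[str, int]:  # UP006: use builtin generics
    return {name: len(name) for name in names}

# What `ruff --fix` rewrites this to for Python 3.9+ code:
def count_lengths_fixed(names: list[str]) -> dict[str, int]:
    return {name: len(name) for name in names}
```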
25 changes: 13 additions & 12 deletions vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -6,7 +6,8 @@
import itertools
import math
import os
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
from collections.abc import Generator
from typing import Any, Callable, Optional

import numpy as np
import torch
@@ -49,21 +50,21 @@ def __init__(self, load_config: LoadConfig):
super().__init__(load_config)

# Save the module names without sharding.
self.unsharded_weights_modules: List[str] = []
self.unsharded_weights_modules: list[str] = []
# Save the module names that are sharded by column.
self.column_sharded_weights_modules: List[str] = []
self.column_sharded_weights_modules: list[str] = []
# Store all module names (from transformers) that support
# BNB quantization.
self.target_modules: List[str] = []
self.target_modules: list[str] = []
# mapping weight names from transformers to vllm.
self.weight_mapper: Callable = lambda name: name

def _get_weight_files(
self,
model_name_or_path: str,
allowed_patterns: List[str],
allowed_patterns: list[str],
revision: Optional[str] = None,
) -> Tuple[str, List[str], str]:
) -> tuple[str, list[str], str]:
"""Retrieve weight files. Download the files if necessary.

Return the weight files and the file pattern."""
@@ -95,7 +96,7 @@ def _get_weight_files(
f"No model weights found in: `{model_name_or_path}`")

def _prepare_weights(self, model_name_or_path: str,
revision: Optional[str]) -> Tuple[List[str], bool]:
revision: Optional[str]) -> tuple[list[str], bool]:
"""Prepare weight files for the model."""

allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
@@ -155,7 +156,7 @@ def _get_quantized_weights_iterator(
revision: Optional[str],
pre_quant: bool,
load_8bit: bool,
) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str,
) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str,
Any]]:
"""Get an iterator to the model weights with bitsandbytes quantization,
as well as the quantization state dictionary."""
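
The updated signature types the returned iterator with `collections.abc.Generator` and builtin `tuple`/`dict` rather than their deprecated `typing` counterparts. As a rough, hedged sketch of what a bare weights iterator of that generator type looks like (simplified, assumes the `safetensors` package, and omits the quantization handling entirely; not vLLM's actual implementation):

```python
from collections.abc import Generator

import torch
from safetensors.torch import safe_open  # assumed dependency for this sketch

def weights_iterator(
    files: list[str],
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Yield (parameter name, tensor) pairs, one checkpoint file at a time."""
    for path in files:
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                yield name, f.get_tensor(name)
```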
@@ -175,7 +176,7 @@ def _get_quantized_weights_iterator(
hf_weights_files, use_safetensors = self._prepare_weights(
model_name_or_path, revision)

quant_state_dict: Dict[str, Any] = {}
quant_state_dict: dict[str, Any] = {}

if pre_quant:
if load_8bit:
@@ -257,7 +258,7 @@ def _quantized_4bit_generator(self, hf_weights_files, use_safetensors,

# Closure to parse quant_state for each prequant weight
def _parse_quant_state(param_name: str,
temp_state_dict: Dict) -> QuantState:
temp_state_dict: dict) -> QuantState:
quant_state = {}
for k in temp_state_dict:
if param_name + "." in k:
@@ -415,7 +416,7 @@ def _load_weights(self, model_config: ModelConfig,

# Modules whose weights might have fused on disk
# we need their output_sizes to make shard in flight correctly with TP
self.maybe_fused_weights_modules: Dict[str, List[int]] = {}
self.maybe_fused_weights_modules: dict[str, list[int]] = {}
self._get_bnb_target_modules(model)
for name, module in model.named_modules():
# Some modules like `ReplicatedLinear` should not have their weights
@@ -480,7 +481,7 @@ def _load_weights(self, model_config: ModelConfig,
torch.cuda.empty_cache()

param_dict = dict(model.named_parameters())
stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {}
stacked_quant_state_dict: dict[str, dict[int, Any]] = {}
# TODO: Change this lazy import to normal import
# after the checks are updated to run on a new version
from vllm.model_executor.models.utils import is_pp_missing_parameter
11 changes: 6 additions & 5 deletions vllm/model_executor/model_loader/default_loader.py
@@ -3,7 +3,8 @@
import glob
import os
import time
from typing import Generator, Iterable, List, Optional, Tuple, cast
from collections.abc import Generator, Iterable
from typing import Optional, cast

import huggingface_hub
import torch
@@ -92,7 +93,7 @@ def _prepare_weights(
revision: Optional[str],
fall_back_to_pt: bool,
allow_patterns_overrides: Optional[list[str]],
) -> Tuple[str, List[str], bool]:
) -> tuple[str, list[str], bool]:
"""Prepare weights for the model.

If the model is not local, it will be downloaded."""
@@ -138,7 +139,7 @@ def _prepare_weights(
else:
hf_folder = model_name_or_path

hf_weights_files: List[str] = []
hf_weights_files: list[str] = []
for pattern in allow_patterns:
hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
if len(hf_weights_files) > 0:
@@ -173,7 +174,7 @@ def _prepare_weights(

def _get_weights_iterator(
self, source: "Source"
) -> Generator[Tuple[str, torch.Tensor], None, None]:
) -> Generator[tuple[str, torch.Tensor], None, None]:
"""Get an iterator for the model weights based on the load format."""
hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
source.model_or_path, source.revision, source.fall_back_to_pt,
@@ -238,7 +239,7 @@ def get_all_weights(
self,
model_config: ModelConfig,
model: nn.Module,
) -> Generator[Tuple[str, torch.Tensor], None, None]:
) -> Generator[tuple[str, torch.Tensor], None, None]:
primary_weights = DefaultModelLoader.Source(
model_config.model,
model_config.revision,
6 changes: 3 additions & 3 deletions vllm/model_executor/model_loader/gguf_loader.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
import os
from typing import Dict, Generator, Tuple
from collections.abc import Generator

import gguf
import torch
@@ -84,8 +84,8 @@ def _get_gguf_weights_map(self, model_config: ModelConfig):
return gguf_to_hf_name_map

def _get_weights_iterator(
self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str]
) -> Generator[tuple[str, torch.Tensor], None, None]:
return gguf_quant_weights_iterator(model_name_or_path,
gguf_to_hf_name_map)

10 changes: 5 additions & 5 deletions vllm/model_executor/model_loader/neuron.py
@@ -5,7 +5,7 @@
import copy
import importlib
import os
from typing import Dict, List, Optional, Tuple
from typing import Optional

import torch
import torch.nn as nn
@@ -33,7 +33,7 @@
}

# Models supported by Neuron.
_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = {
_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = {
"LlamaForCausalLM": ("transformers_neuronx.llama.model",
"LlamaForSampling", "LlamaForCausalLM"),
"MistralForCausalLM": ("transformers_neuronx.mistral.model",
@@ -146,7 +146,7 @@ def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[List[SamplerOutput]]:
) -> Optional[list[SamplerOutput]]:
batch_size, num_steps = logits.shape
seq_ids = [
seq_id for sg in sampling_metadata.seq_groups
@@ -188,7 +188,7 @@ def _get_model_architecture(config: PretrainedConfig) -> str:
f"{list(_NEURON_SUPPORTED_MODELS.keys())}")


def _get_buckets(env: str, default_value: List[int]) -> List[int]:
def _get_buckets(env: str, default_value: list[int]) -> list[int]:
env_value = os.getenv(env)
if env_value is None:
return default_value
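
For readers unfamiliar with this loader: `_NEURON_SUPPORTED_MODELS` maps a Hugging Face architecture name to the transformers-neuronx module and class that implement it, and `_get_model_architecture` selects the entry matching the model config. A hedged sketch of how such a mapping is typically resolved with a lazy import (illustrative only; the helper below is not copied from vLLM):

```python
import importlib

_SUPPORTED = {
    "LlamaForCausalLM": ("transformers_neuronx.llama.model",
                         "LlamaForSampling", "LlamaForCausalLM"),
}

def load_neuron_class(architecture: str) -> type:
    # Look up the (module path, neuron class, HF class) entry and import it lazily,
    # so transformers-neuronx is only needed when a supported model is requested.
    module_path, neuron_cls_name, _hf_cls_name = _SUPPORTED[architecture]
    module = importlib.import_module(module_path)
    return getattr(module, neuron_cls_name)
```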
@@ -464,7 +464,7 @@ def get_neuron_eagle_speculation_model(model_config: ModelConfig,

draft_model.eval()

token_tree: Dict[int, List[int]] = ast.literal_eval(
token_tree: dict[int, list[int]] = ast.literal_eval(
speculation_config.speculative_token_tree)
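
The speculative token tree arrives as a string in the speculation config and is parsed with `ast.literal_eval` into a `dict[int, list[int]]`. A small illustration with a made-up tree value (the exact string format a given config uses is not shown in this diff):

```python
import ast

# Hypothetical config value: node 0 has children 1 and 2, node 1 has child 3.
speculative_token_tree = "{0: [1, 2], 1: [3]}"

token_tree: dict[int, list[int]] = ast.literal_eval(speculative_token_tree)
print(token_tree[0])  # [1, 2]
```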

speculation_model = EagleSpeculativeDecoder(draft_model.model,
6 changes: 3 additions & 3 deletions vllm/model_executor/model_loader/neuronx_distributed.py
@@ -9,7 +9,7 @@
import multiprocessing
import os
import shutil
from typing import Dict, List, Optional, Tuple
from typing import Optional

import torch
import torch.nn as nn
@@ -46,7 +46,7 @@
}

# Models supported by Neuronx distributed for inference.
_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str]] = {
_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = {
"LlamaForCausalLM":
("neuronx_distributed_inference.models.llama.modeling_llama",
"NeuronLlamaForCausalLM"),
@@ -365,7 +365,7 @@ def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[List[SamplerOutput]]:
) -> Optional[list[SamplerOutput]]:
batch_size, num_steps = logits.shape
seq_ids = [
seq_id for sg in sampling_metadata.seq_groups
7 changes: 4 additions & 3 deletions vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -2,7 +2,8 @@
# ruff: noqa: SIM117
import glob
import os
from typing import Generator, List, Optional, Tuple
from collections.abc import Generator
from typing import Optional

import torch
from torch import nn
@@ -48,7 +49,7 @@ def __init__(self, load_config: LoadConfig):
os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url

def _prepare_weights(self, model_name_or_path: str,
revision: Optional[str]) -> List[str]:
revision: Optional[str]) -> list[str]:
"""Prepare weights for the model.

If the model is not local, it will be downloaded."""
@@ -87,7 +88,7 @@ def _prepare_weights(self, model_name_or_path: str,

def _get_weights_iterator(
self, model_or_path: str,
revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]:
revision: str) -> Generator[tuple[str, torch.Tensor], None, None]:
"""Get an iterator for the model weights based on the load format."""
hf_weights_files = self._prepare_weights(model_or_path, revision)
return runai_safetensors_weights_iterator(
13 changes: 7 additions & 6 deletions vllm/model_executor/model_loader/sharded_state_loader.py
@@ -3,7 +3,8 @@
import collections
import glob
import os
from typing import Any, Dict, Generator, List, Optional, Tuple
from collections.abc import Generator
from typing import Any, Optional

import torch
from torch import nn
@@ -48,12 +49,12 @@ def __init__(self,

@staticmethod
def _filter_subtensors(
tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]:
tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]:
"""
Filter out all tensors that share the same memory or a subset of the
memory of another tensor.
"""
same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = (
same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = (
collections.defaultdict(list))
for key, tensor in tensors.items():
if tensor.numel():
@@ -63,7 +64,7 @@ def _filter_subtensors(
def get_end_ptr(tensor: torch.Tensor) -> int:
return tensor.view(-1)[-1].data_ptr() + tensor.element_size()

result: Dict[str, torch.Tensor] = {}
result: dict[str, torch.Tensor] = {}
for group in same_storage_groups.values():
for k, t in group:
a, b = t.data_ptr(), get_end_ptr(t)
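
`_filter_subtensors` drops tensors whose memory is contained in another tensor sharing the same storage, using `data_ptr` arithmetic like the `get_end_ptr` helper above. A standalone sketch of the containment test on its own (assumes contiguous tensors and is simplified relative to the grouping logic in the loader):

```python
import torch

def is_subtensor(inner: torch.Tensor, outer: torch.Tensor) -> bool:
    # True when `inner`'s memory range lies entirely within `outer`'s.
    if inner.untyped_storage().data_ptr() != outer.untyped_storage().data_ptr():
        return False  # different storages can never overlap
    start_i, start_o = inner.data_ptr(), outer.data_ptr()
    end_i = inner.view(-1)[-1].data_ptr() + inner.element_size()
    end_o = outer.view(-1)[-1].data_ptr() + outer.element_size()
    return start_o <= start_i and end_i <= end_o

base = torch.zeros(10)
window = base[2:6]                 # a view sharing `base`'s storage
print(is_subtensor(window, base))  # True
print(is_subtensor(base, window))  # False
```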
@@ -160,7 +161,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
return model.eval()

def iterate_over_files(
self, paths) -> Generator[Tuple[str, torch.Tensor], None, None]:
self, paths) -> Generator[tuple[str, torch.Tensor], None, None]:
if self.runai_model_streamer:
yield from runai_safetensors_weights_iterator(paths, True)
else:
@@ -188,7 +189,7 @@ def save_model(
part_idx = 0
total_size = 0
state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
state_dict_part: Dict[str, torch.Tensor] = {}
state_dict_part: dict[str, torch.Tensor] = {}
for key, tensor in state_dict.items():
param_size = tensor.nelement() * tensor.element_size()
if max_size is not None and total_size + param_size > max_size:
7 changes: 4 additions & 3 deletions vllm/model_executor/model_loader/tensorizer.py
@@ -6,9 +6,10 @@
import os
import re
import time
from collections.abc import Generator
from dataclasses import dataclass
from functools import partial
from typing import BinaryIO, Generator, Optional, Tuple, Type, Union
from typing import BinaryIO, Optional, Union

import torch
from torch import nn
@@ -67,7 +68,7 @@ class TensorizerConfig:
s3_access_key_id: Optional[str] = None
s3_secret_access_key: Optional[str] = None
s3_endpoint: Optional[str] = None
model_class: Optional[Type[torch.nn.Module]] = None
model_class: Optional[type[torch.nn.Module]] = None
hf_config: Optional[PretrainedConfig] = None
dtype: Optional[Union[str, torch.dtype]] = None
_is_sharded: bool = False
@@ -365,7 +366,7 @@ def deserialize(self):

def tensorizer_weights_iterator(
tensorizer_args: "TensorizerArgs"
) -> Generator[Tuple[str, torch.Tensor], None, None]:
) -> Generator[tuple[str, torch.Tensor], None, None]:
logger.warning("Deserializing HuggingFace models is not optimized for "
"loading on vLLM, as tensorizer is forced to load to CPU. "
"Consider deserializing a vLLM model instead for faster "
4 changes: 2 additions & 2 deletions vllm/model_executor/model_loader/tensorizer_loader.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: SIM117
import copy
from typing import Generator, Tuple
from collections.abc import Generator

import torch
from torch import nn
@@ -36,7 +36,7 @@ def _verify_config(self, model_config: ModelConfig,
self.tensorizer_config.verify_with_parallel_config(parallel_config)

def _get_weights_iterator(
self, ) -> Generator[Tuple[str, torch.Tensor], None, None]:
self, ) -> Generator[tuple[str, torch.Tensor], None, None]:
tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
return tensorizer_weights_iterator(tensorizer_args)
