Commit 6f3967a

Mark Obozov authored and committed
optional torchtune
1 parent 0299a37 commit 6f3967a

File tree (4 files changed: +32 −34 lines)

torchchat/cli/builder.py
torchchat/generate.py
torchchat/model.py
torchchat/usages/openai_api.py
torchchat/cli/builder.py

Lines changed: 6 additions & 9 deletions

@@ -37,15 +37,6 @@
 from torchchat.utils.quantize import quantize_model
 
 
-from torchtune.models.convert_weights import meta_to_tune
-
-from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
-
-from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune
-
-from torchtune.training import set_default_dtype
-
-
 @dataclass
 class BuilderArgs:
     checkpoint_path: Optional[Union[Path, str]] = None

@@ -416,6 +407,8 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model:
 
 
 def _load_checkpoint(builder_args: BuilderArgs):
+    from torchtune.models.convert_weights import meta_to_tune
+
     if builder_args.params_table and builder_args.params_table.endswith("Tune"):
         print("Loading Tune checkpoint")
         meta_checkpoint = torch.load(

@@ -458,6 +451,10 @@ def _load_checkpoint(builder_args: BuilderArgs):
 
 
 def _load_model_default(builder_args: BuilderArgs) -> Model:
+    from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
+    from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune
+    from torchtune.training import set_default_dtype
+
     assert not builder_args.gguf_path
 
     model: Model = _init_model_on_meta_device(builder_args)
torchchat/generate.py

Lines changed: 7 additions & 8 deletions

@@ -30,14 +30,6 @@
 
 from PIL import Image
 
-# torchtune model definition dependencies
-from torchtune.data import Message, padded_collate_tiled_images_and_mask
-
-from torchtune.generation import sample as tune_sample
-
-from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform
-from torchtune.training import set_default_dtype
-
 from torchchat.cli.builder import (
     _initialize_model,
     _initialize_tokenizer,

@@ -450,6 +442,8 @@ def prefill(
         sequential_prefill=True,
         **sampling_kwargs,
     ) -> torch.Tensor:
+        from torchtune.generation import sample as tune_sample
+
         logger.debug("x: %s, input_pos: %s", x, input_pos)
         width = x.size(1)
         assert input_pos.size(0) == width

@@ -870,6 +864,11 @@ def _gen_model_input(
         max_new_tokens: Optional[int] = None,
         max_seq_len: Optional[int] = 2048,
     ) -> Tuple[torch.Tensor, Optional[Dict[str, Any]]]:
+        # torchtune model definition dependencies
+        from torchtune.data import Message, padded_collate_tiled_images_and_mask
+        from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform
+        from torchtune.training import set_default_dtype
+
         """
         Convert prompt and image prompts into consumable model input args.
 
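
Since prefill runs on every generation request, it is worth noting that a repeated import statement inside a function is cheap in CPython: after the first execution the module is served from the sys.modules cache, so each later call pays only a dict lookup plus a local name binding, not a re-import of torchtune. A small self-contained check, using json as a stand-in for the torchtune module:

import sys
import timeit

def hot_path():
    import json  # resolved from the sys.modules cache after the first call
    return json

hot_path()
assert "json" in sys.modules  # cached once, reused afterwards
print(timeit.timeit(hot_path, number=100_000))  # completes in a fraction of a second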

torchchat/model.py

Lines changed: 18 additions & 12 deletions

@@ -37,14 +37,6 @@
 except Exception:
     pass
 
-from torchtune.models.clip import clip_vision_encoder
-from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder
-from torchtune.models.llama3_2_vision._component_builders import (
-    llama3_2_vision_decoder,
-    llama3_2_vision_encoder,
-)
-from torchtune.modules.model_fusion import DeepFusionModel
-
 from torchchat.utils.build_utils import find_multiple, get_precision
 
 config_path = Path(f"{str(Path(__file__).parent)}/model_params")

@@ -214,6 +206,7 @@ def _text_only(cls):
 
     @classmethod
     def _llama3_1(cls):
+        from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder
         return cls(
             model_type=ModelType.Llama3_1,
             modules={"text": llama3_1_builder},

@@ -222,6 +215,12 @@ def _llama3_1(cls):
 
     @classmethod
     def _flamingo(cls):
+        from torchtune.models.llama3_2_vision._component_builders import (
+            llama3_2_vision_decoder,
+            llama3_2_vision_encoder,
+        )
+        from torchtune.modules.model_fusion import DeepFusionModel
+
         return cls(
             model_type=ModelType.Flamingo,
             modules={

@@ -233,6 +232,7 @@ def _flamingo(cls):
 
     @classmethod
     def _llava(cls):
+        from torchtune.models.clip import clip_vision_encoder
         return cls(
             model_type=ModelType.Llava,
             modules={

@@ -504,10 +504,16 @@ def build_model(self) -> nn.Module:
 
         # Temporary add extra params to the DeepFusionModel.
         # TODO: Remove it once we can make fusion model configurable in model_param.
-        if recipe.fusion_class == DeepFusionModel:
-            modules["encoder_trainable"] = False
-            modules["decoder_trainable"] = False
-            modules["fusion_trainable"] = False
+        try:
+            from torchtune.modules.model_fusion import DeepFusionModel
+            if recipe.fusion_class == DeepFusionModel:
+                modules["encoder_trainable"] = False
+                modules["decoder_trainable"] = False
+                modules["fusion_trainable"] = False
+        except ModuleNotFoundError:
+            # In case it is actually DeepFusionModel and torchtune is not installed,
+            # it will fail with an error further without unexpected behavior.
+            pass
 
         return recipe.fusion_class(**modules)
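
The build_model change wraps the DeepFusionModel check in a try/except so that torchtune is only consulted when it is importable; for text-only recipes the comparison is simply skipped, and a genuine Flamingo build without torchtune still fails later with the real import error. A standalone sketch of that guard (the function name and arguments are illustrative, not from the commit):

def apply_fusion_defaults(fusion_class, modules: dict) -> dict:
    try:
        # Only consult the optional dependency if it is installed.
        from torchtune.modules.model_fusion import DeepFusionModel
        if fusion_class == DeepFusionModel:
            modules["encoder_trainable"] = False
            modules["decoder_trainable"] = False
            modules["fusion_trainable"] = False
    except ModuleNotFoundError:
        # torchtune absent: a non-fusion recipe works fine without these
        # flags, and an actual DeepFusionModel recipe would fail on import
        # elsewhere anyway.
        pass
    return modules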

torchchat/usages/openai_api.py

Lines changed: 1 addition & 5 deletions

@@ -19,10 +19,6 @@
 
 from PIL import Image
 
-from torchtune.data import Message, padded_collate_tiled_images_and_mask
-
-from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform
-
 from torchchat.cli.download import is_model_downloaded, load_model_configs
 from torchchat.generate import LocalGenerator, DistributedGenerator, GeneratorArgs
 from torchchat.model import FlamingoModel

@@ -304,7 +300,7 @@ def __init__(self, *args, **kwargs):
 
     def _gen_model_inputs_from_openai_completion_request(
         self, completion_request: CompletionRequest
-    ) -> List[Message]:
+    ) -> List:
         """Generate model inputs from an OpenAI completion request.
 
         Args:
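
The return annotation is loosened from List[Message] to a bare List because Message is no longer importable at module scope. A common alternative, not used in this commit, keeps the precise annotation visible to type checkers without a runtime torchtune import (the function below is a hypothetical stand-in for the method):

from __future__ import annotations  # annotations are no longer evaluated at runtime

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Seen only by static type checkers (mypy, pyright), never executed,
    # so torchtune remains optional at runtime.
    from torchtune.data import Message

def gen_model_inputs(completion_request) -> List[Message]:  # illustrative signature
    ...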
