[automodel] move examples #13434

Draft · wants to merge 4 commits into main
@@ -26,7 +26,7 @@

# Run this example with torchrun, for example:
# torchrun --nproc-per-node=8 \
-# examples/llm/peft/automodel.py \
+# examples/llm/finetune/automodel.py \
# --strategy fsdp2 \
# --devices 8 \
# --model meta-llama/Llama-3.2-1B \
@@ -269,6 +269,9 @@ def main():
'are currently supported only with position_ids and not attention_mask. Hence packed sequences need to be '
'run with --attn-implementation=flash_attention_2',
)
parser.add_argument(
    '--lora', action='store_true', help='Enables LoRA finetuning (PEFT); Default: Supervised fine-tuning (SFT).'
)

args = parser.parse_args()

@@ -341,7 +344,7 @@ def main():
model,
args.devices,
args.num_nodes,
-False,
+args.lora,
args.enable_cpu_offload,
dp_size=args.dp_size,
tp_size=args.tp_size,
@@ -411,6 +414,14 @@ def main():
optim=optimizer,
log=logger(args.ckpt_folder, args.max_steps // 2),
resume=resume,
peft=(
    llm.peft.LoRA(
        target_modules=['*_proj'],
        dim=8,
    )
    if args.lora
    else None
),
)
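Taken together, the hunks above wire a single --lora switch into the finetune example. The snippet below is a minimal sketch of that pattern, not part of the diff: the llm.peft.LoRA call mirrors the one added above, while the import line and the standalone layout are assumptions about how the script is structured.

import argparse

from nemo.collections import llm  # assumed import path for the NeMo llm collection

parser = argparse.ArgumentParser()
parser.add_argument(
    '--lora', action='store_true', help='Enables LoRA finetuning (PEFT); Default: Supervised fine-tuning (SFT).'
)
args = parser.parse_args()

# peft=None keeps plain SFT; a LoRA config switches the run to parameter-efficient finetuning.
peft = llm.peft.LoRA(target_modules=['*_proj'], dim=8) if args.lora else None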


@@ -31,11 +31,19 @@

parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, help="Local path or model name on Hugging Face")
parser.add_argument('--lora', required=False, type=str, default=None, help="Local path of the LoRA model")
parser.add_argument('--triton-model-name', required=True, type=str, help="Name for the service")
args = parser.parse_args()

exporter = vLLMHFExporter()
exporter.export(model=args.model)
if args.lora is not None:
    lora_model_name = "lora"  # arbitrary name under which the adapter is registered and queried
    exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora)
    print(
        "------------- Output: ",
        exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name),
    )
    quit()

nm = DeployPyTriton(
model=exporter,
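For the deploy script above, the intended flow is: export the base model to vLLM, optionally register a LoRA adapter, then query it. A short usage sketch follows, with the import path, model path, and adapter name assumed rather than taken from the diff.

from nemo.export.vllm_hf_exporter import vLLMHFExporter  # assumed import path

exporter = vLLMHFExporter()
exporter.export(model="meta-llama/Llama-3.2-1B")  # base model; placeholder name

# Register a local LoRA adapter under an arbitrary name, then run a prompt through it.
exporter.add_lora_models(lora_model_name="my-lora", lora_model="/path/to/lora_checkpoint")
print(exporter.forward(input_texts=["How are you doing?"], lora_model_name="my-lora"))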
240 changes: 0 additions & 240 deletions examples/llm/peft/automodel.py

This file was deleted.

42 changes: 0 additions & 42 deletions examples/llm/peft/automodel_vllm.py

This file was deleted.

@@ -79,7 +79,9 @@ def make_strategy(strategy, model, devices, num_nodes, adapter_only=False):
default="quintend/rdr-items",
help="Path to the dataset. Can be a local path or a HF dataset name",
)
-parser.add_argument("--peft", type=str, default="none", choices=["lora", "none"], help="Which peft to use")
+parser.add_argument(
+    "--lora", action='store_true', help='Enables LoRA finetuning (PEFT); Default: Supervised fine-tuning (SFT).'
+)
parser.add_argument("--freeze-vision-model", action="store_true", help="Freeze the vision model parameters")
parser.add_argument("--freeze-language-model", action="store_true", help="Freeze the language model parameters")
args = parser.parse_args()
@@ -119,7 +121,7 @@ def make_strategy(strategy, model, devices, num_nodes, adapter_only=False):
)

peft = None
-if args.peft == 'lora':
+if args.lora:
peft = llm.peft.LoRA(
target_modules=['*_proj'],
dim=16,