Commit 221b032

add sample run and training script
1 parent 46c52f9 commit 221b032

File tree

4 files changed: +163 -0 lines changed

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
set -euv

# TODO: how to fine-tune?
# For training, `diffusers` needs to be installed from a local checkout (editable install):
# https://huggingface.co/docs/diffusers/installation#install-from-source
#
# pip install -e ".[torch]"

export MODEL_NAME="CompVis/stable-diffusion-v1-4"
# moved from ~/.cache/huggingface/.....
# export MODEL_NAME='sd-compvis-model'

OUTPUT_DIR="dreambooth_model1"
INSTANCE_DIR="data/instance_images"
CAPTIONS_DIR="data/captions"

# If running on a GPU, set --mixed_precision="fp16".
#
# Training params left to tune:
# --max_train_steps=15000 \
# --learning_rate=1e-05 \
# --use_8bit_adam \
# --captions_dir="$CAPTIONS_DIR" \

# dump only textenc

accelerate launch --mixed_precision="no" examples/dreambooth/train_dreambooth.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --instance_data_dir=$INSTANCE_DIR \
  --output_dir=$OUTPUT_DIR \
  --instance_prompt="a photo of sks dog" \
  --resolution=512 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=1 \
  --learning_rate=5e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --max_train_steps=400
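Once this run finishes, the fine-tuned weights end up in dreambooth_model1 and can be sampled with the same pipeline API used in sample_inference_txt2img.py below. A minimal sketch, assuming train_dreambooth.py writes a complete pipeline to OUTPUT_DIR and reusing the instance prompt from the command above:

import torch
from diffusers import StableDiffusionPipeline

# Load the DreamBooth output directory as a regular pipeline
# (assumes a complete pipeline was written to dreambooth_model1).
pipe = StableDiffusionPipeline.from_pretrained(
    "dreambooth_model1",
    torch_dtype=torch.float32,
    safety_checker=None,
    requires_safety_checker=False,
)
pipe = pipe.to("cpu")

# Reuse the instance prompt the model was fine-tuned on.
image = pipe("a photo of sks dog", num_inference_steps=50).images[0]
image.save("dreambooth_sample.png")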
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
"""
usage:
    python sample_inference_txt2img.py

sample code from https://github.com/huggingface/diffusers

one-time setup:

    conda create -n sd2 pytorch==1.12.1 torchvision==0.13.1
    conda activate sd2
    # conda install -c conda-forge diffusers==0.12.1       <-- conda version
    # conda install -c conda-forge transformers==4.19.2    <-- one repo needs this version
    conda install -c conda-forge transformers==4.27.4
    conda install -c conda-forge accelerate==0.18.0
    conda install -c conda-forge datasets==2.11.0
    conda install -c conda-forge ftfy==6.1.1
    pip install invisible-watermark

for training, `diffusers` needs to be installed from a local checkout (editable install):
https://huggingface.co/docs/diffusers/installation#install-from-source
"""

import time
import torch
from diffusers import StableDiffusionPipeline


load_from_local = False

if not load_from_local:
    # option 1: download from the Hub
    # (downloads to ~/.cache/huggingface/...)
    model_path = 'runwayml/stable-diffusion-v1-5'
    # model_path = '~/.cache/huggingface/diffusers/models--runwayml--stable-diffusion-v1-5/snapshots/39593d5650112b4cc580433f6b0435385882d819'
    # model_path = 'CompVis/stable-diffusion-v1-4'
    # model_path = '~/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/249dd2d739844dea6a0bc7fc27b3c1d014720b28'
    # model_path = 'sd-compvis-model'  # moved from ~/.cache/huggingface/...
    print(f"downloading {model_path}")
else:
    # option 2: load from a local path
    model_path = 'sd-pokemon-model'
    print(f"loading from local path {model_path}")

start = time.time()
pipe = StableDiffusionPipeline.from_pretrained(
    model_path, torch_dtype=torch.float32, safety_checker=None, requires_safety_checker=False
)
pipe = pipe.to("cpu")
# Recommended if your computer has < 64 GB of RAM.
pipe.enable_attention_slicing()

# Note: prompts longer than this model's maximum sequence length get truncated.
# prompt = "yoda"
prompt = "This Elegant 14K Solid Two Tone Gold Mens Wedding Band is 6mm wide. Center of the Ring has a Satin Finished and edges are Shiny Finish. This Ring is comfort Fitted.\n\n Manufactured in New York, USA. Available in different Metals, Widths, Colors and Finishing."
# prompt = "beautiful elven woman sitting in a white elven city, (full body), (blush), (sitting on stone staircase), pinup pose, (world of warcraft blood elf), (cosplay wig), (medium blonde hair:1.3), (light blue eyes:1.2), ((red, and gold elf minidress)), intricate elven dress"

print(f"=== prompt ===\n{prompt}\n===========\n")

# First-time "warmup" pass (see explanation above)
_ = pipe(prompt, num_inference_steps=1)

# Results match those from the CPU device after the warmup pass.
img_list = pipe(prompt, num_inference_steps=80).images

print(len(img_list))
image = img_list[0]

output_fn = 'output1.png'
print(f"after {(time.time() - start) / 60.0:.2f} minutes, saving file into {output_fn}")
image.save(output_fn)
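The script above runs in float32 on CPU, which is why a single image takes minutes. On a machine with a CUDA GPU the same pipeline can be loaded in half precision instead; a rough sketch (the fp16/CUDA settings are an assumption, not part of this commit):

import torch
from diffusers import StableDiffusionPipeline

# Same pipeline as above, but with half-precision weights on a CUDA device.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    safety_checker=None,
    requires_safety_checker=False,
)
pipe = pipe.to("cuda")

image = pipe("yoda", num_inference_steps=80).images[0]
image.save("output_gpu.png")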
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
set -euv

# TODO: how to fine-tune?
# For training, `diffusers` needs to be installed from a local checkout (editable install):
# https://huggingface.co/docs/diffusers/installation#install-from-source
#
# pip install -e ".[torch]"

export MODEL_NAME="CompVis/stable-diffusion-v1-4"
# moved from ~/.cache/huggingface/.....
# export MODEL_NAME='sd-compvis-model'

export dataset_name="lambdalabs/pokemon-blip-captions"


# If running on a GPU, set --mixed_precision="fp16".
#
# Training params left to tune:
# --max_train_steps=15000 \
# --learning_rate=1e-05 \
# --use_8bit_adam \

accelerate launch --mixed_precision="no" examples/text_to_image/train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$dataset_name \
  --use_ema \
  --resolution=512 --center_crop --random_flip \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --gradient_checkpointing \
  --max_train_steps=1000 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
  --lr_scheduler="constant" --lr_warmup_steps=0 \
  --checkpointing_steps=5 \
  --checkpoints_total_limit=2 \
  --resume_from_checkpoint="latest" \
  --output_dir="sd-pokemon-model"
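This run fine-tunes on the lambdalabs/pokemon-blip-captions dataset pulled through the `datasets` library installed in the conda setup above. A quick sketch for peeking at what the trainer will consume; the image/text column names are an assumption about that dataset, not something verified in this commit:

from datasets import load_dataset

# First use downloads the dataset to ~/.cache/huggingface/datasets.
ds = load_dataset("lambdalabs/pokemon-blip-captions", split="train")

print(ds)                  # row count and column names
sample = ds[0]
print(sample["text"])      # assumed caption column
sample["image"].save("pokemon_sample0.png")  # assumed PIL image column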

examples/text_to_image/train_text_to_image.py

Lines changed: 16 additions & 0 deletions
@@ -780,9 +780,14 @@ def collate_fn(examples):
     progress_bar.set_description("Steps")

     for epoch in range(first_epoch, args.num_train_epochs):
+        logger.info(f"epoch = {epoch}: start")
         unet.train()
+        logger.info(f"epoch = {epoch}: finish train()")
+
         train_loss = 0.0
         for step, batch in enumerate(train_dataloader):
+            logger.info(f"step = {step}: start")
+
             # Skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
                 if step % args.gradient_accumulation_steps == 0:
@@ -802,6 +807,7 @@ def collate_fn(examples):
                     (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
                 )

+                print(f"latents.shape = {latents.shape}")
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
                 timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
@@ -823,6 +829,7 @@ def collate_fn(examples):
                     raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                 # Predict the noise residual and compute loss
+                logger.info(f"step = {step}: Predict the noise residual and compute loss")
                 model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                 if args.snr_gamma is None:
@@ -842,17 +849,23 @@ def collate_fn(examples):
                     loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                     loss = loss.mean()

+                logger.info(f"step = {step}: after accelerator loss={loss:.3f} ==")
+
                 # Gather the losses across all processes for logging (if we use distributed training).
                 avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                 train_loss += avg_loss.item() / args.gradient_accumulation_steps

                 # Backpropagate
+                logger.info(f"step = {step}: Backpropagate with {loss:.3f}")
                 accelerator.backward(loss)
                 if accelerator.sync_gradients:
                     accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
                 optimizer.step()
+                logger.info(f"step = {step}: optimizer.step() done")
                 lr_scheduler.step()
+                logger.info(f"step = {step}: lr_scheduler done")
                 optimizer.zero_grad()
+                logger.info(f"step = {step}: optimizer.zero_grad() done")

             # Checks if the accelerator has performed an optimization step behind the scenes
             if accelerator.sync_gradients:
@@ -872,10 +885,13 @@ def collate_fn(examples):
             logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
             progress_bar.set_postfix(**logs)

+            logger.info(f"step_loss: {loss.detach().item()}, lr: {lr_scheduler.get_last_lr()[0]}")
+
             if global_step >= args.max_train_steps:
                 break

         if accelerator.is_main_process:
+            logger.info("== accelerator.is_main_process ==")
             if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
                 if args.use_ema:
                     # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
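The logger.info calls added above rely on the module-level logger already defined in train_text_to_image.py; upstream, that logger comes from accelerate, so per-step messages should only be printed by the main process. A rough sketch of that setup for reference (paraphrased from the upstream script, not part of this diff):

from accelerate.logging import get_logger

# Upstream train_text_to_image.py creates its logger roughly like this;
# the accelerate wrapper makes logger.info print only on the main process by default.
logger = get_logger(__name__, log_level="INFO")

# The per-step messages added in this commit then flow through it, e.g.:
# logger.info(f"step = {step}: optimizer.step() done")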
