Single Video mode (Experimental)

semjon00 · semjon00 · commit aa4ef10ccacd · 2023-07-31T15:36:44.000+03:00
diff --git a/scripts/depthmap.py b/scripts/depthmap.py
@@ -2,6 +2,7 @@
 import gradio as gr
 from modules import shared
 import modules.scripts as scripts
+from PIL import Image
 
 from src import backbone
 from src import common_ui
@@ -46,8 +47,8 @@ def run(self, p, *inputs):
         gen_obj = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs, backbone.gather_ops())
 
         for input_i, type, result in gen_obj:
-            if type in ['simple_mesh', 'inpainted_mesh']:
-                continue  # We are in script mode: do nothing with the filenames
+            if not isinstance(result, Image.Image):
+                continue
 
             # get generation parameters
             # TODO: could reuse
diff --git a/scripts/depthmap_api.py b/scripts/depthmap_api.py
@@ -67,8 +67,9 @@ async def process(
 
         results_based = []
         for count, type, result in gen_obj:
-            if type not in ['simple_mesh', 'inpainted_mesh']:
-                results_based += [encode_to_base64(result)]
+            if not isinstance(result, Image.Image):
+                continue
+            results_based += [encode_to_base64(result)]
         return {"images": results_based, "info": "Success"}
 
 
diff --git a/src/common_constants.py b/src/common_constants.py
@@ -25,7 +25,8 @@ def __init__(self, default_value=None, *args):
     DO_OUTPUT_DEPTH = True
     OUTPUT_DEPTH_INVERT = False
     OUTPUT_DEPTH_COMBINE = False
-    OUTPUT_DEPTH_COMBINE_AXIS = "Horizontal"
+    OUTPUT_DEPTH_COMBINE_AXIS = "Horizontal"  # Format (str) is subject to change
+    DO_OUTPUT_DEPTH_PREDICTION = False  # Hidden, do not use, subject to change
 
     CLIPDEPTH = False
     CLIPDEPTH_FAR = 0.0
diff --git a/src/common_ui.py b/src/common_ui.py
@@ -3,7 +3,7 @@
 import gradio as gr
 from PIL import Image
 
-from src import backbone
+from src import backbone, video_mode
 from src.core import core_generation_funnel, unload_models, run_makevideo
 from src.depthmap_generation import ModelHolder
 from src.gradio_args_transport import GradioComponentBundle
@@ -217,6 +217,33 @@ def open_folder_action():
     else:
         sp.Popen(["xdg-open", path])
 
+
+def depthmap_mode_video(inp):  # Adds the inputs of the 'Single Video' tab to the bundle `inp` and returns it
+    inp += gr.File(elem_id='depthmap_input_video', label="Video or animated file",
+                   file_count="single", interactive=True, type="file")
+    inp += gr.Checkbox(elem_id="depthmap_vm_custom_checkbox",
+                       label="Use custom/pregenerated DepthMap video", value=False)
+    inp += gr.File(elem_id='depthmap_vm_custom', file_count="single",
+                   interactive=True, type="file", visible=False)  # Starts hidden, see the first handler below
+    with gr.Row():
+        inp += gr.Checkbox(elem_id='depthmap_vm_compress_checkbox', label="Compress colorvideos?", value=False)
+        inp += gr.Slider(elem_id='depthmap_vm_compress_bitrate', label="Bitrate (kbit)", visible=False,
+                         minimum=1000, value=15000, maximum=50000, step=250)
+    # The custom depthmap file input is only visible while its checkbox is ticked
+    inp['depthmap_vm_custom_checkbox'].change(
+        fn=lambda v: inp['depthmap_vm_custom'].update(visible=v),
+        inputs=[inp['depthmap_vm_custom_checkbox']],
+        outputs=[inp['depthmap_vm_custom']]
+    )
+    # Likewise, the bitrate slider is only visible while the compression checkbox is ticked
+    inp['depthmap_vm_compress_checkbox'].change(
+        fn=lambda v: inp['depthmap_vm_compress_bitrate'].update(visible=v),
+        inputs=[inp['depthmap_vm_compress_checkbox']],
+        outputs=[inp['depthmap_vm_compress_bitrate']]
+    )
+
+    return inp
+
 def on_ui_tabs():
     inp = GradioComponentBundle()
     with gr.Blocks(analytics_enabled=False, title="DepthMap") as depthmap_interface:
@@ -248,6 +275,8 @@ def on_ui_tabs():
                                            label="Skip generation and use (edited/custom) depthmaps "
                                                  "in output directory when a file already exists.",
                                            value=True)
+                    with gr.TabItem('Single Video') as depthmap_mode_3:
+                        inp = depthmap_mode_video(inp)
                 submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary')
                 inp |= main_ui_panel(True)  # Main panel is inserted here
                 unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels")
@@ -310,6 +339,7 @@ def on_ui_tabs():
         depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode'])
         depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode'])
         depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode'])
+        depthmap_mode_3.select(lambda: '3', None, inp['depthmap_mode'])
 
         def custom_depthmap_change_fn(turned_on):
             return inp['custom_depthmap_img'].update(visible=turned_on), \
@@ -369,6 +399,18 @@ def custom_depthmap_change_fn(turned_on):
     return depthmap_interface
 
 
+def format_exception(e: Exception):
+    """Prints the active traceback and returns an HTML error report; all dynamic text is HTML-escaped."""
+    import html  # Local import. Unescaped frames such as <module> or <lambda> would be parsed as tags
+    traceback.print_exc()
+    msg = '<h3>' + 'ERROR: ' + html.escape(str(e)) + '</h3>' + '\n'
+    if 'out of GPU memory' not in msg:  # The report link and stacktrace are omitted for out-of-memory errors
+        msg += f'Please report this issue <a href="https://github.com/thygate/{REPOSITORY_NAME}/issues">here</a>. ' \
+               'Make sure to provide the full stacktrace: \n'
+        msg += '<code style="white-space: pre;">' + html.escape(traceback.format_exc()) + '</code>'
+    return msg
+
+
 def run_generate(*inputs):
     inputs = GradioComponentBundle.enkey_to_dict(inputs)
     depthmap_mode = inputs['depthmap_mode']
@@ -381,10 +423,21 @@ def run_generate(*inputs):
     custom_depthmap_img = inputs['custom_depthmap_img']
 
     inputimages = []
-    # Allow supplying custom depthmaps
-    inputdepthmaps = []
-    # Also keep track of original file names
-    inputnames = []
+    inputdepthmaps = []  # Allow supplying custom depthmaps
+    inputnames = []  # Also keep track of original file names
+
+    if depthmap_mode == '3':
+        try:
+            custom_depthmap = inputs['depthmap_vm_custom'] \
+                if inputs['depthmap_vm_custom_checkbox'] else None
+            colorvids_bitrate = inputs['depthmap_vm_compress_bitrate'] \
+                if inputs['depthmap_vm_compress_checkbox'] else None
+            ret = video_mode.gen_video(
+                inputs['depthmap_input_video'], backbone.get_outpath(), inputs, custom_depthmap, colorvids_bitrate)
+            return [], None, None, ret
+        except Exception as e:
+            ret = format_exception(e)
+        return [], None, None, ret
 
     if depthmap_mode == '2' and depthmap_batch_output_dir != '':
         outpath = depthmap_batch_output_dir
@@ -410,7 +463,9 @@ def run_generate(*inputs):
             image = Image.open(os.path.abspath(img.name))
             inputimages.append(image)
             inputnames.append(os.path.splitext(img.orig_name)[0])
+        print(f'{len(inputimages)} images will be processed')
     elif depthmap_mode == '2':  # Batch from Directory
+        # TODO: Batch processing is suspected to leak RAM (unconfirmed, may already be fixed) - needs profiling.
         assert not backbone.get_cmd_opt('hide_ui_dir_config', False), '--hide-ui-dir-config option must be disabled'
         if depthmap_batch_input_dir == '':
             return [], None, None, "Please select an input directory."
@@ -444,40 +499,40 @@ def run_generate(*inputs):
 
     gen_obj = core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inputs, backbone.gather_ops())
 
-    show_images = []
+    # Saving images
+    img_results = []
+    results_total = 0
     inpainted_mesh_fi = mesh_simple_fi = None
     msg = ""  # Empty string is never returned
     while True:
         try:
             input_i, type, result = next(gen_obj)
+            results_total += 1
         except StopIteration:
             # TODO: return more info
-            msg = '<h3>Successfully generated.</h3>'
+            msg = '<h3>Successfully generated</h3>' if results_total > 0 else \
+                '<h3>Successfully generated nothing - please check the settings and try again</h3>'
             break
         except Exception as e:
-            traceback.print_exc()
-            msg = '<h3>' + 'ERROR: ' + str(e) + '</h3>' + '\n'
-            if 'out of GPU memory' not in msg:
-                msg +=\
-                    'Please report this issue ' \
-                    f'<a href="https://github.com/thygate/{REPOSITORY_NAME}/issues">here</a>. ' \
-                    'Make sure to provide the full stacktrace: \n'
-                msg += '<code style="white-space: pre;">' + traceback.format_exc() + '</code>'
+            msg = format_exception(e)
             break
         if type == 'simple_mesh':
             mesh_simple_fi = result
             continue
         if type == 'inpainted_mesh':
             inpainted_mesh_fi = result
             continue
+        if not isinstance(result, Image.Image):
+            print(f'This is not supposed to happen! Somehow output type {type} is not supported! Input_i: {input_i}.')
+            continue
+        img_results += [(input_i, type, result)]
 
-        basename = 'depthmap'
-        if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != backbone.get_opt('outdir_extras_samples', None):
-            basename = Path(inputnames[input_i]).stem
-
-        show_images += [result]
         if inputs["save_outputs"]:
             try:
+                basename = 'depthmap'
+                if depthmap_mode == '2' and inputnames[input_i] is not None:
+                    if outpath != backbone.get_opt('outdir_extras_samples', None):
+                        basename = Path(inputnames[input_i]).stem
                 suffix = "" if type == "depth" else f"{type}"
                 backbone.save_image(result, path=outpath, basename=basename, seed=None,
                            prompt=None, extension=backbone.get_opt('samples_format', 'png'), short_filename=True,
@@ -496,4 +551,4 @@ def run_generate(*inputs):
         if backbone.get_opt('depthmap_script_show_3d_inpaint', True):
             if inpainted_mesh_fi is not None and len(inpainted_mesh_fi) > 0:
                 display_mesh_fi = inpainted_mesh_fi
-    return show_images, inpainted_mesh_fi, display_mesh_fi, msg.replace('\n', '<br>')
+    return map(lambda x: x[2], img_results), inpainted_mesh_fi, display_mesh_fi, msg.replace('\n', '<br>')
diff --git a/src/core.py b/src/core.py
@@ -1,4 +1,6 @@
 from pathlib import Path
+
+import PIL.Image
 from PIL import Image
 
 try:
@@ -37,6 +39,14 @@
 model_holder = ModelHolder()
 
 
+def convert_to_i16(arr):
+    """Scales arr from [0; 1] to a single channel, 16 bit image (uint16). This loses some precision!"""
+    # Casting to uint16 rounds down, therefore the scaled values are kept inside [0; 2**16)
+    bit_count = 8 * 2
+    upper = 2 ** bit_count
+    scaled = np.clip(arr * upper, 0, upper - 0.1)  # Upper bound is upper - 0.1 to avoid overflowing
+    return scaled.astype(np.uint16)
+
 def convert_i16_to_rgb(image, like):
     # three channel, 8 bits per channel image
     output = np.zeros_like(like)
@@ -50,6 +60,10 @@ class CoreGenerationFunnelInp:
     """This class takes a dictionary and creates a core_generation_funnel inp.
     Non-applicable parameters are silently discarded (no error)"""
     def __init__(self, values):
+        if isinstance(values, CoreGenerationFunnelInp):
+            values = values.values
+        values = {(k.name if isinstance(k, GenerationOptions) else k).lower(): v for k, v in values.items()}
+
         self.values = {}
         for setting in GenerationOptions:
             name = setting.name.lower()
@@ -74,7 +88,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
     inp = CoreGenerationFunnelInp(inp)
 
     if ops is None:
-        ops = {}
+        ops = backbone.gather_ops()
     model_holder.update_settings(**ops)
 
     # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure
@@ -127,17 +141,37 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
             raw_prediction_invert = False
             """True if near=dark on raw_prediction"""
             out = None
+
             if inputdepthmaps is not None and inputdepthmaps[count] is not None:
                 # use custom depthmap
-                dimg = inputdepthmaps[count]
-                # resize if not same size as input
-                if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height:
-                    dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS)
-
-                if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L':
-                    out = np.asarray(dimg, dtype="float")
+                dp = inputdepthmaps[count]
+                if isinstance(dp, Image.Image):
+                    if dp.width != inputimages[count].width or dp.height != inputimages[count].height:
+                        try:  # LANCZOS may fail on some formats
+                            dp = dp.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS)
+                        except:
+                            dp = dp.resize((inputimages[count].width, inputimages[count].height))
+                    # Rescale the image to [0; 1) by dividing by its guessed bit depth, without actually normalizing it.
+                    # Normalizing is avoided, because we want to preserve the scale of the original depthmaps
+                    # (batch mode, video mode).
+                    if len(dp.getbands()) == 1:
+                        out = np.asarray(dp, dtype="float")
+                        out_max = out.max()
+                        if out_max < 256:
+                            bit_depth = 8
+                        elif out_max < 65536:
+                            bit_depth = 16
+                        else:
+                            bit_depth = 32
+                        out /= 2.0 ** bit_depth
+                    else:
+                        out = np.asarray(dp, dtype="float")[:, :, 0]
+                        out /= 256.0
                 else:
-                    out = np.asarray(dimg, dtype="float")[:, :, 0]
+                    # Should be in interval [0; 1], values outside of this range will be clipped.
+                    out = np.asarray(dp, dtype="float")
+                    assert inputimages[count].height == out.shape[0], "Custom depthmap height mismatch"
+                    assert inputimages[count].width == out.shape[1], "Custom depthmap width mismatch"
             else:
                 # override net size (size may be different for different images)
                 if inp[go.NET_SIZE_MATCH]:
@@ -156,20 +190,20 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
                     # TODO: some models may output negative values, maybe these should be clamped to zero.
                     if raw_prediction_invert:
                         out *= -1
+                    if inp[go.DO_OUTPUT_DEPTH_PREDICTION]:
+                        yield count, 'depth_prediction', np.copy(out)
                     if inp[go.CLIPDEPTH]:
                         out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                         out = np.clip(out, inp[go.CLIPDEPTH_FAR], inp[go.CLIPDEPTH_NEAR])
+                    out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                 else:
                     # Regretfully, the depthmap is broken and will be replaced with a black image
                     out = np.zeros(raw_prediction.shape)
-            out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
-
-            # Single channel, 16 bit image. This loses some precision!
-            # uint16 conversion uses round-down, therefore values should be [0; 2**16)
-            numbytes = 2
-            max_val = (2 ** (8 * numbytes))
-            out = np.clip(out * max_val, 0, max_val - 0.1)  # Clipping form above is needed to avoid overflowing
-            img_output = out.astype("uint16")
+
+            # Maybe we should not use img_output for everything, since raw_prediction gives better accuracy.
+            # However, using raw_prediction is not always supported. We may also want reproducibility:
+            # the depthmap of an image should be the same as the one obtained by generating it one more time.
+            img_output = convert_to_i16(out)
             """Depthmap (near=bright), as uint16"""
 
             # if 3dinpainting, store maps for processing in second pass
@@ -198,8 +232,8 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
 
             # A weird quirk: if user tries to save depthmap, whereas custom depthmap is used,
             # depthmap will not be outputed, even if output_depth_combine is used.
-            if inp[go.DO_OUTPUT_DEPTH] and inputdepthmaps[count] is None:
-                if inp[go.DO_OUTPUT_DEPTH]:
+            if inp[go.DO_OUTPUT_DEPTH]:
+                if inputdepthmaps[count] is None:
                     img_depth = cv2.bitwise_not(img_output) if inp[go.OUTPUT_DEPTH_INVERT] else img_output
                     if inp[go.OUTPUT_DEPTH_COMBINE]:
                         axis = 1 if inp[go.OUTPUT_DEPTH_COMBINE_AXIS] == 'Horizontal' else 0
@@ -209,6 +243,13 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
                         yield count, 'concat_depth', img_concat
                     else:
                         yield count, 'depth', Image.fromarray(img_depth)
+                else:
+                    # TODO: make it better
+                    # A custom depthmap was supplied for this input, so it is returned as the 'depth' output.
+                    # This seems redundant, but it is logically what should happen, and it simplifies other code.
+                    # It is not returned if there is only one input image.
+                    if len(inputimages) > 1:
+                        yield count, 'depth', Image.fromarray(img_output)
 
             if inp[go.GEN_STEREO]:
                 print("Generating stereoscopic images..")
@@ -319,7 +360,6 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp
 
 
 def get_uniquefn(outpath, basename, ext):
-    # Inefficient and may fail, maybe use unbounded binary search?
     basecount = backbone.get_next_sequence_number(outpath, basename)
     if basecount > 0: basecount = basecount - 1
     fullfn = None
diff --git a/src/video_mode.py b/src/video_mode.py