From 57c049858c3a0b319153c7e4b98c5fadc2132be4 Mon Sep 17 00:00:00 2001
From: Manuel Schmid <dev@mash1t.de>
Date: Sun, 9 Jun 2024 22:31:41 +0200
Subject: [PATCH] feat: add GroundingDINO debug preview and detection box erode/dilate option

---
 extras/inpaint_mask.py | 45 +++++++++++++++++-------------------------
 language/en.json       |  5 +++++
 webui.py               | 29 ++++++++++++++-------------
 3 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/extras/inpaint_mask.py b/extras/inpaint_mask.py
index ea6e8819..8026d7e8 100644
--- a/extras/inpaint_mask.py
+++ b/extras/inpaint_mask.py
@@ -1,26 +1,9 @@
-from PIL import Image
 import numpy as np
-import torch
 from rembg import remove, new_session
 from extras.GroundingDINO.util.inference import default_groundingdino
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-
-def run_grounded_sam(input_image, text_prompt, box_threshold, text_threshold):
-
-    # run grounding dino model
-    detections, _, _, _ = default_groundingdino(
-        image=np.array(input_image),
-        caption=text_prompt,
-        box_threshold=box_threshold,
-        text_threshold=text_threshold
-    )
-
-    return detections.xyxy
-
-
-def generate_mask_from_image(image, mask_model, extras, box_erode_or_dilate: int=0):
+def generate_mask_from_image(image: np.ndarray, mask_model: str, extras: dict, box_erode_or_dilate: int=0, debug_dino: bool=False) -> np.ndarray | None:
     if image is None:
         return
 
@@ -28,29 +11,37 @@ def generate_mask_from_image(image, mask_model, extras, box_erode_or_dilate: int
         image = image['image']
 
     if mask_model == 'sam':
-        img = Image.fromarray(image)
-        boxes = run_grounded_sam(img, extras['sam_prompt_text'], box_threshold=extras['box_threshold'], text_threshold=extras['text_threshold'])
+        detections, _, _, _ = default_groundingdino(
+            image=image,
+            caption=extras['sam_prompt_text'],
+            box_threshold=extras['box_threshold'],
+            text_threshold=extras['text_threshold']
+        )
+        detection_boxes = detections.xyxy
         # use full image if no box has been found
-        boxes = np.array([[0, 0, image.shape[1], image.shape[0]]]) if len(boxes) == 0 else boxes
+        detection_boxes = np.array([[0, 0, image.shape[1], image.shape[0]]]) if len(detection_boxes) == 0 else detection_boxes
 
         extras['sam_prompt'] = []
-        # from PIL import ImageDraw
-        # draw = ImageDraw.Draw(img)
-        for idx, box in enumerate(boxes):
+        for idx, box in enumerate(detection_boxes):
             box_list = box.tolist()
             if box_erode_or_dilate != 0:
                 box_list[0] -= box_erode_or_dilate
                 box_list[1] -= box_erode_or_dilate
                 box_list[2] += box_erode_or_dilate
                 box_list[3] += box_erode_or_dilate
-        #     draw.rectangle(box_list, fill=128, outline ="red")
             extras['sam_prompt'] += [{"type": "rectangle", "data": box_list}]
-        # img.show()
+
+        if debug_dino:
+            from PIL import ImageDraw, Image
+            image_with_boxes = Image.new("RGB", (image.shape[1], image.shape[0]), color="black")
+            draw = ImageDraw.Draw(image_with_boxes)
+            for box in extras['sam_prompt']:
+                draw.rectangle(box['data'], fill="white")
+            return np.array(image_with_boxes)
 
     return remove(
         image,
         session=new_session(mask_model, **extras),
         only_mask=True,
-        # post_process_mask=True,
         **extras
     )
diff --git a/language/en.json b/language/en.json
index a5f1e271..62c0ac31 100644
--- a/language/en.json
+++ b/language/en.json
@@ -377,10 +377,15 @@
     "Disable preview during generation.": "Disable preview during generation.",
     "Disable Intermediate Results": "Disable Intermediate Results",
     "Disable intermediate results during generation, only show final gallery.": "Disable intermediate results during generation, only show final gallery.",
+    "Debug Inpaint Preprocessing": "Debug Inpaint Preprocessing",
+    "Debug GroundingDINO": "Debug GroundingDINO",
+    "Used for SAM object detection and box generation": "Used for SAM object detection and box generation",
+    "GroundingDINO Box Erode or Dilate": "GroundingDINO Box Erode or Dilate",
     "Inpaint Engine": "Inpaint Engine",
     "v1": "v1",
     "Version of Fooocus inpaint model": "Version of Fooocus inpaint model",
     "v2.5": "v2.5",
+    "v2.6": "v2.6",
     "Control Debug": "Control Debug",
     "Debug Preprocessors": "Debug Preprocessors",
     "Mixing Image Prompt and Vary/Upscale": "Mixing Image Prompt and Vary/Upscale",
diff --git a/webui.py b/webui.py
index db1c98d0..f929e49f 100644
--- a/webui.py
+++ b/webui.py
@@ -231,7 +231,7 @@ with shared.gradio_root:
                                     inpaint_mask_text_threshold = gr.Slider(label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.05)
                                 generate_mask_button = gr.Button(value='Generate mask from image')
 
-                                def generate_mask(image, mask_model, cloth_category, sam_prompt_text, sam_model, sam_quant, box_threshold, text_threshold):
+                                def generate_mask(image, mask_model, cloth_category, sam_prompt_text, sam_model, sam_quant, box_threshold, text_threshold, debug_dino, dino_erode_or_dilate):
                                     from extras.inpaint_mask import generate_mask_from_image
 
                                     extras = {}
@@ -244,19 +244,7 @@ with shared.gradio_root:
                                         extras['box_threshold'] = box_threshold
                                         extras['text_threshold'] = text_threshold
 
-                                    return generate_mask_from_image(image, mask_model, extras)
-
-                                generate_mask_button.click(fn=generate_mask,
-                                                           inputs=[
-                                                               inpaint_input_image, inpaint_mask_model,
-                                                               inpaint_mask_cloth_category,
-                                                               inpaint_mask_sam_prompt_text,
-                                                               inpaint_mask_sam_model,
-                                                               inpaint_mask_sam_quant,
-                                                               inpaint_mask_box_threshold,
-                                                               inpaint_mask_text_threshold
-                                                           ],
-                                                           outputs=inpaint_mask_image, show_progress=True, queue=True)
+                                    return generate_mask_from_image(image, mask_model, extras, dino_erode_or_dilate, debug_dino)
 
                                 inpaint_mask_model.change(lambda x: [gr.update(visible=x == 'u2net_cloth_seg'), gr.update(visible=x == 'sam'), gr.update(visible=x == 'sam')],
                                                           inputs=inpaint_mask_model,
@@ -570,6 +558,8 @@ with shared.gradio_root:
 
                     with gr.Tab(label='Inpaint'):
                         debugging_inpaint_preprocessor = gr.Checkbox(label='Debug Inpaint Preprocessing', value=False)
+                        debug_dino = gr.Checkbox(label='Debug GroundingDINO', value=False,
+                                                 info='Used for SAM object detection and box generation')
                         inpaint_disable_initial_latent = gr.Checkbox(label='Disable initial latent in inpaint', value=False)
                         inpaint_engine = gr.Dropdown(label='Inpaint Engine',
                                                      value=modules.config.default_inpaint_engine_version,
@@ -592,6 +582,10 @@ with shared.gradio_root:
                                                             info='Positive value will make white area in the mask larger, '
                                                                  'negative value will make white area smaller.'
                                                                  '(default is 0, always process before any mask invert)')
+                        dino_erode_or_dilate = gr.Slider(label='GroundingDINO Box Erode or Dilate',
+                                                         minimum=-64, maximum=64, step=1, value=0,
+                                                         info='Positive value will make white area in the mask larger, '
+                                                              'negative value will make white area smaller.')
                         inpaint_mask_upload_checkbox = gr.Checkbox(label='Enable Mask Upload', value=False)
                         invert_mask_checkbox = gr.Checkbox(label='Invert Mask', value=False)
 
@@ -741,6 +735,13 @@ with shared.gradio_root:
             inpaint_strength, inpaint_respective_field
         ], show_progress=False, queue=False)
 
+        generate_mask_button.click(fn=generate_mask,
+                                   inputs=[inpaint_input_image, inpaint_mask_model, inpaint_mask_cloth_category,
+                                           inpaint_mask_sam_prompt_text, inpaint_mask_sam_model, inpaint_mask_sam_quant,
+                                           inpaint_mask_box_threshold, inpaint_mask_text_threshold, debug_dino,
+                                           dino_erode_or_dilate],
+                                   outputs=inpaint_mask_image, show_progress=True, queue=True)
+
         ctrls = [currentTask, generate_image_grid]
         ctrls += [
             prompt, negative_prompt, translate_prompts, style_selections,