Merge branch 'feature/multi-step-rendering'

commit 89f8725228
@@ -28,8 +28,8 @@ args_parser.parser.add_argument("--disable-metadata", action='store_true',
 args_parser.parser.add_argument("--disable-preset-download", action='store_true',
                                 help="Disables downloading models for presets", default=False)

-args_parser.parser.add_argument("--enable-describe-uov-image", action='store_true',
-                                help="Disables automatic description of uov images when prompt is empty", default=False)
+args_parser.parser.add_argument("--enable-auto-describe-image", action='store_true',
+                                help="Enables automatic description of uov and enhance image when prompt is empty", default=False)

 args_parser.parser.add_argument("--always-download-new-model", action='store_true',
                                 help="Always download newer models ", default=False)
@@ -99,7 +99,7 @@ div:has(> #positive_prompt) {
 }

 .advanced_check_row {
-  width: 250px !important;
+  width: 330px !important;
 }

 .min_check {
@@ -0,0 +1,24 @@
+# https://github.com/sail-sg/EditAnything/blob/main/sam2groundingdino_edit.py
+
+import numpy as np
+from PIL import Image
+
+from extras.inpaint_mask import SAMOptions, generate_mask_from_image
+
+original_image = Image.open('cat.webp')
+image = np.array(original_image, dtype=np.uint8)
+
+sam_options = SAMOptions(
+    dino_prompt='eye',
+    dino_box_threshold=0.3,
+    dino_text_threshold=0.25,
+    dino_erode_or_dilate=0,
+    dino_debug=False,
+    max_detections=2,
+    model_type='vit_b'
+)
+
+mask_image, _, _, _ = generate_mask_from_image(image, sam_options=sam_options)
+
+merged_masks_img = Image.fromarray(mask_image)
+merged_masks_img.show()
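Note: the three discarded values in the example above are detection counters. A small sketch of reading them instead, with names taken from the generate_mask_from_image signature further down in this diff:

# sketch: unpacking the counters instead of discarding them
mask_image, dino_count, sam_count, on_mask_count = generate_mask_from_image(image, sam_options=sam_options)
print(f'DINO boxes: {dino_count}, SAM masks: {sam_count}, masks merged: {on_mask_count}')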
@@ -25,7 +25,7 @@ class GroundingDinoModel(Model):
             caption: str,
             box_threshold: float = 0.35,
             text_threshold: float = 0.25
-    ) -> Tuple[sv.Detections, List[str]]:
+    ) -> Tuple[sv.Detections, torch.Tensor, torch.Tensor, List[str]]:
         if self.model is None:
             filename = load_file_from_url(
                 url="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth",
@@ -56,7 +56,7 @@ class GroundingDinoModel(Model):
                 source_w=source_w,
                 boxes=boxes,
                 logits=logits)
-        return detections, phrases
+        return detections, boxes, logits, phrases


    def predict(
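Note: callers of default_groundingdino now unpack four values instead of two. A minimal sketch of the new call shape (variable names assumed; the box format follows the conversion code in extras/inpaint_mask.py below):

# sketch: consuming the widened return value
detections, boxes, logits, phrases = default_groundingdino(
    image=image,          # HWC uint8 np.ndarray
    caption='eye',
    box_threshold=0.3,
    text_threshold=0.25
)
# boxes: normalized (cx, cy, w, h) tensors; logits: one confidence per box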
@@ -41,7 +41,7 @@ class Censor:
         model_management.load_model_gpu(self.safety_checker_model)

         single = False
-        if not isinstance(images, list) or isinstance(images, np.ndarray):
+        if not isinstance(images, (list, np.ndarray)):
             images = [images]
             single = True

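Note: this is a real bug fix, not a style change. Under the old condition an np.ndarray always satisfied not isinstance(images, list), so a batch array was wrapped as if it were a single image; the tuple form lets lists and arrays pass through as batches. A toy check of the difference:

# sketch: comparing the two conditions on a batch array
import numpy as np

def old_cond(images):
    return not isinstance(images, list) or isinstance(images, np.ndarray)

def new_cond(images):
    return not isinstance(images, (list, np.ndarray))

batch = np.zeros((2, 8, 8, 3), dtype=np.uint8)      # two stacked images
print(old_cond(batch), new_cond(batch))             # True False: old code wrapped the whole batch
print(old_cond([batch[0]]), new_cond([batch[0]]))   # False False: lists agree either way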
@@ -1,42 +1,130 @@
 from PIL import Image
+import sys

+import modules.config
 import numpy as np
 import torch
-from rembg import remove, new_session
 from extras.GroundingDINO.util.inference import default_groundingdino

-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+from extras.sam.predictor import SamPredictor
+from rembg import remove, new_session
+from segment_anything import sam_model_registry
+from segment_anything.utils.amg import remove_small_regions


-def run_grounded_sam(input_image, text_prompt, box_threshold, text_threshold):
+class SAMOptions:
+    def __init__(self,
+                 # GroundingDINO
+                 dino_prompt: str = '',
+                 dino_box_threshold=0.3,
+                 dino_text_threshold=0.25,
+                 dino_erode_or_dilate=0,
+                 dino_debug=False,
+
-    # run grounding dino model
-    boxes, _ = default_groundingdino(
-        image=np.array(input_image),
-        caption=text_prompt,
-        box_threshold=box_threshold,
-        text_threshold=text_threshold
-    )
-
-    return boxes.xyxy
+                 # SAM
+                 max_detections=2,
+                 model_type='vit_b'
+                 ):
+        self.dino_prompt = dino_prompt
+        self.dino_box_threshold = dino_box_threshold
+        self.dino_text_threshold = dino_text_threshold
+        self.dino_erode_or_dilate = dino_erode_or_dilate
+        self.dino_debug = dino_debug
+        self.max_detections = max_detections
+        self.model_type = model_type


-def generate_mask_from_image(image, mask_model, extras):
+def optimize_masks(masks: torch.Tensor) -> torch.Tensor:
+    """
+    removes small disconnected regions and holes
+    """
+    fine_masks = []
+    for mask in masks.to('cpu').numpy():  # masks: [num_masks, 1, h, w]
+        fine_masks.append(remove_small_regions(mask[0], 400, mode="holes")[0])
+    masks = np.stack(fine_masks, axis=0)[:, np.newaxis]
+    return torch.from_numpy(masks)
+
+
+def generate_mask_from_image(image: np.ndarray, mask_model: str = 'sam', extras=None,
+                             sam_options: SAMOptions | None = SAMOptions()) -> tuple[np.ndarray | None, int | None, int | None, int | None]:
+    dino_detection_count = 0
+    sam_detection_count = 0
+    sam_detection_on_mask_count = 0
+
     if image is None:
-        return
+        return None, dino_detection_count, sam_detection_count, sam_detection_on_mask_count

     if extras is None:
         extras = {}

     if 'image' in image:
         image = image['image']

-    if mask_model == 'sam':
-        boxes = run_grounded_sam(Image.fromarray(image), extras['sam_prompt_text'], box_threshold=extras['box_threshold'], text_threshold=extras['text_threshold'])
-        boxes = np.array([[0, 0, image.shape[1], image.shape[0]]]) if len(boxes) == 0 else boxes
-        extras['sam_prompt'] = []
-        for idx, box in enumerate(boxes):
-            extras['sam_prompt'] += [{"type": "rectangle", "data": box.tolist()}]
+    if mask_model != 'sam' or sam_options is None:
+        result = remove(
+            image,
+            session=new_session(mask_model, **extras),
+            only_mask=True,
+            **extras
+        )

-    return remove(
-        image,
-        session=new_session(mask_model, **extras),
-        only_mask=True,
-        **extras
-    )
+        return result, dino_detection_count, sam_detection_count, sam_detection_on_mask_count

+    detections, boxes, logits, phrases = default_groundingdino(
+        image=image,
+        caption=sam_options.dino_prompt,
+        box_threshold=sam_options.dino_box_threshold,
+        text_threshold=sam_options.dino_text_threshold
+    )
+
+    H, W = image.shape[0], image.shape[1]
+    boxes = boxes * torch.Tensor([W, H, W, H])
+    boxes[:, :2] = boxes[:, :2] - boxes[:, 2:] / 2
+    boxes[:, 2:] = boxes[:, 2:] + boxes[:, :2]
+
+    sam_checkpoint = modules.config.download_sam_model(sam_options.model_type)
+    sam = sam_model_registry[sam_options.model_type](checkpoint=sam_checkpoint)
+
+    sam_predictor = SamPredictor(sam)
+    final_mask_tensor = torch.zeros((image.shape[0], image.shape[1]))
+    dino_detection_count = boxes.size(0)
+
+    if dino_detection_count > 0:
+        sam_predictor.set_image(image)
+
+        if sam_options.dino_erode_or_dilate != 0:
+            for index in range(boxes.size(0)):
+                assert boxes.size(1) == 4
+                boxes[index][0] -= sam_options.dino_erode_or_dilate
+                boxes[index][1] -= sam_options.dino_erode_or_dilate
+                boxes[index][2] += sam_options.dino_erode_or_dilate
+                boxes[index][3] += sam_options.dino_erode_or_dilate
+
+        if sam_options.dino_debug:
+            from PIL import ImageDraw, Image
+            debug_dino_image = Image.new("RGB", (image.shape[1], image.shape[0]), color="black")
+            draw = ImageDraw.Draw(debug_dino_image)
+            for box in boxes.numpy():
+                draw.rectangle(box.tolist(), fill="white")
+            return np.array(debug_dino_image), dino_detection_count, sam_detection_count, sam_detection_on_mask_count
+
+        transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes, image.shape[:2])
+        masks, _, _ = sam_predictor.predict_torch(
+            point_coords=None,
+            point_labels=None,
+            boxes=transformed_boxes,
+            multimask_output=False,
+        )
+
+        masks = optimize_masks(masks)
+        sam_detection_count = len(masks)
+        if sam_options.max_detections == 0:
+            sam_options.max_detections = sys.maxsize
+        sam_objects = min(len(logits), sam_options.max_detections)
+        for obj_ind in range(sam_objects):
+            mask_tensor = masks[obj_ind][0]
+            final_mask_tensor += mask_tensor
+            sam_detection_on_mask_count += 1
+
+    final_mask_tensor = (final_mask_tensor > 0).to('cpu').numpy()
+    mask_image = np.dstack((final_mask_tensor, final_mask_tensor, final_mask_tensor)) * 255
+    mask_image = np.array(mask_image, dtype=np.uint8)
+    return mask_image, dino_detection_count, sam_detection_count, sam_detection_on_mask_count
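Note on the box arithmetic in the hunk above: GroundingDINO returns boxes as normalized (cx, cy, w, h); the three tensor lines scale them to pixels and convert them to corner (x1, y1, x2, y2) format before handing them to SAM. The same arithmetic in isolation, with toy values:

import torch

# sketch: normalized center boxes -> pixel corner boxes
W, H = 640, 480
boxes = torch.tensor([[0.5, 0.5, 0.25, 0.5]])    # (cx, cy, w, h) in [0, 1]
boxes = boxes * torch.Tensor([W, H, W, H])        # scale to pixel units
boxes[:, :2] = boxes[:, :2] - boxes[:, 2:] / 2    # center -> top-left corner
boxes[:, 2:] = boxes[:, 2:] + boxes[:, :2]        # width/height -> bottom-right corner
print(boxes)  # tensor([[240., 120., 400., 360.]])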
@@ -0,0 +1,288 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from ldm_patched.modules import model_management
+from ldm_patched.modules.model_patcher import ModelPatcher
+
+from segment_anything.modeling import Sam
+
+from typing import Optional, Tuple
+
+from segment_anything.utils.transforms import ResizeLongestSide
+
+
+class SamPredictor:
+    def __init__(
+        self,
+        model: Sam,
+        load_device=model_management.text_encoder_device(),
+        offload_device=model_management.text_encoder_offload_device()
+    ) -> None:
+        """
+        Uses SAM to calculate the image embedding for an image, and then
+        allow repeated, efficient mask prediction given prompts.
+
+        Arguments:
+          model (Sam): The model to use for mask prediction.
+        """
+        super().__init__()
+
+        self.load_device = load_device
+        self.offload_device = offload_device
+        # can't use model.half() here as slow_conv2d_cpu is not implemented for half
+        model.to(self.offload_device)
+
+        self.patcher = ModelPatcher(model, load_device=self.load_device, offload_device=self.offload_device)
+
+        self.transform = ResizeLongestSide(model.image_encoder.img_size)
+        self.reset_image()
+
+    def set_image(
+        self,
+        image: np.ndarray,
+        image_format: str = "RGB",
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image, allowing
+        masks to be predicted with the 'predict' method.
+
+        Arguments:
+          image (np.ndarray): The image for calculating masks. Expects an
+            image in HWC uint8 format, with pixel values in [0, 255].
+          image_format (str): The color format of the image, in ['RGB', 'BGR'].
+        """
+        assert image_format in [
+            "RGB",
+            "BGR",
+        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
+        if image_format != self.patcher.model.image_format:
+            image = image[..., ::-1]
+
+        # Transform the image to the form expected by the model
+        input_image = self.transform.apply_image(image)
+        input_image_torch = torch.as_tensor(input_image, device=self.load_device)
+        input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]
+
+        self.set_torch_image(input_image_torch, image.shape[:2])
+
+    @torch.no_grad()
+    def set_torch_image(
+        self,
+        transformed_image: torch.Tensor,
+        original_image_size: Tuple[int, ...],
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image, allowing
+        masks to be predicted with the 'predict' method. Expects the input
+        image to be already transformed to the format expected by the model.
+
+        Arguments:
+          transformed_image (torch.Tensor): The input image, with shape
+            1x3xHxW, which has been transformed with ResizeLongestSide.
+          original_image_size (tuple(int, int)): The size of the image
+            before transformation, in (H, W) format.
+        """
+        assert (
+            len(transformed_image.shape) == 4
+            and transformed_image.shape[1] == 3
+            and max(*transformed_image.shape[2:]) == self.patcher.model.image_encoder.img_size
+        ), f"set_torch_image input must be BCHW with long side {self.patcher.model.image_encoder.img_size}."
+        self.reset_image()
+
+        self.original_size = original_image_size
+        self.input_size = tuple(transformed_image.shape[-2:])
+        model_management.load_model_gpu(self.patcher)
+        input_image = self.patcher.model.preprocess(transformed_image.to(self.load_device))
+        self.features = self.patcher.model.image_encoder(input_image)
+        self.is_image_set = True
+
+    def predict(
+        self,
+        point_coords: Optional[np.ndarray] = None,
+        point_labels: Optional[np.ndarray] = None,
+        box: Optional[np.ndarray] = None,
+        mask_input: Optional[np.ndarray] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predict masks for the given input prompts, using the currently set image.
+
+        Arguments:
+          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (np.ndarray or None): A length N array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point.
+          box (np.ndarray or None): A length 4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (np.ndarray): A low resolution mask input to the model, typically
+            coming from a previous prediction iteration. Has form 1xHxW, where
+            for SAM, H=W=256.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+
+        Returns:
+          (np.ndarray): The output masks in CxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (np.ndarray): An array of length C containing the model's
+            predictions for the quality of each mask.
+          (np.ndarray): An array of shape CxHxW, where C is the number
+            of masks and H=W=256. These low resolution logits can be passed to
+            a subsequent iteration as mask input.
+        """
+        if not self.is_image_set:
+            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")
+
+        # Transform input prompts
+        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
+        if point_coords is not None:
+            assert (
+                point_labels is not None
+            ), "point_labels must be supplied if point_coords is supplied."
+            point_coords = self.transform.apply_coords(point_coords, self.original_size)
+            coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.load_device)
+            labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.load_device)
+            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
+        if box is not None:
+            box = self.transform.apply_boxes(box, self.original_size)
+            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.load_device)
+            box_torch = box_torch[None, :]
+        if mask_input is not None:
+            mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.load_device)
+            mask_input_torch = mask_input_torch[None, :, :, :]
+
+        masks, iou_predictions, low_res_masks = self.predict_torch(
+            coords_torch,
+            labels_torch,
+            box_torch,
+            mask_input_torch,
+            multimask_output,
+            return_logits=return_logits,
+        )
+
+        masks = masks[0].detach().cpu().numpy()
+        iou_predictions = iou_predictions[0].detach().cpu().numpy()
+        low_res_masks = low_res_masks[0].detach().cpu().numpy()
+        return masks, iou_predictions, low_res_masks
+
+    @torch.no_grad()
+    def predict_torch(
+        self,
+        point_coords: Optional[torch.Tensor],
+        point_labels: Optional[torch.Tensor],
+        boxes: Optional[torch.Tensor] = None,
+        mask_input: Optional[torch.Tensor] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predict masks for the given input prompts, using the currently set image.
+        Input prompts are batched torch tensors and are expected to already be
+        transformed to the input frame using ResizeLongestSide.
+
+        Arguments:
+          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (torch.Tensor or None): A BxN array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point.
+          box (np.ndarray or None): A Bx4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (np.ndarray): A low resolution mask input to the model, typically
+            coming from a previous prediction iteration. Has form Bx1xHxW, where
+            for SAM, H=W=256. Masks returned by a previous iteration of the
+            predict method do not need further transformation.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+
+        Returns:
+          (torch.Tensor): The output masks in BxCxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (torch.Tensor): An array of shape BxC containing the model's
+            predictions for the quality of each mask.
+          (torch.Tensor): An array of shape BxCxHxW, where C is the number
+            of masks and H=W=256. These low res logits can be passed to
+            a subsequent iteration as mask input.
+        """
+        if not self.is_image_set:
+            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")
+
+        if point_coords is not None:
+            points = (point_coords.to(self.load_device), point_labels.to(self.load_device))
+        else:
+            points = None
+
+        # load
+        if boxes is not None:
+            boxes = boxes.to(self.load_device)
+        if mask_input is not None:
+            mask_input = mask_input.to(self.load_device)
+        model_management.load_model_gpu(self.patcher)
+
+        # Embed prompts
+        sparse_embeddings, dense_embeddings = self.patcher.model.prompt_encoder(
+            points=points,
+            boxes=boxes,
+            masks=mask_input,
+        )
+
+        # Predict masks
+        low_res_masks, iou_predictions = self.patcher.model.mask_decoder(
+            image_embeddings=self.features,
+            image_pe=self.patcher.model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+        )
+
+        # Upscale the masks to the original image resolution
+        masks = self.patcher.model.postprocess_masks(low_res_masks, self.input_size, self.original_size)
+
+        if not return_logits:
+            masks = masks > self.patcher.model.mask_threshold
+
+        return masks, iou_predictions, low_res_masks
+
+    def get_image_embedding(self) -> torch.Tensor:
+        """
+        Returns the image embeddings for the currently set image, with
+        shape 1xCxHxW, where C is the embedding dimension and (H,W) are
+        the embedding spatial dimension of SAM (typically C=256, H=W=64).
+        """
+        if not self.is_image_set:
+            raise RuntimeError(
+                "An image must be set with .set_image(...) to generate an embedding."
+            )
+        assert self.features is not None, "Features must exist if an image has been set."
+        return self.features
+
+    @property
+    def device(self) -> torch.device:
+        return self.patcher.model.device
+
+    def reset_image(self) -> None:
+        """Resets the currently set image."""
+        self.is_image_set = False
+        self.features = None
+        self.orig_h = None
+        self.orig_w = None
+        self.input_h = None
+        self.input_w = None
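Note: a minimal usage sketch of this patcher-backed predictor, mirroring how generate_mask_from_image drives it above (the checkpoint filename and the vit_b choice are assumptions):

import numpy as np
import torch
from segment_anything import sam_model_registry
from extras.sam.predictor import SamPredictor

# sketch: box-prompted prediction (checkpoint path assumed to be downloaded already)
sam = sam_model_registry['vit_b'](checkpoint='sam_vit_b_01ec64.pth')
predictor = SamPredictor(sam)

image = np.zeros((480, 640, 3), dtype=np.uint8)   # HWC uint8, as set_image expects
predictor.set_image(image)

boxes = torch.tensor([[240., 120., 400., 360.]])  # pixel XYXY
transformed = predictor.transform.apply_boxes_torch(boxes, image.shape[:2])
masks, scores, low_res = predictor.predict_torch(
    point_coords=None,
    point_labels=None,
    boxes=transformed,
    multimask_output=False,
)
print(masks.shape)  # (1, 1, 480, 640) boolean masks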
@@ -1 +1 @@
-version = '2.4.3 (mashb1t)'
+version = '2.5.0-rc6 (mashb1t)'
@@ -44,14 +44,7 @@
     "Top": "Top",
     "Bottom": "Bottom",
     "* \"Inpaint or Outpaint\" is powered by the sampler \"DPMPP Fooocus Seamless 2M SDE Karras Inpaint Sampler\" (beta)": "* \"Inpaint or Outpaint\" is powered by the sampler \"DPMPP Fooocus Seamless 2M SDE Karras Inpaint Sampler\" (beta)",
-    "Mask generation model": "Mask generation model",
-    "Cloth category": "Cloth category",
-    "Segmentation prompt": "Segmentation prompt",
     "Advanced options": "Advanced options",
-    "SAM model": "SAM model",
-    "Quantization": "Quantization",
-    "Box Threshold": "Box Threshold",
-    "Text Threshold": "Text Threshold",
     "Generate mask from image": "Generate mask from image",
     "Setting": "Setting",
     "Style": "Style",
@@ -377,10 +370,14 @@
     "Disable preview during generation.": "Disable preview during generation.",
     "Disable Intermediate Results": "Disable Intermediate Results",
     "Disable intermediate results during generation, only show final gallery.": "Disable intermediate results during generation, only show final gallery.",
     "Debug Inpaint Preprocessing": "Debug Inpaint Preprocessing",
+    "Debug GroundingDINO": "Debug GroundingDINO",
+    "Used for SAM object detection and box generation": "Used for SAM object detection and box generation",
+    "GroundingDINO Box Erode or Dilate": "GroundingDINO Box Erode or Dilate",
     "Inpaint Engine": "Inpaint Engine",
     "v1": "v1",
-    "Version of Fooocus inpaint model": "Version of Fooocus inpaint model",
+    "v2.5": "v2.5",
+    "v2.6": "v2.6",
     "Control Debug": "Control Debug",
     "Debug Preprocessors": "Debug Preprocessors",
     "Mixing Image Prompt and Vary/Upscale": "Mixing Image Prompt and Vary/Upscale",
@@ -410,5 +407,63 @@
     "Image Prompt parameters are not included. Use png and a1111 for compatibility with Civitai.": "Image Prompt parameters are not included. Use png and a1111 for compatibility with Civitai.",
     "fooocus (json)": "fooocus (json)",
     "a1111 (plain text)": "a1111 (plain text)",
-    "Unsupported image type in input": "Unsupported image type in input"
+    "Unsupported image type in input": "Unsupported image type in input",
+    "Enhance": "Enhance",
+    "Detection prompt": "Detection prompt",
+    "Detection Prompt Quick List": "Detection Prompt Quick List",
+    "Maximum number of detections": "Maximum number of detections",
+    "Base image for enhance": "Base image for enhance",
+    "Order of Processing": "Order of Processing",
+    "Use before for enhancement of small details and after for large areas.": "Use before for enhancement of small details and after for large areas.",
+    "Before First Enhancement": "Before First Enhancement",
+    "After Last Enhancement": "After Last Enhancement",
+    "Prompt Type": "Prompt Type",
+    "Choose which prompt to use for Upscale or Variation.": "Choose which prompt to use for Upscale or Variation.",
+    "Original Prompts": "Original Prompts",
+    "Last Filled Enhancement Prompts": "Last Filled Enhancement Prompts",
+    "Enable": "Enable",
+    "Describe what you want to detect.": "Describe what you want to detect.",
+    "Enhancement positive prompt": "Enhancement positive prompt",
+    "Uses original prompt instead if empty.": "Uses original prompt instead if empty.",
+    "Enhancement negative prompt": "Enhancement negative prompt",
+    "Uses original negative prompt instead if empty.": "Uses original negative prompt instead if empty.",
+    "Detection": "Detection",
+    "u2net": "u2net",
+    "u2netp": "u2netp",
+    "u2net_human_seg": "u2net_human_seg",
+    "u2net_cloth_seg": "u2net_cloth_seg",
+    "silueta": "silueta",
+    "isnet-general-use": "isnet-general-use",
+    "isnet-anime": "isnet-anime",
+    "sam": "sam",
+    "Mask generation model": "Mask generation model",
+    "Cloth category": "Cloth category",
+    "Use singular whenever possible": "Use singular whenever possible",
+    "full": "full",
+    "upper": "upper",
+    "lower": "lower",
+    "SAM Options": "SAM Options",
+    "SAM model": "SAM model",
+    "vit_b": "vit_b",
+    "vit_l": "vit_l",
+    "vit_h": "vit_h",
+    "Box Threshold": "Box Threshold",
+    "Text Threshold": "Text Threshold",
+    "Set to 0 to detect all": "Set to 0 to detect all",
+    "Inpaint": "Inpaint",
+    "Inpaint or Outpaint (default)": "Inpaint or Outpaint (default)",
+    "Improve Detail (face, hand, eyes, etc.)": "Improve Detail (face, hand, eyes, etc.)",
+    "Modify Content (add objects, change background, etc.)": "Modify Content (add objects, change background, etc.)",
+    "Disable initial latent in inpaint": "Disable initial latent in inpaint",
+    "Version of Fooocus inpaint model. If set, use performance Quality or Speed (no performance LoRAs) for best results.": "Version of Fooocus inpaint model. If set, use performance Quality or Speed (no performance LoRAs) for best results.",
+    "Inpaint Denoising Strength": "Inpaint Denoising Strength",
+    "Same as the denoising strength in A1111 inpaint. Only used in inpaint, not used in outpaint. (Outpaint always use 1.0)": "Same as the denoising strength in A1111 inpaint. Only used in inpaint, not used in outpaint. (Outpaint always use 1.0)",
+    "Inpaint Respective Field": "Inpaint Respective Field",
+    "The area to inpaint. Value 0 is same as \"Only Masked\" in A1111. Value 1 is same as \"Whole Image\" in A1111. Only used in inpaint, not used in outpaint. (Outpaint always use 1.0)": "The area to inpaint. Value 0 is same as \"Only Masked\" in A1111. Value 1 is same as \"Whole Image\" in A1111. Only used in inpaint, not used in outpaint. (Outpaint always use 1.0)",
+    "Mask Erode or Dilate": "Mask Erode or Dilate",
+    "Positive value will make white area in the mask larger, negative value will make white area smaller. (default is 0, always processed before any mask invert)": "Positive value will make white area in the mask larger, negative value will make white area smaller. (default is 0, always processed before any mask invert)",
+    "Invert Mask": "Invert Mask",
+    "Debug Enhance Masks": "Debug Enhance Masks",
+    "Show enhance masks in preview and final results": "Show enhance masks in preview and final results",
+    "Use GroundingDINO boxes instead of more detailed SAM masks": "Use GroundingDINO boxes instead of more detailed SAM masks"
 }
(File diff suppressed because it is too large)
@@ -201,6 +201,7 @@ path_fooocus_expansion = get_dir_or_set_default('path_fooocus_expansion', '../mo
-path_safety_checker_models = get_dir_or_set_default('path_safety_checker_models', '../models/safety_checker_models/')
 path_wildcards = get_dir_or_set_default('path_wildcards', '../wildcards/')
+path_safety_checker = get_dir_or_set_default('path_safety_checker', '../models/safety_checker/')
+path_sam = get_dir_or_set_default('path_sam', '../models/sam/')
 path_outputs = get_path_output()


@@ -500,6 +501,50 @@ example_inpaint_prompts = get_config_item_or_set_default(
     validator=lambda x: isinstance(x, list) and all(isinstance(v, str) for v in x),
     expected_type=list
 )
+example_enhance_detection_prompts = get_config_item_or_set_default(
+    key='example_enhance_detection_prompts',
+    default_value=[
+        'face', 'eye', 'mouth', 'hair', 'hand', 'body'
+    ],
+    validator=lambda x: isinstance(x, list) and all(isinstance(v, str) for v in x),
+    expected_type=list
+)
+default_enhance_tabs = get_config_item_or_set_default(
+    key='default_enhance_tabs',
+    default_value=3,
+    validator=lambda x: isinstance(x, int) and 1 <= x <= 5,
+    expected_type=int
+)
+default_enhance_checkbox = get_config_item_or_set_default(
+    key='default_enhance_checkbox',
+    default_value=False,
+    validator=lambda x: isinstance(x, bool),
+    expected_type=bool
+)
+default_enhance_uov_method = get_config_item_or_set_default(
+    key='default_enhance_uov_method',
+    default_value=modules.flags.disabled,
+    validator=lambda x: x in modules.flags.uov_list,
+    expected_type=str
+)
+default_enhance_uov_processing_order = get_config_item_or_set_default(
+    key='default_enhance_uov_processing_order',
+    default_value=modules.flags.enhancement_uov_before,
+    validator=lambda x: x in modules.flags.enhancement_uov_processing_order,
+    expected_type=str
+)
+default_enhance_uov_prompt_type = get_config_item_or_set_default(
+    key='default_enhance_uov_prompt_type',
+    default_value=modules.flags.enhancement_uov_prompt_type_original,
+    validator=lambda x: x in modules.flags.enhancement_uov_prompt_types,
+    expected_type=str
+)
+default_sam_max_detections = get_config_item_or_set_default(
+    key='default_sam_max_detections',
+    default_value=0,
+    validator=lambda x: isinstance(x, int) and 0 <= x <= 10,
+    expected_type=int
+)
 default_black_out_nsfw = get_config_item_or_set_default(
     key='default_black_out_nsfw',
     default_value=False,
@@ -526,13 +571,8 @@ metadata_created_by = get_config_item_or_set_default(
 )

 example_inpaint_prompts = [[x] for x in example_inpaint_prompts]
+example_enhance_detection_prompts = [[x] for x in example_enhance_detection_prompts]

-default_black_out_nsfw = get_config_item_or_set_default(
-    key='default_black_out_nsfw',
-    default_value=False,
-    validator=lambda x: isinstance(x, bool),
-    expected_type=bool
-)
 default_inpaint_mask_model = get_config_item_or_set_default(
     key='default_inpaint_mask_model',
     default_value='isnet-general-use',
@@ -540,6 +580,13 @@ default_inpaint_mask_model = get_config_item_or_set_default(
     expected_type=str
 )

+default_enhance_inpaint_mask_model = get_config_item_or_set_default(
+    key='default_enhance_inpaint_mask_model',
+    default_value='sam',
+    validator=lambda x: x in modules.flags.inpaint_mask_models,
+    expected_type=str
+)
+
 default_inpaint_mask_cloth_category = get_config_item_or_set_default(
     key='default_inpaint_mask_cloth_category',
     default_value='full',
@@ -549,8 +596,8 @@ default_inpaint_mask_cloth_category = get_config_item_or_set_default(

 default_inpaint_mask_sam_model = get_config_item_or_set_default(
     key='default_inpaint_mask_sam_model',
-    default_value='sam_vit_b_01ec64',
-    validator=lambda x: x in modules.flags.inpaint_mask_sam_model,
+    default_value='vit_b',
+    validator=lambda x: x in [y[1] for y in modules.flags.inpaint_mask_sam_model if y[1] == x],
     expected_type=str
 )

@@ -789,4 +836,43 @@ def downloading_safety_checker_model():
     return os.path.join(path_safety_checker, 'stable-diffusion-safety-checker.bin')


+def download_sam_model(sam_model: str) -> str:
+    match sam_model:
+        case 'vit_b':
+            return downloading_sam_vit_b()
+        case 'vit_l':
+            return downloading_sam_vit_l()
+        case 'vit_h':
+            return downloading_sam_vit_h()
+        case _:
+            raise ValueError(f"sam model {sam_model} does not exist.")
+
+
+def downloading_sam_vit_b():
+    load_file_from_url(
+        url='https://huggingface.co/mashb1t/misc/resolve/main/sam_vit_b_01ec64.pth',
+        model_dir=path_sam,
+        file_name='sam_vit_b_01ec64.pth'
+    )
+    return os.path.join(path_sam, 'sam_vit_b_01ec64.pth')
+
+
+def downloading_sam_vit_l():
+    load_file_from_url(
+        url='https://huggingface.co/mashb1t/misc/resolve/main/sam_vit_l_0b3195.pth',
+        model_dir=path_sam,
+        file_name='sam_vit_l_0b3195.pth'
+    )
+    return os.path.join(path_sam, 'sam_vit_l_0b3195.pth')
+
+
+def downloading_sam_vit_h():
+    load_file_from_url(
+        url='https://huggingface.co/mashb1t/misc/resolve/main/sam_vit_h_4b8939.pth',
+        model_dir=path_sam,
+        file_name='sam_vit_h_4b8939.pth'
+    )
+    return os.path.join(path_sam, 'sam_vit_h_4b8939.pth')
+
+
 update_files()
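Note: the match/case dispatch requires Python 3.10 or newer. An equivalent dict-based dispatch for older interpreters, as a sketch built on the three helpers above (the compat function name is hypothetical):

# sketch: dict-based dispatch equivalent to the match statement above
_SAM_DOWNLOADERS = {
    'vit_b': downloading_sam_vit_b,
    'vit_l': downloading_sam_vit_l,
    'vit_h': downloading_sam_vit_h,
}

def download_sam_model_compat(sam_model: str) -> str:
    try:
        return _SAM_DOWNLOADERS[sam_model]()
    except KeyError:
        raise ValueError(f"sam model {sam_model} does not exist.")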
@@ -8,9 +8,15 @@ upscale_15 = 'Upscale (1.5x)'
 upscale_2 = 'Upscale (2x)'
 upscale_fast = 'Upscale (Fast 2x)'

-uov_list = [
-    disabled, subtle_variation, strong_variation, upscale_15, upscale_2, upscale_fast
-]
+uov_list = [disabled, subtle_variation, strong_variation, upscale_15, upscale_2, upscale_fast]
+
+enhancement_uov_before = "Before First Enhancement"
+enhancement_uov_after = "After Last Enhancement"
+enhancement_uov_processing_order = [enhancement_uov_before, enhancement_uov_after]
+
+enhancement_uov_prompt_type_original = 'Original Prompts'
+enhancement_uov_prompt_type_last_filled = 'Last Filled Enhancement Prompts'
+enhancement_uov_prompt_types = [enhancement_uov_prompt_type_original, enhancement_uov_prompt_type_last_filled]

 CIVITAI_NO_KARRAS = ["euler", "euler_ancestral", "heun", "dpm_fast", "dpm_adaptive", "ddim", "uni_pc"]

@@ -76,7 +82,7 @@ output_formats = ['png', 'jpeg', 'webp']

 inpaint_mask_models = ['u2net', 'u2netp', 'u2net_human_seg', 'u2net_cloth_seg', 'silueta', 'isnet-general-use', 'isnet-anime', 'sam']
 inpaint_mask_cloth_category = ['full', 'upper', 'lower']
-inpaint_mask_sam_model = ['sam_vit_b_01ec64', 'sam_vit_h_4b8939', 'sam_vit_l_0b3195']
+inpaint_mask_sam_model = ['vit_b', 'vit_l', 'vit_h']

 inpaint_engine_versions = ['None', 'v1', 'v2.5', 'v2.6']
 inpaint_option_default = 'Inpaint or Outpaint (default)'
@@ -107,7 +113,6 @@ metadata_scheme = [
 ]

 controlnet_image_count = 4
-preparation_step_count = 13


 class OutputFormat(Enum):
@@ -163,14 +168,6 @@ class Performance(Enum):
     def values(cls) -> list:
         return list(map(lambda c: c.value, cls))

-    @classmethod
-    def values(cls) -> list:
-        return list(map(lambda c: c.value, cls))
-
-    @classmethod
-    def values(cls) -> list:
-        return list(map(lambda c: c.value, cls))
-
     @classmethod
     def by_steps(cls, steps: int | str):
         return cls[Steps(int(steps)).name]
@@ -390,6 +390,9 @@ def get_enabled_loras(loras: list, remove_none=True) -> list:
 def parse_lora_references_from_prompt(prompt: str, loras: List[Tuple[AnyStr, float]], loras_limit: int = 5,
                                       skip_file_check=False, prompt_cleanup=True, deduplicate_loras=True,
                                       lora_filenames=None) -> tuple[List[Tuple[AnyStr, float]], str]:
+    # prevent unintended side effects when returning without detection
+    loras = loras.copy()
+
     if lora_filenames is None:
         lora_filenames = []

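Note on the added loras.copy(): without it, in-place list mutations later in the function leak back to the caller through the shared reference. A toy illustration (file names invented):

# sketch: why the defensive copy matters
def appends_without_copy(loras):
    loras += [('extra_lora.safetensors', 0.5)]   # mutates the caller's list in place
    return loras

def appends_with_copy(loras):
    loras = loras.copy()                          # later edits stay local
    loras += [('extra_lora.safetensors', 0.5)]
    return loras

caller_loras = [('base_lora.safetensors', 1.0)]
appends_without_copy(caller_loras)
print(len(caller_loras))  # 2 -> caller state changed

caller_loras = [('base_lora.safetensors', 1.0)]
appends_with_copy(caller_loras)
print(len(caller_loras))  # 1 -> caller state preserved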
@@ -13,10 +13,10 @@ omegaconf==2.2.3
 gradio==3.41.2
 pygit2==1.12.2
 opencv-contrib-python==4.8.0.74
 diffusers==0.25.1
 httpx==0.24.1
 onnxruntime==1.16.3
 timm==0.9.2
-translators==5.8.9
-rembg==2.0.53
-groundingdino-py==0.4.0
+translators==5.9.2
+rembg==2.0.57
+groundingdino-py==0.4.0
+segment_anything==1.0
webui.py
@@ -16,6 +16,7 @@ import modules.meta_parser
 import args_manager
 import copy
 import launch
+from extras.inpaint_mask import SAMOptions

 from modules.sdxl_styles import legal_style_names
 from modules.private_logger import get_current_html_path
@@ -89,6 +90,34 @@ def generate_clicked(task: worker.AsyncTask):
         return


+def inpaint_mode_change(mode):
+    assert mode in modules.flags.inpaint_options
+
+    # inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
+    # inpaint_disable_initial_latent, inpaint_engine,
+    # inpaint_strength, inpaint_respective_field
+
+    if mode == modules.flags.inpaint_option_detail:
+        return [
+            gr.update(visible=True), gr.update(visible=False, value=[]),
+            gr.Dataset.update(visible=True, samples=modules.config.example_inpaint_prompts),
+            False, 'None', 0.5, 0.0
+        ]
+
+    if mode == modules.flags.inpaint_option_modify:
+        return [
+            gr.update(visible=True), gr.update(visible=False, value=[]),
+            gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
+            True, modules.config.default_inpaint_engine_version, 1.0, 0.0
+        ]
+
+    return [
+        gr.update(visible=False, value=''), gr.update(visible=True),
+        gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
+        False, modules.config.default_inpaint_engine_version, 1.0, 0.618
+    ]
+
+
 reload_javascript()

 title = f'Fooocus {fooocus_version.version}'
@@ -146,6 +175,7 @@ with shared.gradio_root:
         skip_button.click(skip_clicked, inputs=currentTask, outputs=currentTask, queue=False, show_progress=False)
     with gr.Row(elem_classes='advanced_check_row'):
         input_image_checkbox = gr.Checkbox(label='Input Image', value=False, container=False, elem_classes='min_check')
+        enhance_checkbox = gr.Checkbox(label='Enhance', value=modules.config.default_enhance_checkbox, container=False, elem_classes='min_check')
         advanced_checkbox = gr.Checkbox(label='Advanced', value=modules.config.default_advanced_checkbox, container=False, elem_classes='min_check')
     with gr.Row(visible=False) as image_input_panel:
         with gr.Tabs():
@@ -223,44 +253,56 @@ with shared.gradio_root:
                             choices=flags.inpaint_mask_cloth_category,
                             value=modules.config.default_inpaint_mask_cloth_category,
                             visible=False)
-                        inpaint_mask_sam_prompt_text = gr.Textbox(label='Segmentation prompt', value='', visible=False)
+                        inpaint_mask_dino_prompt_text = gr.Textbox(label='Detection prompt', value='', visible=False, info='Use singular whenever possible', placeholder='Describe what you want to detect.')
+                        example_inpaint_mask_dino_prompt_text = gr.Dataset(
+                            samples=modules.config.example_enhance_detection_prompts,
+                            label='Detection Prompt Quick List',
+                            components=[inpaint_mask_dino_prompt_text],
+                            visible=modules.config.default_inpaint_mask_model == 'sam')
+                        example_inpaint_mask_dino_prompt_text.click(lambda x: x[0],
+                                                                    inputs=example_inpaint_mask_dino_prompt_text,
+                                                                    outputs=inpaint_mask_dino_prompt_text,
+                                                                    show_progress=False, queue=False)

                         with gr.Accordion("Advanced options", visible=False, open=False) as inpaint_mask_advanced_options:
                             inpaint_mask_sam_model = gr.Dropdown(label='SAM model', choices=flags.inpaint_mask_sam_model, value=modules.config.default_inpaint_mask_sam_model)
-                            inpaint_mask_sam_quant = gr.Checkbox(label='Quantization', value=False)
                             inpaint_mask_box_threshold = gr.Slider(label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.05)
                             inpaint_mask_text_threshold = gr.Slider(label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.05)
+                            inpaint_mask_sam_max_detections = gr.Slider(label="Maximum number of detections", info="Set to 0 to detect all", minimum=0, maximum=10, value=modules.config.default_sam_max_detections, step=1, interactive=True)
                         generate_mask_button = gr.Button(value='Generate mask from image')

-                        def generate_mask(image, mask_model, cloth_category, sam_prompt_text, sam_model, sam_quant, box_threshold, text_threshold):
+                        def generate_mask(image, mask_model, cloth_category, dino_prompt_text, sam_model, box_threshold, text_threshold, sam_max_detections, dino_erode_or_dilate, dino_debug):
                             from extras.inpaint_mask import generate_mask_from_image

                             extras = {}
+                            sam_options = None
                             if mask_model == 'u2net_cloth_seg':
                                 extras['cloth_category'] = cloth_category
                             elif mask_model == 'sam':
-                                extras['sam_prompt_text'] = sam_prompt_text
-                                extras['sam_model'] = sam_model
-                                extras['sam_quant'] = sam_quant
-                                extras['box_threshold'] = box_threshold
-                                extras['text_threshold'] = text_threshold
+                                sam_options = SAMOptions(
+                                    dino_prompt=dino_prompt_text,
+                                    dino_box_threshold=box_threshold,
+                                    dino_text_threshold=text_threshold,
+                                    dino_erode_or_dilate=dino_erode_or_dilate,
+                                    dino_debug=dino_debug,
+                                    max_detections=sam_max_detections,
+                                    model_type=sam_model
+                                )

-                            return generate_mask_from_image(image, mask_model, extras)
+                            mask, _, _, _ = generate_mask_from_image(image, mask_model, extras, sam_options)

-                        generate_mask_button.click(fn=generate_mask,
-                                                   inputs=[
-                                                       inpaint_input_image, inpaint_mask_model,
-                                                       inpaint_mask_cloth_category,
-                                                       inpaint_mask_sam_prompt_text,
-                                                       inpaint_mask_sam_model,
-                                                       inpaint_mask_sam_quant,
-                                                       inpaint_mask_box_threshold,
-                                                       inpaint_mask_text_threshold
-                                                   ],
-                                                   outputs=inpaint_mask_image, show_progress=True, queue=True)
+                            return mask

-                        inpaint_mask_model.change(lambda x: [gr.update(visible=x == 'u2net_cloth_seg'), gr.update(visible=x == 'sam'), gr.update(visible=x == 'sam')],
+                        inpaint_mask_model.change(lambda x: [gr.update(visible=x == 'u2net_cloth_seg')] +
+                                                            [gr.update(visible=x == 'sam')] * 2 +
+                                                            [gr.Dataset.update(visible=x == 'sam',
+                                                                               samples=modules.config.example_enhance_detection_prompts)],
                                                   inputs=inpaint_mask_model,
-                                                  outputs=[inpaint_mask_cloth_category, inpaint_mask_sam_prompt_text, inpaint_mask_advanced_options],
+                                                  outputs=[inpaint_mask_cloth_category,
+                                                           inpaint_mask_dino_prompt_text,
+                                                           inpaint_mask_advanced_options,
+                                                           example_inpaint_mask_dino_prompt_text],
                                                   queue=False, show_progress=False)

                 with gr.TabItem(label='Describe') as desc_tab:
@@ -283,6 +325,12 @@ with shared.gradio_root:
                     desc_input_image.upload(trigger_show_image_properties, inputs=desc_input_image,
                                             outputs=desc_image_size, show_progress=False, queue=False)

+                with gr.TabItem(label='Enhance') as enhance_tab:
+                    with gr.Row():
+                        with gr.Column():
+                            enhance_input_image = grh.Image(label='Base image for enhance', source='upload', type='numpy')
+                    gr.HTML('<a href="https://github.com/mashb1t/Fooocus/discussions/42" target="_blank">\U0001F4D4 Document</a>')
+
                 with gr.TabItem(label='Metadata') as metadata_tab:
                     with gr.Column():
                         metadata_input_image = grh.Image(label='For images created by Fooocus', source='upload', type='filepath')
@@ -304,6 +352,153 @@ with shared.gradio_root:
                     metadata_input_image.upload(trigger_metadata_preview, inputs=metadata_input_image,
                                                 outputs=metadata_json, queue=False, show_progress=True)

+        with gr.Row(visible=modules.config.default_enhance_checkbox) as enhance_input_panel:
+            with gr.Tabs():
+                with gr.TabItem(label='Upscale or Variation'):
+                    with gr.Row():
+                        with gr.Column():
+                            enhance_uov_method = gr.Radio(label='Upscale or Variation:', choices=flags.uov_list,
+                                                          value=modules.config.default_enhance_uov_method)
+                            enhance_uov_processing_order = gr.Radio(label='Order of Processing',
+                                                                    info='Use before for enhancement of small details and after for large areas.',
+                                                                    choices=flags.enhancement_uov_processing_order,
+                                                                    value=modules.config.default_enhance_uov_processing_order)
+                            enhance_uov_prompt_type = gr.Radio(label='Prompt',
+                                                               info='Choose which prompt to use for Upscale or Variation.',
+                                                               choices=flags.enhancement_uov_prompt_types,
+                                                               value=modules.config.default_enhance_uov_prompt_type,
+                                                               visible=modules.config.default_enhance_uov_processing_order == flags.enhancement_uov_after)
+
+                            enhance_uov_processing_order.change(lambda x: gr.update(visible=x == flags.enhancement_uov_after),
+                                                                inputs=enhance_uov_processing_order,
+                                                                outputs=enhance_uov_prompt_type,
+                                                                queue=False, show_progress=False)
+                    gr.HTML('<a href="https://github.com/mashb1t/Fooocus/discussions/42" target="_blank">\U0001F4D4 Document</a>')
+                enhance_ctrls = []
+                for index in range(modules.config.default_enhance_tabs):
+                    with gr.TabItem(label=f'#{index + 1}') as enhance_tab_item:
+                        enhance_enabled = gr.Checkbox(label='Enable', value=False, elem_classes='min_check',
+                                                      container=False)
+
+                        enhance_mask_dino_prompt_text = gr.Textbox(label='Detection prompt',
+                                                                   info='Use singular whenever possible',
+                                                                   placeholder='Describe what you want to detect.',
+                                                                   interactive=True,
+                                                                   visible=modules.config.default_enhance_inpaint_mask_model == 'sam')
+                        example_enhance_mask_dino_prompt_text = gr.Dataset(
+                            samples=modules.config.example_enhance_detection_prompts,
+                            label='Detection Prompt Quick List',
+                            components=[enhance_mask_dino_prompt_text],
+                            visible=modules.config.default_enhance_inpaint_mask_model == 'sam')
+                        example_enhance_mask_dino_prompt_text.click(lambda x: x[0],
+                                                                    inputs=example_enhance_mask_dino_prompt_text,
+                                                                    outputs=enhance_mask_dino_prompt_text,
+                                                                    show_progress=False, queue=False)
+
+                        enhance_prompt = gr.Textbox(label="Enhancement positive prompt",
+                                                    placeholder="Uses original prompt instead if empty.",
+                                                    elem_id='enhance_prompt')
+                        enhance_negative_prompt = gr.Textbox(label="Enhancement negative prompt",
+                                                             placeholder="Uses original negative prompt instead if empty.",
+                                                             elem_id='enhance_negative_prompt')
+
+                        with gr.Accordion("Detection", open=False):
+                            # TODO check if limiting to SAM is better
+                            enhance_mask_model = gr.Dropdown(label='Mask generation model',
+                                                             choices=flags.inpaint_mask_models,
+                                                             value=modules.config.default_enhance_inpaint_mask_model)
+                            enhance_mask_cloth_category = gr.Dropdown(label='Cloth category',
+                                                                      choices=flags.inpaint_mask_cloth_category,
+                                                                      value=modules.config.default_inpaint_mask_cloth_category,
+                                                                      visible=modules.config.default_enhance_inpaint_mask_model == 'u2net_cloth_seg',
+                                                                      interactive=True)
+
+                            with gr.Accordion("SAM Options",
+                                              visible=modules.config.default_enhance_inpaint_mask_model == 'sam',
+                                              open=False) as sam_options:
+                                enhance_mask_sam_model = gr.Dropdown(label='SAM model',
+                                                                     choices=flags.inpaint_mask_sam_model,
+                                                                     value=modules.config.default_inpaint_mask_sam_model,
+                                                                     interactive=True)
+                                enhance_mask_box_threshold = gr.Slider(label="Box Threshold", minimum=0.0,
+                                                                       maximum=1.0, value=0.3, step=0.05,
+                                                                       interactive=True)
+                                enhance_mask_text_threshold = gr.Slider(label="Text Threshold", minimum=0.0,
+                                                                        maximum=1.0, value=0.25, step=0.05,
+                                                                        interactive=True)
+                                enhance_mask_sam_max_detections = gr.Slider(label="Maximum number of detections",
+                                                                            info="Set to 0 to detect all",
+                                                                            minimum=0, maximum=10,
+                                                                            value=modules.config.default_sam_max_detections,
+                                                                            step=1, interactive=True)
+
+                        with gr.Accordion("Inpaint", visible=True, open=False):
+                            enhance_inpaint_mode = gr.Dropdown(choices=modules.flags.inpaint_options,
+                                                               value=modules.flags.inpaint_option_default,
+                                                               label='Method', interactive=True)
+                            enhance_inpaint_disable_initial_latent = gr.Checkbox(
+                                label='Disable initial latent in inpaint', value=False)
+                            enhance_inpaint_engine = gr.Dropdown(label='Inpaint Engine',
+                                                                 value=modules.config.default_inpaint_engine_version,
+                                                                 choices=flags.inpaint_engine_versions,
+                                                                 info='Version of Fooocus inpaint model. If set, use performance Quality or Speed (no performance LoRAs) for best results.')
+                            enhance_inpaint_strength = gr.Slider(label='Inpaint Denoising Strength',
+                                                                 minimum=0.0, maximum=1.0, step=0.001,
+                                                                 value=1.0,
+                                                                 info='Same as the denoising strength in A1111 inpaint. '
+                                                                      'Only used in inpaint, not used in outpaint. '
+                                                                      '(Outpaint always use 1.0)')
+                            enhance_inpaint_respective_field = gr.Slider(label='Inpaint Respective Field',
+                                                                         minimum=0.0, maximum=1.0, step=0.001,
+                                                                         value=0.618,
+                                                                         info='The area to inpaint. '
+                                                                              'Value 0 is same as "Only Masked" in A1111. '
+                                                                              'Value 1 is same as "Whole Image" in A1111. '
+                                                                              'Only used in inpaint, not used in outpaint. '
+                                                                              '(Outpaint always use 1.0)')
+                            enhance_inpaint_erode_or_dilate = gr.Slider(label='Mask Erode or Dilate',
+                                                                        minimum=-64, maximum=64, step=1, value=0,
+                                                                        info='Positive value will make white area in the mask larger, '
+                                                                             'negative value will make white area smaller. '
+                                                                             '(default is 0, always processed before any mask invert)')
+                            enhance_mask_invert = gr.Checkbox(label='Invert Mask', value=False)
+
+                        gr.HTML('<a href="https://github.com/mashb1t/Fooocus/discussions/42" target="_blank">\U0001F4D4 Document</a>')
+
+                        enhance_ctrls += [
+                            enhance_enabled,
+                            enhance_mask_dino_prompt_text,
+                            enhance_prompt,
+                            enhance_negative_prompt,
+                            enhance_mask_model,
+                            enhance_mask_sam_model,
+                            enhance_mask_text_threshold,
+                            enhance_mask_box_threshold,
+                            enhance_mask_sam_max_detections,
+                            enhance_inpaint_disable_initial_latent,
+                            enhance_inpaint_engine,
+                            enhance_inpaint_strength,
+                            enhance_inpaint_respective_field,
+                            enhance_inpaint_erode_or_dilate,
+                            enhance_mask_invert
+                        ]
+
+                        enhance_inpaint_mode.input(inpaint_mode_change, inputs=enhance_inpaint_mode, outputs=[
+                            inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
+                            enhance_inpaint_disable_initial_latent, enhance_inpaint_engine,
+                            enhance_inpaint_strength, enhance_inpaint_respective_field
+                        ], show_progress=False, queue=False)
+
+                        enhance_mask_model.change(
+                            lambda x: [gr.update(visible=x == 'u2net_cloth_seg')] +
+                                      [gr.update(visible=x == 'sam')] * 2 +
+                                      [gr.Dataset.update(visible=x == 'sam',
+                                                         samples=modules.config.example_enhance_detection_prompts)],
+                            inputs=enhance_mask_model,
+                            outputs=[enhance_mask_cloth_category, enhance_mask_dino_prompt_text, sam_options,
+                                     example_enhance_mask_dino_prompt_text],
+                            queue=False, show_progress=False)
+
         switch_js = "(x) => {if(x){viewer_to_bottom(100);viewer_to_bottom(500);}else{viewer_to_top();} return x;}"
         down_js = "() => {viewer_to_bottom();}"

@@ -316,7 +511,10 @@ with shared.gradio_root:
             inpaint_tab.select(lambda: 'inpaint', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
             ip_tab.select(lambda: 'ip', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
             desc_tab.select(lambda: 'desc', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
+            enhance_tab.select(lambda: 'enhance', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
             metadata_tab.select(lambda: 'metadata', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
+            enhance_checkbox.change(lambda x: gr.update(visible=x), inputs=enhance_checkbox,
+                                    outputs=enhance_input_panel, queue=False, show_progress=False, _js=switch_js)

         with gr.Column(scale=1, visible=modules.config.default_advanced_checkbox) as advanced_column:
             with gr.Tab(label='Settings'):
@@ -379,7 +577,7 @@ with shared.gradio_root:
                 def update_history_link():
                     if args_manager.args.disable_image_log:
                         return gr.update(value='')

                     return gr.update(value=f'<a href="file={get_current_html_path(output_format)}" target="_blank">\U0001F4DA History Log</a>')

                 history_link = gr.HTML()
@@ -544,7 +742,7 @@ with shared.gradio_root:
                     info='Image Prompt parameters are not included. Use png and a1111 for compatibility with Civitai.',
                     visible=modules.config.default_save_metadata_to_images)

-                save_metadata_to_images.change(lambda x: gr.update(visible=x), inputs=[save_metadata_to_images], outputs=[metadata_scheme],
+                save_metadata_to_images.change(lambda x: gr.update(visible=x), inputs=[save_metadata_to_images], outputs=[metadata_scheme],
                                                queue=False, show_progress=False)

                 with gr.Tab(label='Control'):
@@ -570,11 +768,15 @@ with shared.gradio_root:

                 with gr.Tab(label='Inpaint'):
                     debugging_inpaint_preprocessor = gr.Checkbox(label='Debug Inpaint Preprocessing', value=False)
+                    debugging_enhance_masks_checkbox = gr.Checkbox(label='Debug Enhance Masks', value=False,
+                                                                   info='Show enhance masks in preview and final results')
+                    debugging_dino = gr.Checkbox(label='Debug GroundingDINO', value=False,
+                                                 info='Use GroundingDINO boxes instead of more detailed SAM masks')
                     inpaint_disable_initial_latent = gr.Checkbox(label='Disable initial latent in inpaint', value=False)
                     inpaint_engine = gr.Dropdown(label='Inpaint Engine',
                                                  value=modules.config.default_inpaint_engine_version,
                                                  choices=flags.inpaint_engine_versions,
-                                                 info='Version of Fooocus inpaint model')
+                                                 info='Version of Fooocus inpaint model. If set, use performance Quality or Speed (no performance LoRAs) for best results.')
                     inpaint_strength = gr.Slider(label='Inpaint Denoising Strength',
                                                  minimum=0.0, maximum=1.0, step=0.001, value=1.0,
                                                  info='Same as the denoising strength in A1111 inpaint. '
@@ -590,8 +792,13 @@ with shared.gradio_root:
                     inpaint_erode_or_dilate = gr.Slider(label='Mask Erode or Dilate',
                                                         minimum=-64, maximum=64, step=1, value=0,
                                                         info='Positive value will make white area in the mask larger, '
-                                                             'negative value will make white area smaller.'
-                                                             '(default is 0, always process before any mask invert)')
+                                                             'negative value will make white area smaller. '
+                                                             '(default is 0, always processed before any mask invert)')
+                    dino_erode_or_dilate = gr.Slider(label='GroundingDINO Box Erode or Dilate',
+                                                     minimum=-64, maximum=64, step=1, value=0,
+                                                     info='Positive value will make white area in the mask larger, '
+                                                          'negative value will make white area smaller. '
+                                                          '(default is 0, processed before SAM)')
                     inpaint_mask_upload_checkbox = gr.Checkbox(label='Enable Mask Upload', value=False)
                     invert_mask_checkbox = gr.Checkbox(label='Invert Mask', value=False)

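Note: both sliders share the same semantics: positive values grow the white mask area, negative values shrink it. A toy illustration with OpenCV morphology (whether the backend uses cv2 for this step is not shown in this hunk):

import cv2
import numpy as np

# sketch: what positive (dilate) vs. negative (erode) values do to a binary mask
mask = np.zeros((7, 7), dtype=np.uint8)
mask[3, 3] = 255                                   # single white pixel
kernel = np.ones((3, 3), dtype=np.uint8)
dilated = cv2.dilate(mask, kernel, iterations=1)   # positive value: white area grows
eroded = cv2.erode(dilated, kernel, iterations=1)  # negative value: white area shrinks
print(int(dilated.sum() / 255), int(eroded.sum() / 255))  # 9 1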
@@ -701,46 +908,26 @@ with shared.gradio_root:
                 adm_scaler_negative, refiner_switch, refiner_model, sampler_name,
                 scheduler_name, adaptive_cfg, refiner_swap_method, negative_prompt, disable_intermediate_results
             ], queue=False, show_progress=False)


         output_format.input(lambda x: gr.update(output_format=x), inputs=output_format)


         advanced_checkbox.change(lambda x: gr.update(visible=x), advanced_checkbox, advanced_column,
                                  queue=False, show_progress=False) \
             .then(fn=lambda: None, _js='refresh_grid_delayed', queue=False, show_progress=False)

-        def inpaint_mode_change(mode):
-            assert mode in modules.flags.inpaint_options
-
-            # inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
-            # inpaint_disable_initial_latent, inpaint_engine,
-            # inpaint_strength, inpaint_respective_field
-
-            if mode == modules.flags.inpaint_option_detail:
-                return [
-                    gr.update(visible=True), gr.update(visible=False, value=[]),
-                    gr.Dataset.update(visible=True, samples=modules.config.example_inpaint_prompts),
-                    False, 'None', 0.5, 0.0
-                ]
-
-            if mode == modules.flags.inpaint_option_modify:
-                return [
-                    gr.update(visible=True), gr.update(visible=False, value=[]),
-                    gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
-                    True, modules.config.default_inpaint_engine_version, 1.0, 0.0
-                ]
-
-            return [
-                gr.update(visible=False, value=''), gr.update(visible=True),
-                gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
-                False, modules.config.default_inpaint_engine_version, 1.0, 0.618
-            ]
-
         inpaint_mode.input(inpaint_mode_change, inputs=inpaint_mode, outputs=[
             inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
             inpaint_disable_initial_latent, inpaint_engine,
             inpaint_strength, inpaint_respective_field
         ], show_progress=False, queue=False)

+        generate_mask_button.click(fn=generate_mask,
+                                   inputs=[inpaint_input_image, inpaint_mask_model, inpaint_mask_cloth_category,
+                                           inpaint_mask_dino_prompt_text, inpaint_mask_sam_model,
+                                           inpaint_mask_box_threshold, inpaint_mask_text_threshold,
+                                           inpaint_mask_sam_max_detections, dino_erode_or_dilate, debugging_dino],
+                                   outputs=inpaint_mask_image, show_progress=True, queue=True)
+
         ctrls = [currentTask, generate_image_grid]
         ctrls += [
             prompt, negative_prompt, translate_prompts, style_selections,
@@ -766,6 +953,10 @@ with shared.gradio_root:
     ctrls += [save_metadata_to_images, metadata_scheme]

     ctrls += ip_ctrls
+    ctrls += [debugging_dino, dino_erode_or_dilate, debugging_enhance_masks_checkbox,
+              enhance_input_image, enhance_checkbox, enhance_uov_method, enhance_uov_processing_order,
+              enhance_uov_prompt_type]
+    ctrls += enhance_ctrls

     def parse_meta(raw_prompt_txt, is_generating):
         loaded_json = None
@@ -828,15 +1019,18 @@ with shared.gradio_root:
     desc_btn.click(trigger_describe, inputs=[desc_method, desc_input_image],
                    outputs=[prompt, style_selections], show_progress=True, queue=True)

-    if args_manager.args.enable_describe_uov_image:
-        def trigger_uov_describe(mode, img, prompt):
+    if args_manager.args.enable_auto_describe_image:
+        def trigger_auto_describe(mode, img, prompt):
             # keep prompt if not empty
             if prompt == '':
                 return trigger_describe(mode, img)
             return gr.update(), gr.update()

-        uov_input_image.upload(trigger_uov_describe, inputs=[desc_method, uov_input_image, prompt],
-                               outputs=[prompt, style_selections], show_progress=True, queue=True)
+        uov_input_image.upload(trigger_auto_describe, inputs=[desc_method, uov_input_image, prompt],
+                               outputs=[prompt, style_selections], show_progress=True, queue=True)
+
+        enhance_input_image.upload(lambda: gr.update(value=True), outputs=enhance_checkbox, queue=False, show_progress=False) \
+            .then(trigger_auto_describe, inputs=[desc_method, enhance_input_image, prompt], outputs=[prompt, style_selections], show_progress=True, queue=True)

 def dump_default_english_config():
     from modules.localization import dump_english_config