diff --git a/fooocus_version.py b/fooocus_version.py
index 26b2cf04..0404a937 100644
--- a/fooocus_version.py
+++ b/fooocus_version.py
@@ -1 +1 @@
-version = '2.1.846'
+version = '2.1.847'
diff --git a/modules/patch.py b/modules/patch.py
index 0ae53585..6a7111a6 100644
--- a/modules/patch.py
+++ b/modules/patch.py
@@ -271,8 +271,8 @@ def sdxl_encode_adm_patched(self, **kwargs):
         height = float(height) * positive_adm_scale
 
     def embedder(number_list):
-        h = torch.tensor(number_list, dtype=torch.float32)
-        h = self.embedder(h)
+        h = [self.embedder(torch.tensor([x], dtype=torch.float32)) for x in number_list]
+        h = torch.cat(h)
         h = torch.flatten(h).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
         return h
 
diff --git a/modules/patch_clip.py b/modules/patch_clip.py
index 157f051b..8aa7468f 100644
--- a/modules/patch_clip.py
+++ b/modules/patch_clip.py
@@ -23,28 +23,6 @@ import contextlib
 from transformers import CLIPTextModel, CLIPTextConfig, modeling_utils, CLIPVisionConfig, CLIPVisionModelWithProjection
 
 
-@contextlib.contextmanager
-def use_disable_weight_init_linear_ops(device=None, dtype=None):
-    old_torch_nn_linear = torch.nn.Linear
-    force_device = device
-    force_dtype = dtype
-
-    def linear_with_dtype(in_features: int, out_features: int, bias: bool = True, device=None, dtype=None):
-        if force_device is not None:
-            device = force_device
-        if force_dtype is not None:
-            dtype = force_dtype
-        return ldm_patched.modules.ops.disable_weight_init.Linear(in_features, out_features, bias=bias, device=device,
-                                                                  dtype=dtype)
-
-    torch.nn.Linear = linear_with_dtype
-    try:
-        yield
-    finally:
-        torch.nn.Linear = old_torch_nn_linear
-    return
-
-
 def encode_token_weights_fooocus(self, token_weight_pairs):
     to_encode = list()
     max_token_len = 0
@@ -93,34 +71,40 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
         "hidden"
     ]
 
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77,
-                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=ldm_patched.modules.clip_model.CLIPTextModel,
-                 special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True):  # clip-vit-base-patch32
+    def __init__(self,
+                 device="cpu",
+                 max_length=77,
+                 freeze=True,
+                 layer="last",
+                 layer_idx=None,
+                 textmodel_json_config=None,
+                 dtype=None,
+                 special_tokens=None,
+                 layer_norm_hidden_state=True,
+                 **kwargs):
         super().__init__()
         assert layer in self.LAYERS
 
+        if special_tokens is None:
+            special_tokens = {"start": 49406, "end": 49407, "pad": 49407}
+
         if textmodel_json_config is None:
             textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(ldm_patched.modules.sd1_clip.__file__)), "sd1_clip_config.json")
 
         config = CLIPTextConfig.from_json_file(textmodel_json_config)
-        self.num_layers = config.num_hidden_layers
 
-        with use_disable_weight_init_linear_ops(device, dtype):
-            with modeling_utils.no_init_weights():
-                self.transformer = CLIPTextModel(config)
-                self.inner_name = "text_model"
+        with modeling_utils.no_init_weights():
+            self.transformer = CLIPTextModel(config)
+
         if dtype is not None:
             self.transformer.to(dtype)
 
-        inner_model = getattr(self.transformer, self.inner_name)
-        if hasattr(inner_model, "embeddings"):
-            inner_model.embeddings.to(torch.float32)
-        else:
-            self.transformer.set_input_embeddings(self.transformer.get_input_embeddings().to(torch.float32))
+        self.transformer.text_model.embeddings.to(torch.float32)
 
-        self.max_length = max_length
         if freeze:
             self.freeze()
+
+        self.max_length = max_length
         self.layer = layer
         self.layer_idx = None
         self.special_tokens = special_tokens
@@ -131,7 +115,6 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
         self.layer_norm_hidden_state = layer_norm_hidden_state
         if layer == "hidden":
             assert layer_idx is not None
-            assert abs(layer_idx) < self.num_layers
             self.clip_layer(layer_idx)
         self.layer_default = (self.layer, self.layer_idx)
 
@@ -142,11 +125,8 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
             param.requires_grad = False
 
     def clip_layer(self, layer_idx):
-        if abs(layer_idx) > self.num_layers:
-            self.layer = "last"
-        else:
-            self.layer = "hidden"
-            self.layer_idx = layer_idx
+        self.layer = "hidden"
+        self.layer_idx = layer_idx
 
     def reset_clip_layer(self):
         self.layer = self.layer_default[0]
@@ -200,7 +180,7 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
         tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
         tokens = torch.LongTensor(tokens).to(device)
 
-        if getattr(self.transformer, self.inner_name).final_layer_norm.weight.dtype != torch.float32:
+        if self.transformer.text_model.final_layer_norm.weight.dtype != torch.float32:
             precision_scope = torch.autocast
         else:
             precision_scope = lambda a, dtype: contextlib.nullcontext(a)
@@ -227,7 +207,7 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
             else:
                 z = outputs.hidden_states[self.layer_idx]
                 if self.layer_norm_hidden_state:
-                    z = getattr(self.transformer, self.inner_name).final_layer_norm(z)
+                    z = self.transformer.text_model.final_layer_norm(z)
 
             if hasattr(outputs, "pooler_output"):
                 pooled_output = outputs.pooler_output.float()
@@ -252,25 +232,28 @@ class SDClipModelFooocus(torch.nn.Module, ldm_patched.modules.sd1_clip.ClipToken
 class ClipVisionModelFooocus:
     def __init__(self, json_config):
         config = CLIPVisionConfig.from_json_file(json_config)
+
         self.load_device = ldm_patched.modules.model_management.text_encoder_device()
-        offload_device = ldm_patched.modules.model_management.text_encoder_offload_device()
-        self.dtype = torch.float32
+        self.offload_device = ldm_patched.modules.model_management.text_encoder_offload_device()
+
         if ldm_patched.modules.model_management.should_use_fp16(self.load_device, prioritize_performance=False):
             self.dtype = torch.float16
+        else:
+            self.dtype = torch.float32
+
+        with modeling_utils.no_init_weights():
+            self.model = CLIPVisionModelWithProjection(config)
 
-        with use_disable_weight_init_linear_ops(offload_device, self.dtype):
-            with modeling_utils.no_init_weights():
-                self.model = CLIPVisionModelWithProjection(config)
         self.model.to(self.dtype)
-
-        self.patcher = ldm_patched.modules.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.patcher = ldm_patched.modules.model_patcher.ModelPatcher(
+            self.model,
+            load_device=self.load_device,
+            offload_device=self.offload_device
+        )
 
     def load_sd(self, sd):
         return self.model.load_state_dict(sd, strict=False)
 
-    def encode_image(self, image):
-        raise NotImplementedError('wrong clip vision call!')
-
 
 def patch_all_clip():
     ldm_patched.modules.sd1_clip.ClipTokenWeightEncoder.encode_token_weights = encode_token_weights_fooocus
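A note on the embedder change in modules/patch.py: SDXL's ADM embedder maps each scalar in number_list to a fixed-size sinusoidal vector, so embedding the values one at a time and concatenating the results is meant to produce the same flattened conditioning vector as embedding the whole list in one batch. A minimal sanity-check sketch of that equivalence; the sinusoidal_embedding helper below is a toy transformer-style stand-in, not the actual ldm_patched Timestep module:

# Sanity-check sketch: per-element embedding + concat vs. batched embedding.
# `sinusoidal_embedding` is an illustrative stand-in for the real embedder.
import math
import torch


def sinusoidal_embedding(t: torch.Tensor, dim: int = 256) -> torch.Tensor:
    # t: [N] -> [N, dim], standard transformer-style sinusoidal embedding
    half = dim // 2
    freqs = torch.exp(-math.log(10000.0) * torch.arange(half, dtype=torch.float32) / half)
    args = t[:, None].float() * freqs[None, :]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)


number_list = [1024.0, 1024.0, 0.0, 0.0, 1024.0, 1024.0]

# Old path: embed the whole list as one batch.
batched = sinusoidal_embedding(torch.tensor(number_list, dtype=torch.float32))

# New path: embed each scalar separately, then concatenate along dim 0.
per_element = torch.cat(
    [sinusoidal_embedding(torch.tensor([x], dtype=torch.float32)) for x in number_list]
)

# Both flatten to the same ADM conditioning vector.
assert torch.allclose(batched.flatten(), per_element.flatten())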
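The rewritten SDClipModelFooocus.__init__ also replaces the mutable default special_tokens={"start": 49406, "end": 49407, "pad": 49407} with special_tokens=None plus an in-body default. Python evaluates default argument values once, at function definition time, so a mutable default is shared by every call; the None-guard pattern builds a fresh dict per call. A generic illustration of the pitfall (not Fooocus code):

# The shared-mutable-default pitfall that `special_tokens=None` avoids.
def risky(tokens={"start": 49406}):
    tokens["seen"] = tokens.get("seen", 0) + 1  # mutates the one shared dict
    return tokens

print(risky())  # {'start': 49406, 'seen': 1}
print(risky())  # {'start': 49406, 'seen': 2}  <- state leaked between calls


def safe(tokens=None):
    if tokens is None:  # fresh dict per call, as in the patched __init__
        tokens = {"start": 49406, "end": 49407, "pad": 49407}
    tokens["seen"] = tokens.get("seen", 0) + 1
    return tokens

print(safe())  # 'seen' is 1
print(safe())  # 'seen' is 1 again: no state shared between calls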
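Finally, the deleted use_disable_weight_init_linear_ops helper monkey-patched torch.nn.Linear globally for the duration of model construction in order to force a device and dtype. The new code relies only on transformers' own no_init_weights() context manager to skip random weight initialization, then casts explicitly afterward, as ClipVisionModelFooocus.__init__ now does. A standalone sketch of that construction pattern, assuming transformers is installed and using the library's default vision config in place of Fooocus's json config:

# Sketch of the pattern the patch switches to: skip weight init during
# instantiation, then cast explicitly. `CLIPVisionConfig()` is a stand-in
# assumption for `CLIPVisionConfig.from_json_file(...)` used in the diff.
import torch
from transformers import CLIPVisionConfig, CLIPVisionModelWithProjection, modeling_utils

config = CLIPVisionConfig()

with modeling_utils.no_init_weights():
    model = CLIPVisionModelWithProjection(config)  # weights allocated, not initialized

model.to(torch.float16)  # explicit cast replaces the forced-dtype Linear monkey-patch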