
VirtualTryon

CLIPVisionModel Projection: notes on porting the PBE image encoder to SDXL

 

 

*1 : on the difference between batch norm and layer norm, see https://yonghyuc.wordpress.com/2020/03/04/batch-norm-vs-layer-norm/ (a small sketch of the axis difference is shown right below)
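
A minimal PyTorch sketch of that difference (the tensor shapes here are made up purely for illustration): BatchNorm1d normalizes each feature over the batch dimension, while LayerNorm normalizes over the feature dimension within each sample, which is why LayerNorm also works with a batch size of 1.

import torch
from torch import nn

x = torch.randn(8, 1024)           # (batch, features) -- dummy activations

batch_norm = nn.BatchNorm1d(1024)  # statistics taken over the batch dim, per feature
layer_norm = nn.LayerNorm(1024)    # statistics taken over the feature dim, per sample

bn_out = batch_norm(x)             # each feature column now has ~zero mean / unit variance
ln_out = layer_norm(x)             # each sample row now has ~zero mean / unit variance

print(bn_out.mean(dim=0)[:3])      # per-feature means, all close to 0
print(ln_out.mean(dim=1)[:3])      # per-sample means, all close to 0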

*2 : CLIPVisionModel output : the output of CLIPVisionModel is the same as calling CLIPVisionTransformer directly,
     because CLIPVisionModel's forward simply returns self.vision_model(..),
     and vision_model is a CLIPVisionTransformer.

     (Opinion) the pooler_output looks like it is just the classification (CLS) token, so a lot of information is dropped.

>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, CLIPVisionModel
>>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
>>> outputs
BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.5297, -0.7713,  0.4655,  ..., -0.3993, -0.0721, -0.3703],
         [ 0.8688,  0.1690,  0.6678,  ...,  0.5126, -1.1465, -0.1258],
         [ 1.1742, -0.7551,  0.0396,  ...,  0.7166, -0.5458,  0.0031],
         ...,
         [ 0.8636,  0.2223,  0.6411,  ...,  0.5242, -0.8104,  0.0170],
         [ 0.6842, -1.1056, -0.2486,  ...,  0.7901,  0.4862, -0.0949],
         [ 0.8934,  0.0066,  0.9236,  ...,  0.5707, -0.8436, -0.2182]]],
       grad_fn=<AddBackward0>), pooler_output=tensor([[-0.9326, -1.3289,  0.7919,  ..., -0.3337, -0.0479, -0.7106]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)
>>> last_hidden_state.shape
torch.Size([1, 257, 1024])
>>> pooled_output.shape
torch.Size([1, 1024])

 

The output of CLIPVisionTransformer is torch.Size([1, 257, 1024]): with ViT-L/14 and a 224 x 224 input, that is (224 / 14)^2 = 256 patch tokens plus 1 CLS token, each with hidden size 1024.
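
As a quick sanity check on that token count, a small sketch that reuses the model loaded in the example above (nothing else is assumed):

config = model.config                                         # CLIPVisionConfig of ViT-L/14: image_size=224, patch_size=14, hidden_size=1024
num_patches = (config.image_size // config.patch_size) ** 2   # (224 // 14) ** 2 = 256 patch tokens
print(num_patches + 1, config.hidden_size)                    # 257 1024 -> matches last_hidden_state [1, 257, 1024]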

CLIPVisionTransformer returns a BaseModelOutputWithPooling.

BaseModelOutputWithPooling in turn carries both last_hidden_state and pooler_output
(reference: https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/modeling_outputs.py#L70)

 

So, in the example below, outputs (the return value of CLIPVisionTransformer) holds both last_hidden_state (1 x 257 x 1024) and pooler_output (1 x 1024).

from PIL import Image
import requests
from transformers import AutoProcessor, CLIPVisionModel
model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
breakpoint()  # drop into pdb here to step into the model call
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output  # pooled CLS states

 

class CLIPVisionTransformer(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0] ## -> torch.Size([1, 257, 1024])
                                               ## -> [0] just unpacks the first element of the encoder output
        pooled_output = last_hidden_state[:, 0, :] ## -> torch.Size([1, 1024])
                                                   ## -> keeps only the very first (CLS) token; this is the problematic part
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
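
To confirm that relationship, here is a small self-contained sketch (it assumes the transformers version shown above, v4.33.2, where last_hidden_state is returned before post_layernorm): pooler_output should be exactly post_layernorm applied to the CLS token of last_hidden_state.

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPVisionModel

model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))

cls_token = outputs.last_hidden_state[:, 0, :]                       # [1, 1024], the first token
recomputed = model.vision_model.post_layernorm(cls_token)            # same post_layernorm as in the source above
print(torch.allclose(recomputed, outputs.pooler_output, atol=1e-5))  # expected: True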

 

 

 

Now that we have looked at CLIPVisionModel, let's look at Paint by Example's image encoder.

 

class PaintByExampleImageEncoder(CLIPPreTrainedModel):
    def __init__(self, config, proj_size=768):
        super().__init__(config)
        self.proj_size = proj_size

        self.model = CLIPVisionModel(config)    
        self.mapper = PaintByExampleMapper(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size)
        self.proj_out = nn.Linear(config.hidden_size, self.proj_size)

        # uncondition for scaling
        self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size)))

    def forward(self, pixel_values, return_uncond_vector=False):
        clip_output = self.model(pixel_values=pixel_values)
        latent_states = clip_output.pooler_output   ### Looking at this closely, clip_output is the output of the
                                                    ### CLIPVisionModel discussed above (made up of last_hidden_state
                                                    ### and pooler_output), so taking pooler_output keeps only a
                                                    ### single token: latent_states.shape = torch.Size([1, 1024])
        latent_states = self.mapper(latent_states[:, None])  ### latent_states.shape = torch.Size([1, 1, 1024])
        # latent values: tensor([[[-3.0098,  3.7832,  6.6094,  ...,  0.1696,  1.1875,  0.0864]]],
        latent_states = self.final_layer_norm(latent_states)
        # latent values (after norm): tensor([[[-0.6587,  0.7593,  1.3506,  ...,  0.0034,  0.2194, -0.0087]]],
        latent_states = self.proj_out(latent_states)  ## 1x1x1024 -> 1x1x768 projection layer
        # latent_states.shape : torch.Size([1, 1, 768])
        if return_uncond_vector:
            return latent_states, self.uncond_vector

        return latent_states

Looking at clip_output above: since it is the output of CLIPVisionModel, it returns last_hidden_state and pooler_output, as we confirmed earlier, and only pooler_output is used. This is the part that has to change.
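
To see that collapse concretely, a small sketch of the original encoder's output shape (it assumes the public Fantasy-Studio/Paint-by-Example checkpoint, which stores this encoder under the image_encoder subfolder; the input is just a dummy CLIP-sized tensor):

import torch
from diffusers.pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder

encoder = PaintByExampleImageEncoder.from_pretrained(
    "Fantasy-Studio/Paint-by-Example", subfolder="image_encoder"
)

pixel_values = torch.randn(1, 3, 224, 224)   # dummy CLIP-preprocessed image
with torch.no_grad():
    latent_states = encoder(pixel_values)
print(latent_states.shape)                   # torch.Size([1, 1, 768]) -- the whole image collapsed into one conditioning token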

 

 

Modifying the Paint by Example image encoder so that it skips the pooling step gives the following.

import torch
from torch import nn
from transformers import CLIPPreTrainedModel, CLIPVisionModel
from diffusers.pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder, PaintByExampleMapper

class PaintByExampleImageEncoderWithoutPooling(PaintByExampleImageEncoder):
    def __init__(self, config, proj_size=None):
        super().__init__(config)
        self.proj_size = proj_size or getattr(config, "projection_dim", 768)

        self.model = CLIPVisionModel(config)
        self.mapper = PaintByExampleMapper(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size)
        self.proj_out = nn.Linear(config.hidden_size, self.proj_size)

        # uncondition for scaling
        self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size)))

    def forward(self, pixel_values, return_uncond_vector=False):
        clip_output = self.model(pixel_values=pixel_values)
        latent_states = clip_output.last_hidden_state  # use unpooled output instead of pooled output
        latent_states = self.model.vision_model.post_layernorm(latent_states)  # the original applied post_layernorm only to the pooled CLS token; apply it to every token here

        latent_states = self.mapper(latent_states)  # last_hidden_state is already [1, 257, 1024], so the [:, None] expansion used for the pooled output is not needed
        latent_states = self.final_layer_norm(latent_states)
        latent_states = self.proj_out(latent_states)
        if return_uncond_vector:
            return latent_states, self.uncond_vector

        return latent_states
   
    @classmethod
    def from_pbe_encoder(cls, original):
        config = original.config
        new_encoder = cls(config)
        new_encoder.proj_size = original.proj_size
        new_encoder.model = original.model
        new_encoder.mapper = original.mapper
        new_encoder.final_layer_norm = original.final_layer_norm
        new_encoder.proj_out = original.proj_out
        new_encoder.uncond_vector = original.uncond_vector
        return new_encoder
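
Finally, a minimal usage sketch of the conversion (same assumption as above about the Fantasy-Studio/Paint-by-Example checkpoint layout): the converted encoder now returns a sequence of 257 conditioning tokens instead of a single pooled token.

import torch
from diffusers.pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder

original = PaintByExampleImageEncoder.from_pretrained(
    "Fantasy-Studio/Paint-by-Example", subfolder="image_encoder"
)
encoder = PaintByExampleImageEncoderWithoutPooling.from_pbe_encoder(original)

pixel_values = torch.randn(1, 3, 224, 224)   # dummy CLIP-preprocessed image
with torch.no_grad():
    tokens = encoder(pixel_values)
print(tokens.shape)                          # torch.Size([1, 257, 768]) -- one embedding per patch plus the CLS token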