sdv1.5 v1-inference.yaml
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
modules/initialize.py
Thread(target=load_model).start() load_model->shared.sd_model
modules/shared_items.py
Shared()-> sd_model()->modules.sd_models.model_data.get_sd_model()
sd_models.py
SdModelData: get_sd_model()->load_model() model_data = SdModelData()
sd_models.py load_model()
load_model(checkpoint_info, already_loaded_state_dict) ->
  state_dict = get_checkpoint_state_dict(checkpoint_info, ..)
    - torch.load()
  checkpoint_config = sd_models_config.find_checkpoint_config(state_dict, checkpoint_info)
  # state_dict 权重已经加载上来了,类似下面这种
  'model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight': tensor([0.8882, 0.9307, 0.8149, 0.8799, 0.8374, 0.8779, 0.8208, 0.7705, 0.7871, 0.6953, 0.8354, 0.8594, 0.7881, 0.8018, 0.8442, 0.7744, 0.7969, 0.7715, ...
sd_models_config.py
find_checkpoint_config(state_dict,info) guess_model_config_from_state_dict(state_dict,info.filename) - config_default # 根据权重的关键key从开头的config中选出来符合要求的yaml
sd_models.py load_model()
sd_config = OmegaConf.load(checkpoint_config) Creating model from config: /root/autodl-tmp/stable-diffusion-webui/configs/v1-inference.yaml sd_model = instantiate_from_config(sd_config.model)
简单分析下ldm下的代码:
models是串起全流程的代码,比如DDPM,modules下的是具体的模块代码
repositories/stable-diffusion-stability-ai/ldm/util.py
get_obj_from_str(config["target"])(**config.get("params", dict())) module:ldm.models.diffusion.ddpm,cls:LatentDiffusion importlib.import_module(module, package=None)-> <module 'ldm.models.diffusion.ddpm' from '/root/autodl-tmp/stable-diffusion-webui/repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py'>
sd_models.py load_model()
sd_model = instantiate_from_config(sd_config.model) # sd_model = LatentDiffusion
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py LatentDiffusion()
self.instantiate_first_stage() - model = instantiate_from_config(config) - self.first_stage_model = model.eval() self.instantiate_cond_stage() - model = instantiate_from_config(config) - self.cond_stage_model = model.eval() self.model = DiffusionWrapper(unet_config,..) - self.diffusion_model = instantiate_from_config(diff_model_config)
sd_model.first_stage_model:
AutoencoderKL( (encoder): Encoder( (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (down): ModuleList( (0): Module( (block): ModuleList( (0-1): 2 x ResnetBlock( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() (downsample): Downsample( (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2)) ) ) (1): Module( (block): ModuleList( (0): ResnetBlock( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() (downsample): Downsample( (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) ) ) (2): Module( (block): ModuleList( (0): ResnetBlock( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), 
stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() (downsample): Downsample( (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2)) ) ) (3): Module( (block): ModuleList( (0-1): 2 x ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() ) ) (mid): Module( (block_1): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (attn_1): AttnBlock( (norm): GroupNorm(32, 512, eps=1e-06, affine=True) (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) ) (block_2): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True) (conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (decoder): Decoder( (conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (mid): Module( (block_1): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, 
affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (attn_1): AttnBlock( (norm): GroupNorm(32, 512, eps=1e-06, affine=True) (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) ) (block_2): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (up): ModuleList( (0): Module( (block): ModuleList( (0): ResnetBlock( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)) ) (1-2): 2 x ResnetBlock( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() ) (1): Module( (block): ModuleList( (0): ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(256, 256, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1)) ) (1-2): 2 x ResnetBlock( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() (upsample): Upsample( (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (2-3): 2 x Module( (block): ModuleList( (0-2): 3 x ResnetBlock( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (attn): ModuleList() (upsample): Upsample( (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True) (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (loss): Identity() (quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1)) (post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1)) )
sd_model.cond_stage_model:
FrozenCLIPEmbedder( (transformer): CLIPTextModel( (text_model): CLIPTextTransformer( (embeddings): CLIPTextEmbeddings( (token_embedding): Embedding(49408, 768) (position_embedding): Embedding(77, 768) ) (encoder): CLIPEncoder( (layers): ModuleList( (0-11): 12 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) )
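sd_model.model -> DiffusionWrapper (diffusion_model = UNetModel). NOTE: the dump below is a duplicate of the sd_model.cond_stage_model (FrozenCLIPEmbedder) output pasted by mistake; re-run print(sd_model.model) to capture the actual DiffusionWrapper/UNetModel structure.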
FrozenCLIPEmbedder( (transformer): CLIPTextModel( (text_model): CLIPTextTransformer( (embeddings): CLIPTextEmbeddings( (token_embedding): Embedding(49408, 768) (position_embedding): Embedding(77, 768) ) (encoder): CLIPEncoder( (layers): ModuleList( (0-11): 12 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) )
sd_models.py load_model_weights
load_model_weights(sd_model,checkpoint_info,state_dict,...)-> model.is_sdxl model.is_sd1 model.is_sd2 model.load_state_dict(state_dict,strict=False) vae = model.first_stage_model model.first_stage_model = None model.half() model.first_stage_model = vae sd_vae.load_vae(model,vae_file,vae_source)
sd_models.py load_model
send_model_to_device(sd_model) sd_hijack.model_hijack.hijack(sd_model)
modules/sd_hijack.py
StableDiffusionModelHijack->hijack(self, m)->   # m = sd_model
  type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenCLIPEmbedder:
    model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
    model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)  # 49408,768
    m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
  apply_weighted_forward(m)
  self.apply_optimizations()
  self.clip = m.cond_stage_model
  self.layers = flatten(m)
  ldm.modules.diffusionmodules.openaimodel.copy_of_UNetModel_forward_for_webui = ldm.modules.diffusionmodules.openaimodel.UNetModel.forward
  ldm.modules.diffusionmodules.openaimodel.UNetModel.forward = sd_unet.UNetModel_forward
modules/sd_hijack_clip.py
FrozenCLIPEmbedderWithCustomWords()-> self.tokenizer = wrapped.tokenizer vocab = self.tokenizer.get_vocab()
sd_models.py load_model
sd_model.eval() model_data.set_sd_model(sd_model) sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings(force_reload=True) script_callbacks.model_loaded_callback(sd_model) sd_model.cond_stage_model_empty_prompt = get_empty_cond(sd_model) Model loaded in 3004.5s (calculate hash: 175.0s, load weights from disk: 0.2s, find config: 13.4s, create model: 0.4s, apply weights to model: 667.5s, apply half(): 298.5s, apply dtype to VAE: 15.6s, load VAE: 101.6s, load weights from state dict: 69.7s, move model to device: 21.8s, hijack: 1429.6s, load textual inversion embeddings: 114.8s, scripts callbacks: 53.8s, calculate empty prompt: 42.5s).
sdxl sd_xl_base.yaml
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        legacy: False

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          # crossattn and vector cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          # vector cond
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
sd_models.py load_model_weights()
sd_models_xl.extend_sdxl(model)
sd_models_xl.py
model.model.conditioning_key = "crossattn" discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization() sgm.models.diffusion.DiffusionEngine.get_learned_conditioning = get_learned_conditioning sgm.models.diffusion.DiffusionEngine.apply_model = apply_model sgm.models.diffusion.DiffusionEngine.get_first_stage_encoding = get_first_stage_encoding
generative-models中sgm代码结构和ldm一致,models下面是整体代码流程,modules下是具体的模块代码。
repositories/generative-models/sgm/models/diffusion.py DiffusionEngine()
model = instantiate_from_config(network_config)  # UNetModel
self.model = get_obj_from_str(<network wrapper class path>)(model, ...)  # -> OpenAIWrapper(diffusion_model=UNetModel)
self.denoiser = instantiate_from_config(denoiser_config)
self.conditioner = instantiate_from_config(conditioner_config)
self.first_stage_model = instantiate_from_config(first_stage_config).eval()
model.conditioner
GeneralConditioner( (embedders): ModuleList( (0): FrozenCLIPEmbedder( (transformer): CLIPTextModel( (text_model): CLIPTextTransformer( (embeddings): CLIPTextEmbeddings( (token_embedding): Embedding(49408, 768) (position_embedding): Embedding(77, 768) ) (encoder): CLIPEncoder( (layers): ModuleList( (0-11): 12 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (1): FrozenOpenCLIPEmbedder2( (model): CLIP( (transformer): Transformer( (resblocks): ModuleList( (0-31): 32 x ResidualAttentionBlock( (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) ) (ls_1): Identity() (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (mlp): Sequential( (c_fc): Linear(in_features=1280, out_features=5120, bias=True) (gelu): GELUHijack(approximate='none') (c_proj): Linear(in_features=5120, out_features=1280, bias=True) ) (ls_2): Identity() ) ) ) (token_embedding): Embedding(49408, 1280) (ln_final): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (2-4): 3 x ConcatTimestepEmbedderND( (timestep): Timestep() ) ) (wrapped): Module() )
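model.first_stage_model: (NOTE: the dump below is a duplicate of the model.conditioner (GeneralConditioner) output pasted by mistake; per first_stage_config this should be the AutoencoderKLInferenceWrapper printout — re-run print(model.first_stage_model) to replace it.)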
GeneralConditioner( (embedders): ModuleList( (0): FrozenCLIPEmbedder( (transformer): CLIPTextModel( (text_model): CLIPTextTransformer( (embeddings): CLIPTextEmbeddings( (token_embedding): Embedding(49408, 768) (position_embedding): Embedding(77, 768) ) (encoder): CLIPEncoder( (layers): ModuleList( (0-11): 12 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (1): FrozenOpenCLIPEmbedder2( (model): CLIP( (transformer): Transformer( (resblocks): ModuleList( (0-31): 32 x ResidualAttentionBlock( (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) ) (ls_1): Identity() (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (mlp): Sequential( (c_fc): Linear(in_features=1280, out_features=5120, bias=True) (gelu): GELUHijack(approximate='none') (c_proj): Linear(in_features=5120, out_features=1280, bias=True) ) (ls_2): Identity() ) ) ) (token_embedding): Embedding(49408, 1280) (ln_final): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (2-4): 3 x ConcatTimestepEmbedderND( (timestep): Timestep() ) ) (wrapped): Module() )
model.model:
OpenAIWrapper( (diffusion_model): UNetModel( (time_embed): Sequential( (0): Linear(in_features=320, out_features=1280, bias=True) (1): SiLU() (2): Linear(in_features=1280, out_features=1280, bias=True) ) (label_emb): Sequential( (0): Sequential( (0): Linear(in_features=2816, out_features=1280, bias=True) (1): SiLU() (2): Linear(in_features=1280, out_features=1280, bias=True) ) ) (input_blocks): ModuleList( (0): TimestepEmbedSequential( (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (1-2): 2 x TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=320, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Identity() ) ) (3): TimestepEmbedSequential( (0): Downsample( (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) (4): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=640, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): 
ModuleList( (0-1): 2 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=640, out_features=640, bias=False) (to_v): Linear(in_features=640, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=2560, out_features=640, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=2048, out_features=640, bias=False) (to_v): Linear(in_features=2048, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) (5): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=640, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Identity() ) (1): SpatialTransformer( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0-1): 2 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): 
Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=640, out_features=640, bias=False) (to_v): Linear(in_features=640, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=2560, out_features=640, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=2048, out_features=640, bias=False) (to_v): Linear(in_features=2048, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) (6): TimestepEmbedSequential( (0): Downsample( (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) (7): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) 
(transformer_blocks): ModuleList( (0-9): 10 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=1280, out_features=1280, bias=False) (to_v): Linear(in_features=1280, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=5120, out_features=1280, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=2048, out_features=1280, bias=False) (to_v): Linear(in_features=2048, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) (8): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Identity() ) (1): SpatialTransformer( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0-9): 10 x 
BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=1280, out_features=1280, bias=False) (to_v): Linear(in_features=1280, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=5120, out_features=1280, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=2048, out_features=1280, bias=False) (to_v): Linear(in_features=2048, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) (middle_block): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Identity() ) (1): SpatialTransformer( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0-9): 10 x BasicTransformerBlock( (attn1): 
CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=1280, out_features=1280, bias=False) (to_v): Linear(in_features=1280, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=5120, out_features=1280, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=2048, out_features=1280, bias=False) (to_v): Linear(in_features=2048, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) (2): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Identity() ) ) (output_blocks): ModuleList( (0-1): 2 x TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): 
Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0-9): 10 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=1280, out_features=1280, bias=False) (to_v): Linear(in_features=1280, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=5120, out_features=1280, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=2048, out_features=1280, bias=False) (to_v): Linear(in_features=2048, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) (2): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): 
Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=1280, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0-9): 10 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=1280, out_features=1280, bias=False) (to_v): Linear(in_features=1280, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=5120, out_features=1280, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=1280, out_features=1280, bias=False) (to_k): Linear(in_features=2048, out_features=1280, bias=False) (to_v): Linear(in_features=2048, out_features=1280, bias=False) (to_out): Sequential( (0): Linear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) (2): Upsample( (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (3): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1920, 640, kernel_size=(3, 
3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=640, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0-1): 2 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=640, out_features=640, bias=False) (to_v): Linear(in_features=640, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=2560, out_features=640, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=2048, out_features=640, bias=False) (to_v): Linear(in_features=2048, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) (4): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) 
(h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=640, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0-1): 2 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=640, out_features=640, bias=False) (to_v): Linear(in_features=640, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=2560, out_features=640, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=2048, out_features=640, bias=False) (to_v): Linear(in_features=2048, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) (5): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 960, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() 
(emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=640, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): SpatialTransformer( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0-1): 2 x BasicTransformerBlock( (attn1): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=640, out_features=640, bias=False) (to_v): Linear(in_features=640, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (ff): FeedForward( (net): Sequential( (0): GEGLU( (proj): Linear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): Linear(in_features=2560, out_features=640, bias=True) ) ) (attn2): CrossAttention( (to_q): Linear(in_features=640, out_features=640, bias=False) (to_k): Linear(in_features=2048, out_features=640, bias=False) (to_v): Linear(in_features=2048, out_features=640, bias=False) (to_out): Sequential( (0): Linear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) (2): Upsample( (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) (6): TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 960, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=320, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (7-8): 2 x TimestepEmbedSequential( (0): ResBlock( (in_layers): Sequential( (0): GroupNorm32(32, 640, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (h_upd): Identity() (x_upd): Identity() (emb_layers): Sequential( (0): SiLU() (1): Linear(in_features=1280, out_features=320, bias=True) ) (out_layers): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Dropout(p=0, inplace=False) (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) ) (out): Sequential( (0): GroupNorm32(32, 320, eps=1e-05, affine=True) (1): SiLU() (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) )
idetransformerclipclidiffusioncodesatapptokenctrivapsaopenairapctolegacyamlrtcwebwebui