Diffusion Models视频生成 前言：目前开源的DiT视频生成模型不是很多，Open-Sora是开发者生态最好的一个，涵盖了DiT、时空DiT、3D VAE、Rectified Flow、因果卷积等Diffusion视频生成的经典知识点。本篇博客从Open-Sora的代码出发，深入解读背后的原理。 目录
3D VAE原理
代码剖析
2D VAE
时间VAE
因果3D卷积 3D VAE原理
之前绝大多数都是2D VAE，特别是SDXL的VAE相当好用，很多人都拿来直接用了。但是在DiT-based的模型中，时间序列上如果再不做压缩的话就已经很难训得动了。因此非常有必要在时间序列上进行压缩，3D VAE应运而生。
Open-Sora的方案是在2D VAE的基础上再添加一个时间VAE。相比于EasyAnimate和CogVideoX方案的Full Attention存在劣势，但是可以充分利用2D VAE的权重，成本更低。
代码剖析
2D VAE
# 2D (spatial) VAE config: reuses the pretrained SDXL VAE shipped with
# PixArt-Sigma instead of training a spatial autoencoder from scratch.
# `micro_batch_size` / `local_files_only` are defined elsewhere in the config.
vae_2d = dict(
    type="VideoAutoencoderKL",
    from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
    subfolder="vae",
    micro_batch_size=micro_batch_size,
    local_files_only=local_files_only,
)
# Temporal VAE config: trained from scratch (no pretrained checkpoint),
# stacked on top of the 2D VAE to compress the time dimension.
vae_temporal = dict(
    type="VAE_Temporal_SD",
    from_pretrained=None,
)
@MODELS.register_module()
class VAE_Temporal(nn.Module):
    """Temporal VAE: compresses the time axis of a latent video.

    Operates on the latents produced by the 2D (spatial) VAE and
    downsamples time by ``2 ** sum(temporal_downsample)`` (4x with the
    defaults). Encoder/decoder follow the MAGVIT design and use causal
    3D convolutions so early frames never see later ones.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=4,
        embed_dim=4,
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(True, True, False),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()

        # Total temporal compression: one factor of 2 per True entry.
        self.time_downsample_factor = 2 ** sum(temporal_downsample)
        # self.time_padding = self.time_downsample_factor - 1
        self.patch_size = (self.time_downsample_factor, 1, 1)
        self.out_channels = in_out_channels

        # NOTE: following MAGVIT, conv in bias=False in encoder first conv
        # Encoder emits 2 * latent_embed_dim channels (mean + logvar).
        self.encoder = Encoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim * 2,
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )
        self.quant_conv = CausalConv3d(2 * latent_embed_dim, 2 * embed_dim, 1)

        self.post_quant_conv = CausalConv3d(embed_dim, latent_embed_dim, 1)
        self.decoder = Decoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim,
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )

    def get_latent_size(self, input_size):
        """Map an input (T, H, W) size to the latent size.

        ``None`` entries are passed through; the time axis (i == 0) is
        padded up to a multiple of the downsample factor before division.
        """
        latent_size = []
        for i in range(3):
            if input_size[i] is None:
                lsize = None
            elif i == 0:
                # Round T up to a multiple of time_downsample_factor.
                time_padding = (
                    0
                    if (input_size[i] % self.time_downsample_factor == 0)
                    else self.time_downsample_factor - input_size[i] % self.time_downsample_factor
                )
                lsize = (input_size[i] + time_padding) // self.patch_size[i]
            else:
                lsize = input_size[i] // self.patch_size[i]
            latent_size.append(lsize)
        return latent_size

    def encode(self, x):
        """Encode video latents ``x`` (B, C, T, H, W) into a posterior.

        Pads the time axis at the front so T is divisible by the
        downsample factor, then returns a DiagonalGaussianDistribution
        built from the (mean, logvar) channels.
        """
        time_padding = (
            0
            if (x.shape[2] % self.time_downsample_factor == 0)
            else self.time_downsample_factor - x.shape[2] % self.time_downsample_factor
        )
        # Pad at the front of the time axis (causal convention).
        x = pad_at_dim(x, (time_padding, 0), dim=2)
        encoded_feature = self.encoder(x)
        moments = self.quant_conv(encoded_feature).to(x.dtype)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z, num_frames=None):
        """Decode latent ``z`` back to video; crop the padded lead frames.

        ``num_frames`` is the original (pre-padding) frame count, used to
        recompute how many leading frames were padding.
        """
        time_padding = (
            0
            if (num_frames % self.time_downsample_factor == 0)
            else self.time_downsample_factor - num_frames % self.time_downsample_factor
        )
        z = self.post_quant_conv(z)
        x = self.decoder(z)
        # Drop the frames that correspond to the front padding added in encode().
        x = x[:, :, time_padding:]
        return x

    def forward(self, x, sample_posterior=True):
        """Full autoencoding pass.

        Returns ``(recon_video, posterior, z)`` where ``z`` is a sample
        from the posterior (or its mode when ``sample_posterior`` is False).
        """
        posterior = self.encode(x)
        if sample_posterior:
            z = posterior.sample()
        else:
            z = posterior.mode()
        recon_video = self.decode(z, num_frames=x.shape[2])
        return recon_video, posterior, z
因果3D卷积
class CausalConv3d(nn.Module):
    """3D convolution that is causal along the time axis.

    All temporal padding is applied at the FRONT of the time dimension,
    so output frame t depends only on input frames <= t. Height/width are
    padded symmetrically ("same" padding), which requires odd spatial
    kernel sizes.
    """

    def __init__(
        self,
        chan_in,
        chan_out,
        kernel_size: Union[int, Tuple[int, int, int]],
        pad_mode="constant",
        strides=None,  # allow custom stride
        **kwargs,
    ):
        super().__init__()
        kernel_size = cast_tuple(kernel_size, 3)
        time_kernel_size, height_kernel_size, width_kernel_size = kernel_size

        # Symmetric spatial padding only works for odd kernels.
        assert is_odd(height_kernel_size) and is_odd(width_kernel_size)

        dilation = kwargs.pop("dilation", 1)
        stride = strides[0] if strides is not None else kwargs.pop("stride", 1)

        self.pad_mode = pad_mode
        # Front-only temporal padding; the (1 - stride) term compensates
        # when a temporal stride > 1 is used.
        time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
        height_pad = height_kernel_size // 2
        width_pad = width_kernel_size // 2

        self.time_pad = time_pad
        # F.pad ordering: last dim first -> (W_left, W_right, H_top, H_bottom, T_front, T_back).
        # T_back is 0: no padding after the last frame (causality).
        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)

        stride = strides if strides is not None else (stride, 1, 1)
        dilation = (dilation, 1, 1)
        self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)

    def forward(self, x):
        """Pad causally, then convolve. ``x`` is (B, C, T, H, W)."""
        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
        x = self.conv(x)
        return x