a ee@sddlZddlZddlmZddlmZmZmZddl Z ddl m Z ddl m m ZddlmZmZddlmZddlmZddlZddlZddlmZd0d d Zd1ddZddZddZddZddZGddde j Z!Gddde j Z"Gddde j Z#Gddde j Z$Gd d!d!e j Z%Gd"d#d#e j Z&Gd$d%d%e j Z'Gd&d'd'e j(Z)Gd(d)d)e j Z*Gd*d+d+e j Z+Gd,d-d-e j Z,Gd.d/d/eeZ-dS)2N isfunction)OptionalAnyList) rearrangerepeat) ConfigMixin) ModelMixin) orbit_camerahTFc Cs||}g}t||||D]H}t||dd} |rX| dd9<| ddg| ddg<|| q|r|t|dttj|dd S)N)radiusr)axis) nparanger appendflatten zeros_liketorch from_numpystackfloat) num_frames elevationZ azimuth_startZ azimuth_spanZ blender_coord extra_viewZ angle_gapcamerasazimuthposer!"/data/tang/glrm/mvdream/mv_unet.py get_camerasr#'cCs|s|d}tt| tjd|tjd|j|jd}|dddf|d}tjt |t |gdd}|drtj|t |ddddfgdd}nt |d |d }|S) aX Create sinusoidal timestep embeddings. :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional. :param dim: the dimension of the output. :param max_period: controls the minimum frequency of the embeddings. :return: an [N x dim] Tensor of positional embeddings. rr)startenddtype)deviceNrdimr zb -> b d)d) rexpmathlogrfloat32tor(catcossinrr) timestepsr* max_period repeat_onlyhalffreqsargs embeddingr!r!r"timestep_embedding*s&  r;cCs|D]}|q|S)z< Zero out the parameters of a module and return it. ) parametersdetachzero_)modulepr!r!r" zero_moduleFs rAcOsZ|dkrtj|i|S|dkr0tj|i|S|dkrHtj|i|Std|dS)z4 Create a 1D, 2D, or 3D convolution module. r runsupported dimensions: N)nnConv1dConv2dConv3d ValueErrordimsr9kwargsr!r!r"conv_ndOsrLcOsZ|dkrtj|i|S|dkr0tj|i|S|dkrHtj|i|Std|dS)z8 Create a 1D, 2D, or 3D average pooling module. r rrBrCN)rD AvgPool1d AvgPool2d AvgPool3drHrIr!r!r" avg_pool_nd\srPcCs|dur |St|r|S|SNr)valr+r!r!r"defaultisrScs$eZdZfddZddZZS)GEGLUcs tt||d|_dS)Nr)super__init__rDLinearproj)selfdim_indim_out __class__r!r"rVps zGEGLU.__init__cCs&||jddd\}}|t|S)Nrrr))rXchunkFgelu)rYxgater!r!r"forwardtsz GEGLU.forward__name__ __module__ __qualname__rVrc __classcell__r!r!r\r"rTos rTcs&eZdZd fdd ZddZZS) FeedForwardNFcshtt||}t||}|ss z7MemoryEfficientCrossAttention.forward..) attn_biasopcsH|d|jdjjddddj|jdjSrrrrr!r"rs rrB)rrSr}rrrrrmapxformersopsmemory_efficient_attentionrr~rrr{r|rr) rYracontextq token_lenZ context_ipZk_ipZv_ipkv_outZout_ipr!rr"rcsH                  z%MemoryEfficientCrossAttention.forward)Nrwrxrkrr )Nrdr!r!r\r"rvs"rvcs(eZdZd fdd Zd dd ZZS) BasicTransformerBlock3DrkTrr c sntt|d|||d|_t|||d|_t|||||||d|_t||_ t||_ t||_ dS)N)rrr{r|rs)rsrr)rrr{r|rsr}r~) rUrVrvattn1riffattn2rD LayerNormnorm1norm2norm3) rYr*n_headsd_headrrsZgated_ffr}r~r\r!r"rVs*   z BasicTransformerBlock3D.__init__NcCslt|d|d}|j||dd|}t|d|d}|j|||d|}||||}|S)Nz(b f) l c -> b (f l) c)f)rzb (f l) c -> (b f) l c)rrrrrrrr)rYrarrr!r!r"rc s zBasicTransformerBlock3D.forward)rkTrr )Nr rdr!r!r\r"rs #rcs(eZdZd fdd Zd ddZZS) SpatialTransformer3Dr rkrc stttsg||_tjd|ddd|_t||_ t fddt |D|_ t t||_dS)N gư>T) num_groups num_channelsepsaffinec s&g|]}t|dqS))rrsr}r~)r).0r+rrrsrtr}r~rr!r" -s z1SpatialTransformer3D.__init__..)rUrV isinstancelist in_channelsrD GroupNormnormrWproj_in ModuleListrangetransformer_blocksrAproj_out) rYrrrrdepthrsr}r~r\rr"rVs   zSpatialTransformer3D.__init__Nc Cst|ts|g}|j\}}}}|}||}t|d}||}t|jD]\} } | ||| |d}qN| |}t|d||d}||S)Nzb c h w -> b (h w) c)rrzb (h w) c -> b c h w)hw) rrrrrrr enumeraterr) rYrarrrcrrx_iniblockr!r!r"rc>s    zSpatialTransformer3D.forward)r rkrr )Nr rdr!r!r\r"rs (rcs,eZdZdddfdd ZddZZS)PerceiverAttentionrxrw)r|r{cs~t|d|_||_||_||}t||_t||_tj ||dd|_ tj ||ddd|_ tj ||dd|_ dS)NgFryr) rUrVscaler|r{rDrrrrWrto_kvr)rYr*r|r{rtr\r!r"rVPs    zPerceiverAttention.__init__c s|}|}|j\}}|}tj||fdd}|jddd\}}tfdd|||f\}}}dt t j } || ||  dd} tj | dd| j} | |} | dddd |d} | S) z Args: x (torch.Tensor): image features shape (b, n1, D) latent (torch.Tensor): latent features shape (b, n2, D) r)rrcs8||jdjdddj|jddS)Nr rr)rrr{ transposerrrr!r"rpsz,PerceiverAttention.forward..r rrB)rrrrrr1rr^rr-sqrtr|rsoftmaxrtyper'rrr) rYralatentslrrZkv_inputrrrweightrr!rr"rc^s       zPerceiverAttention.forwardrdr!r!r\r"rOsrcs&eZdZd fdd Zd d ZZS) Resamplerrwrxrjc stttd|||d|_t|||_t|||_ t ||_ t g|_ t|D]Z} |j t t|||dtt |tj|||ddttj|||ddgqddS)Nr g?)r*r|r{Fry)rUrVrD ParameterrrandnrrWrrrnorm_outrlayersrrrrmrn) rYr*rr|r{ num_queries embedding_dim output_dimff_multrr\r!r"rVs&     zResampler.__init__cCs^|j|ddd}||}|jD]"\}}||||}|||}q&||}||S)Nrr )rrsizerrrr)rYrarattnrr!r!r"rcs  zResampler.forward)rrwrxrrwrrrjrdr!r!r\r"rs!rc@seZdZdZdddZdS)CondSequentialzt A sequential module that passes timestep embeddings to the children that support it as an extra input. Nr cCsF|D]<}t|tr|||}qt|tr8||||d}q||}q|S)Nr)rResBlockr)rYraembrrlayerr!r!r"rcs    zCondSequential.forward)Nr )rerfrg__doc__rcr!r!r!r"rsrcs*eZdZdZd fdd ZddZZS) UpsampleaA An upsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions. rNr csFt||_|p||_||_||_|rBt||j|jd|d|_dS)NrBpadding)rUrVchannels out_channelsuse_convrJrLconv)rYrrrJrrr\r!r"rVs  zUpsample.__init__cCst|jd|jksJ|jdkrPtj||jd|jdd|jddfdd}ntj|ddd}|jrp||}|S)Nr rBrrjnearest)mode) scale_factorr)rrrJr_ interpolaterrrur!r!r"rcs & zUpsample.forward)rNr rerfrgrrVrcrhr!r!r\r"rs rcs*eZdZdZd fdd ZddZZS) DownsampleaD A downsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions. rNr cszt||_|p||_||_||_|dkr2dnd}|rVt||j|jd||d|_n |j|jksfJt|||d|_dS)NrBr)r rr)strider) kernel_sizer) rUrVrrrrJrLrrP)rYrrrJrrrr\r!r"rVs"   zDownsample.__init__cCs|jd|jksJ||S)Nr )rrrrur!r!r"rcszDownsample.forward)rNr rr!r!r\r"rsrcs*eZdZdZd fdd ZddZZS) ra A residual block that can optionally change the number of channels. :param channels: the number of input channels. :param emb_channels: the number of timestep embedding channels. :param dropout: the rate of dropout. :param out_channels: if specified, the number of out channels. :param use_conv: if True and out_channels is specified, use a spatial convolution instead of a smaller 1x1 convolution to change the channels in the skip connection. :param dims: determines if the signal is 1D, 2D, or 3D. :param up: if True, use this block for upsampling. :param down: if True, use this block for downsampling. NFrc spt||_||_||_|p"||_||_||_t t d|t t |||jddd|_ |pd| |_|rt|d||_t|d||_n2| rt|d||_t|d||_nt|_|_t t t||rd|jn|j|_t t d|jt tj|dtt ||j|jddd|_|j|krrcs,eZdZdZdfd d Zdd dZZS)MultiViewUNetModela The full multi-view UNet model with attention, timestep embedding and camera embedding. :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample rates at which attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x downsampling, attention will be used. :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param conv_resample: if True, use learned convolutions for upsampling and downsampling. :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this model will be class-conditional with `num_classes` classes. :param num_heads: the number of attention heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use a fixed channel width per attention head. :param num_heads_upsample: works with num_heads to set a different number of heads for upsampling. Deprecated. :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially increased efficiency. :param camera_dim: dimensionality of camera input. rr rrjrwTrNrFr ?c& st|dusJ|dkr"| }| dkr:| dks:Jd| dkrR| dksRJd|_|_|_|_t|trt||g_ nt|t|krt d|_ durttj ksJt t fddt tsJtdd|d|_|_|_| _| _| _| _|_|du_|_|_jd krht|d d d |d |d d_|d }tt||tt||_ |dur|d }tt||tt||_!jdur^tjtrt"j|_#nhjdkrtd|_#nLjdkrX|dus,Jttt||tt||_#nt t$t%t&| ||dddg_'|_(|g}|}d}t)|D]>\}}t j |D]} t*|||||| |dg}!||}||vr:| dkr|| }"n || } | }"dus| |kr:|!+t,|| |"||jjdj'+t%|!j(|7_(|+|q|t|dkr|}#j'+t%|rt*||||#| |ddnt-|| | |#d|#}|+||d9}j(|7_(q| dkr|| }"n || } | }"t%t*|||| |dt,|| |"||jjdt*|||| |d_.j(|7_(t$g_/t0t)|dddD](\}}t j |dD]}$|1}%t*||%||||| |dg}!||}||vr| dkr|| }"n || } | }"dus|$|kr|!+t,|| |"||jjd|rr|$j |krr|}#|!+|rXt*||||#| |ddnt2|| | |#d|d}j/+t%|!j(|7_(qqltt3d|tt4t&| ||ddd_5jrtt3d|t&| ||d_6dS)Nrz3Either num_heads or num_head_channels has to be setzprovide num_res_blocks either as an int (globally constant) or as a list/tuple (per-level) with the same length as channel_multcsj||kSrQ)num_res_blocks)rnum_attention_blocksrYr!r"rz-MultiViewUNetModel.__init__..z7Constructor of UNetModel received num_attention_blocks=z;. This option has LESS priority than attention_resolutions zz, i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, attention will still not be set.rrjrx i)r*rr|r{rrrr continuousr sequentialrBr)rrJr)rrr}r~T)rrJrr)rJrr)rJr)rrJrrr)7rUrV image_sizermodel_channelsrrrlrrrHallrrprintattention_resolutionsrs channel_mult conv_resample num_classes num_headsnum_head_channelsnum_heads_upsamplepredict_codebook_idsr}r~r image_embedrDrmrWr time_embed camera_embed Embedding label_embrrrL input_blocksZ _feature_sizerrrrr middle_block output_blocksrpoprrrAr id_predictor)&rYrrrrrrrsrrrJrrrrrZresblock_updownZtransformer_depthrZn_embedr Zadm_in_channelsZ camera_dimr}r~rKtime_embed_dimZinput_block_chanschdslevelrqnrrr|out_chrZichr\r r"rVs                                          zMultiViewUNetModel.__init__c Ks|jd|dksJd|du|jduks4Jdg} t||jdd|j} || } |jdur|duspJ|jd|jdksJ| ||} |dur| ||} |j dkr|||dd|ddddddf<| |} t || fd}|}|j D] }||| ||d}| |q|j|| ||d}|jD],}t j || gdd }||| ||d}q:||j}|jr||S||SdS) a Apply the model to an input batch. :param x: an [(N x F) x C x ...] Tensor of inputs. F is the number of frames (views). :param timesteps: a 1-D batch of timesteps. :param context: conditioning plugged in via crossattn :param y: an [N] Tensor of labels, if class-conditional. :param num_frames: a integer indicating number of frames for tensor reshaping. :return: an [(N x F) x C x ...] Tensor of outputs. F is the number of frames (views). rz1input batch size must be dividable by num_frames!Nzs>           Z.;3-!"d