o Ag0@sddlZddlmZddlmZddlmZddlZddlm Z ddl Z ddl m Z ddl mZddlmZmZddlmZdd lmZdd lmZGd d d e jjjZGd dde jZGddde jZGddde jZdddZ dddZ!ddZ"ddZ#GdddeeeZ$dS)N) OrderedDict)partial)Callable) checkpoint)set_module_tensor_to_device)apply_rotary_emb FluxPosEmbed) ModelMixin) ConfigMixin)FromOriginalModelMixincs>eZdZdZdZdededeffdd Zfdd ZZ S) MLPBlockzTransformer MLP block.in_dimmlp_dimdropoutcsdtj|||gtjd|d|D]}t|tjr/tj|j |j dur/tjj |j ddqdS)N)activation_layerinplacerư>std) super__init__nnGELUmodules isinstanceLinearinitxavier_uniform_weightbiasnormal_)selfrrrm __class__4/home/wyb/yanbin/ART_v1.0/custom_model_transp_vae.pyrs   zMLPBlock.__init__c s|dd}|dus|dkrr?rr LayerNormrArBrrUModulerTensorrdrCr&r&r$r'rD@s" rDcsleZdZdZeejddfdededededed ed ed e d e jj fffd d Z de j fddZZS)Encoderz?Transformer Model Encoder for sequence to sequence translation.rrE seq_length num_layersrGrHrrrIrJ.c s`tt||_t} t|D]} t||||||| d| <qt| |_ |||_ dS)Nencoder_layer_) rrrrOrrr.rD Sequentiallayersln) r"rjrkrGrHrrrIrJrnr8r$r&r'rns   zEncoder.__init__rRcCsPt|dkd|j|}||}|jD]}t|||}q||}|S)Nr,rS)rUrVrWrXrrnrro)r"rRr\r`lr&r&r'rds   zEncoder.forwardrer&r&r$r'riks*  rics&eZdZdfdd ZddZZS) ViTEncodervit-b/32c st||_|jdkrd}d}d}n |jdkrd}d}d}td||||d d d d |_td||_t|d |_|jdkrOd dl m }m }||j d}n|jdkrbd dl m }m} || jd}|jj|jdd\} } t| d ks{t| d krtd| td| ~dS)Nrri zvit-h/14i g)rjrkrGrHrrrIr)vit_b_32ViT_B_32_Weights)weights)vit_h_14ViT_H_14_WeightsF)r4zViT Encoder Missing keys: zViT Encoder Unexpected keys: )rrarchriencoderrrfc_infc_outZ%torchvision.models.vision_transformerryrzDEFAULTr|r}IMAGENET1K_SWAG_E2E_V1load_state_dictr1lenprint) r"r~chrnheadsryrzvitr|r}r5r6r$r&r'rs@       zViTEncoder.__init__cCs&||}|||}t|j|}|SN)rrrr)r"r`r\outr&r&r'rds   zViTEncoder.forward)rr)r<r=r>rrdrCr&r&r$r'rqs&rqcCbt|jdkr|j\}}}}tj|d||d}|St|jdkr/|j\}}}tj|d||d}|S)Nrwz$b c (h p1) (w p2) -> b (c p1 p2) h wp1p2r,z c (h p1) (w p2) -> (c p1 p2) h wrrXeinops rearranger` patch_sizebschwr&r&r'patchify rcCr)Nrwz$b (c p1 p2) h w -> b c (h p1) (w p2)rr,z (c p1 p2) h w -> c (h p1) (w p2)rrr&r&r' unpatchifyrrcCsg}gg}}t|jdD]} || durq|| \} } } } | d| d| d| df\} } } } |dd| | | | | f}|j\}}}||d}||td|d|d|j|j}|| |dddf<||}|d||d|d||d}}||| | | | fdd||| | | | fddqtj|dd dd}tj|dd}tj|dd}|||ffS)Nr*rrvr r@rW) r.rXr[appendprepare_latent_image_idsdevicedtyperUcatpermute) hidden_states use_layerslist_layer_boxHW pos_embedding token_listZcos_listZsin_list layer_idxx1y1x2y2 layer_tokenrrridsimage_rotary_embZpos_cosZpos_sinr&r&r'crop_each_layers*  $   &"$ rc Cst|d|dd}|dt|ddddf|d<|dt|ddddf|d<|j\}}}||||}|j||dS)Nr r,).r*).r rr)rUzerosarangerXr[to) batch_sizeheightwidthrrZlatent_image_idsZlatent_image_id_heightZlatent_image_id_widthZlatent_image_id_channelsr&r&r'rs&& rcs6eZdZfddZddZddZd dd ZZS) AutoencoderKLTransformerTrainingcsvtd|_d|_t|_tddd|_d|jvs d|jvr5tj t dd |j d d j d d d d|_dd}dS)Nrropei')rr)thetaaxes_dimrelabsrur r*g{Gz?rT) requires_gradcSs|D]}|q|S)zL Zero out the parameters of a module and return it. ) parametersdetachzero_)modulepr&r&r' zero_module s z>AutoencoderKLTransformerTraining.__init__..zero_module)rr decoder_archlayer_embeddingrqdecoderrrr ParameterrUemptyZ max_layersr!)r"rr$r&r'rs * z)AutoencoderKLTransformerTraining.__init__cCs|j\}}}}}gg} } t|D]_} || } d|jvrFtj|| |jd} d|jvr1d| | dk<d|jvs;d|jvrF| |jdd| f} d|jvrVdgt|| || <t| || || |||j \} }| | | |q| | fS)Nr)rrr rrr) rXr.rrUtensorrrrrrr)r"z_2dboxrr]r_Trrzr\b_zZ _use_layerscisr&r&r'encodes          z'AutoencoderKLTransformerTraining.encodecCst|}tjd|||dj|djd}d|dddddf<g}t|D]} g} ||| d|| d} d} tt|| D]i} || | dkrU| | qC|| | \}}}}|d|d|d|df\}}}}||||}| | | |}t j |d||||d}t |}| }||dd||||f<| || |7} qCtj| d d } | | q%tj|dd }|S) Nrwrrrvr,rz(h w) c -> c h w)rrr*r)rrUrrrr.r unsqueezesqueezerclonerrrstack)r"rr\rrrr]padr`r_xr current_indexrrrrrZx1_tokZy1_tokZx2_tokZy2_tokZ token_lengthtokenspixels unpatchedr&r&r'decode&s2   $   z'AutoencoderKLTransformerTraining.decodeNc Cs|ddd}|ptt|jdg}||||\}}|jdd\}}|||||d|d}|jddks@J|jt|dd}|ddddf|ddddf} } | | fS)Nrr*r rzc t h w -> t c h wr,) rZrlistr.rXrrrr) r"rrrrr\rrx_hatZ x_hat_rgbZ x_hat_alphar&r&r'rdBs*z(AutoencoderKLTransformerTraining.forwardr)r<r=r>rrrrdrCr&r&r$r'rs  r)r)%r collectionsr functoolsrtypingrrUtorch.nnr torchvisiontorch.utils.checkpointraccelerate.utilsrZdiffusers.models.embeddingsrrZdiffusers.models.modeling_utilsr Zdiffusers.configuration_utilsr Zdiffusers.loadersr opsmiscMLPr rgrDrirqrrrrrr&r&r&r's,         .+* .