MgdZddlmZmZmZmZmZmZmZddl Z ddl m Z ddl m Z ddlmZmZddlmZGdd e jZGd d ee ZdS) a References: - VectorQuantizer2: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L110 - GumbelQuantize: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L213 - VQVAE (VQModel): https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/models/autoencoder.py#L14 )AnyDictListOptionalSequenceTupleUnionN)PyTorchModelHubMixin)DecoderEncoder)VectorQuantizer2c eZdZ d fd Zd!dZdejfdZ d"dejdee e e e e e ffdee deejfdZ d!deejdede eejejffdZ d!deejdede eejejffdZ d#dee e e e e e ffdeejfdZd$deeefffd ZxZS)%VQVAE ?F?r r rr Tc t| |_||c|_|_t |||ddddd} | ddtd ddi| |_td i| |_ ||_ dt| dd z z|_ t||j||| | || |_t j|j|j|d |dz |_t j|j|j|d |dz |_|jr4|d |DdSdS)Nr)r r rrrrT)dropoutch z_channels in_channelsch_multnum_res_blocksusing_sa using_mid_sadouble_zFr&r ) vocab_sizeCvae using_znormbetadefault_qresi_counts v_patch_nums quant_resishare_quant_resi)stridepaddingc8g|]}|dSF)requires_grad_).0ps //home/notantonvoron/switti_demo/models/vqvae.py z"VQVAE.__init__..Ls& @ @ @Q  e $ $ @ @ @)super__init__ test_modeVr,dictpopr encoderr decoderr+len downsamplerquantizetorchnnConv2d quant_convpost_quant_conveval parameters)selfr+r$r#r"r.r- quant_conv_ksr1r2r/r0r@ddconfig __class__s r:r?zVQVAE.__init__s "&  !#      Z&&&::::: **** $HY$7 8 81 <=*:!#!5%!- + + +   (// Ity-=TUCU*   %x Ity-=TUCU /   > A IIKKK @ @doo.?.? @ @ @ @ @ @ A Ar<ctj|||||\}}}|||||fS)N) ret_usages)rforwardrHrLrDrErM)rPinprUf_hatusagesvq_losss r:rVz VQVAE.forwardOsp  !% OODLL-- . .:"/" " vw||D007788&'IIr<rXcz|||ddS)Nr rErMclamp_)rPrXs r: fhat_to_imgzVQVAE.fhat_to_imgXs2||D007788??AFFFr<Ninp_img_no_gradr0 noise_stdreturnc|||}|j|d||S)NF)to_fhatr0ra)rLrDrHf_to_idxBl_or_fhat)rPr`r0rafs r: img_to_idxBlzVQVAE.img_to_idxBl[sI OODLL99 : :}// u<90   r< ms_idx_Bl same_shapec \|djd}g}|D]}}|jd}t|dz}||j|dd||j||~||||S)Nrr rr) ms_h_BChwall_to_max_scalelast_one) shaperoundappendrH embedding transposeviewr, embed_to_img) rPrhrirmBrkidx_Bllpns r: idxBl_to_imgzVQVAE.idxBl_to_imgfs aL q !   F QAq#vB    ''//1aaB++       *x!   r<rkrlc |rWj||dddSfdj||dDS)NT)rlrmr\r cg|]>}|dd?Sr\r r]r8rXrPs r:r;z&VQVAE.embed_to_img..sS T11%8899@@QGGr<F)rErMrH embed_to_fhatr^)rPrkrlrms` r:rtzVQVAE.embed_to_imgws  <<$$M//!4Dt0 fRmm  !]880@59 r<c2|}j|d|}|rB|dddSfd|DS)NT)rdr0r\r cg|]>}|dd?Sr|r]r}s r:r;z2VQVAE.img_to_reconstructed_img..sS T11%8899@@QGGr<)rLrDrHrerErMr^)rPxr0rmrf ls_f_hat_BChws` r:img_to_reconstructed_imgzVQVAE.img_to_reconstructed_imgs OODLLOO , , 88 t,9    << 4 4]25F G GHHOOPRTUVV V* r< state_dictcd|vr;|djd|jjjdkr|jj|d<t|||S)Nzquantize.ema_vocab_hit_SVr)rstrictassign)rnrHema_vocab_hit_SVr>load_state_dict)rPrrrrSs r:rzVQVAE.load_state_dictsw ': 5 U67=a@}-3A67 U 7;m6TJ2 3ww&&!&'   r<) rrrrrFrrrrrTr6)NN)NF)TF)__name__ __module__ __qualname__r?rVrITensorr_rrr intrfloatr LongTensorrgboolryrtrrstrrr __classcell__rSs@r:rrsG  68A8A8A8A8A8AvJJJJGGGGG IM%)     xc5c?.B(CDE  E?  e      IN  el+ 9= tEL!5</ 0    $OTel+?C tEL!5</ 0,IM xc5c?.B(CDE el  $   $sCx.           r<rc,eZdZ dfd ZxZS) VQVAEHFrrrTr r rrrr rrcVt||||||dS)N)r+r$r#r@r2r0)r>r?)rPr+r$r#r@r2r0rSs r:r?zVQVAEHF.__init__sB !!-%      r<)rrrTrr)rrrr?rrs@r:rrsQ 7          r<r)__doc__typingrrrrrrr rItorch.nnrJhuggingface_hubr basic_vaer r quantrModulerrr=r<r:rs EDDDDDDDDDDDDDDDDD 000000''''''''######R R R R R BIR R R h     e)     r<