8g`,ddlZddlmZddlmZmZddlZddlZddl Z ddl m Z ddl m cm ZddlmZmZddlmZmZmZmZmZmZmZddlmZddlmZdd lmZm Z m!Z!m"Z"m#Z#dd l$m%Z%m&Z&m'Z'dd l(m)Z)ddl m cm Zdd l*m+Z+m,Z,ddl-Z.ej^e0Z1gd Z2gdZ3de jhde jhfdZ5ddZ6Gdde jnZ8Gdde jnZ9Gdde jnZ:Gdde jnZ;y)N) defaultdict)field dataclass) rearrangerepeat)AnyDictListOptionalTupleUnionCallable)partial) DecoderBlock) AttnBlockCrossAttnBlockMlp ResidualBlockMlp_res) PoseEmbeddingpose_encoding_to_cameracamera_to_pose_encoding) checkpoint)matrix_to_quaternionquaternion_to_matrix)g ףp= ?gv/?gCl?)gZd;O?gy&1?g? quaternionsreturncFtj|dddfdk| |S)a Convert a unit quaternion to a standard form: one in which the real part is non negative. Args: quaternions: Quaternions with real part first, as tensor of shape (..., 4). Returns: Standardized quaternions as tensor of shape (..., 4). .rr)torchwhere)rs 3/nas3/zsz/FLARE_huggingface/mast3r/vgg_pose_head.pystandardize_quaternionr# s+ ;;{3!8,q0;, LLc||jddz}|d|dz|dz}|dz dz jd|zd|z j}|S)N).rr).rr).r(rr() transposeclampacos_)R1R2epsR_difftraceangles r"rotation_distancer2.sb  R# #F 7OF7O +F7O ;EAgq[  3qu - 3 3 5E Lr$c^eZdZfdZdZdZdZejdZ ejddZ ejdZ dZ dd Z d Zejd Zejdd Zejdd ZxZS)SimpleVQAutoEncoderc  t|tjt d|dz|dzdgt dDcgc]}t |dz|dz|dzdc}zt |dz|dzddgz|_tjt d|dz|dzdgt dDcgc]}t |dz|dz|dzdc}zt |dz|dzddgz|_ycc}wcc}w)Nr(r)drop) super__init__nn ModuleListrrangerencoderdecoder)self hidden_size_ __class__s r"r;zSimpleVQAutoEncoder.__init__6s }} KM;q=q 9 :DIJKDL=M~W[QR]T_`aTacnopcpwx=y=M M Q A s ; < =  }} k!m[] ; <GLMNGO@PBC TU WbcdWdfqrsfsz{@|@P PTWXcdeXegrstgtvw~T@SA AB =M@Ps C; 6D cJ|j|}|j|}|SN)encodedecode)rAxsz_eouts r"forwardzSimpleVQAutoEncoder.forward?s"kk"okk# r$c8|jD] }||} |SrF)r?)rAxr?s r"rGzSimpleVQAutoEncoder.encodeDs%|| G A r$c8|jD] }||} |SrF)r@)rAz_qr@s r"rHzSimpleVQAutoEncoder.decodeJs%|| G#,C   r$cR|j|}|j|\}}}|SrF)rG quantizer)rArIrJrCcodes r" get_codeszSimpleVQAutoEncoder.get_codesSs(kk"o^^C( 1d r$ct|jdsJ|j|}|jj|||\}}||fS)Nget_soft_codes)temp stochastic)hasattrrRrGrV)rArIrWrXrJ soft_coderSs r"rVz"SimpleVQAutoEncoder.get_soft_codesYsNt~~'7888kk"o..77$S]7^ 4$r$c^|jj|}|j|}|SrF)rR embed_coderH)rArSrPdecodeds r" decode_codezSimpleVQAutoEncoder.decode_codeas)nn''-++c"r$cX|dzdz}|dzdz}tj|dd}||fS)Ng?rr)r r*)rAxs_realxs_recons r"get_recon_imgsz"SimpleVQAutoEncoder.get_recon_imgsgs;C-#%c>C';;xA.  r$ctj||d}|r"||jdz|jdz}|}||dS)Nmean) reductionrr) loss_total loss_recon)Fmse_lossshape)rArKrIvalidrgrfs r" compute_lossz SimpleVQAutoEncoder.compute_lossosOZZR6: #bhhqk1BHHQK?J $$  r$cB|jjjSrF)r@conv_outweight)rAs r"get_last_layerz"SimpleVQAutoEncoder.get_last_layers||$$+++r$c8|jj|SrF)rRembed_code_with_depth)rArSs r"get_code_emb_with_depthz+SimpleVQAutoEncoder.get_code_emb_with_depths~~33D99r$cb|jj|||}|j|}|S)a Use partial codebooks and decode the codebook features. If decode_type == 'select', the (code_idx)-th codebook features are decoded. If decode_type == 'add', the [0,1,...,code_idx]-th codebook features are added and decoded. )rRembed_partial_coderH)rArScode_idx decode_typerPr]s r"decode_partial_codez'SimpleVQAutoEncoder.decode_partial_codes/nn//h L++c"r$cN|j|}|j|||}|S)z> Reconstuct an input using partial codebooks. )rTrx)rArIrvrwrSrKs r"forward_partial_codez(SimpleVQAutoEncoder.forward_partial_codes+ ~~b!&&tX{C r$)g?F)NF)select)__name__ __module__ __qualname__r;rLrGrHr no_gradrTrVr^rbrlrprsrxrz __classcell__rDs@r"r4r45sB  U]]_ U]]_U]]_ ! ,U]]_::U]]_U]]_r$r4ceZdZ d dedeffd Zdddddej fdZdZd ejd ejfd Z d Z xZ S)CameraPredictorr8Nz_dim z_dim_inputc t|| |_||_||_||_| |_| |_|j dk(rd|_|j dk(rd|_tj|dd|_ tj|dd|_ t|j||jzd zd |_tjd ||_tj"t%j&d d d ||_tj"t%j&d d d ||_tj"t%j&d d d ||_tj"t%j&d d d ||_t1|||d |_t1||d z||jzd |_tj6tj||tj8|_tj<t?|jD cgc]} tA|||tjB!c} |_"tj<t?|jD cgc]} tG||||c} |_$tj<t?d Dcgc]7}tK|d|dtMtjdd|j9c}|_'tj6t?| D cgc]} tA|||tjB!c} |_(d|_)tjTjW|j(ddtXfdtZffD]>\}}|j]|t%j^|jad dd d d@ycc} wcc} wcc}wcc} w)NabsT_quaR_OneFLabsT_quaR_logFL Fư>elementwise_affiner.Tr( target_dimn_harmonic_functions append_inputrr in_featureshidden_features out_featuresr7 mlp_ratio attn_class)r )r.)rqkv_bias norm_layernorm_memrope皙?std _resnet_mean _resnet_std persistent)1r:r;cfg hooks_idx att_depth down_sizepose_encoding_typerrr< LayerNormnorm norm_inputr embed_poseLinear pose_proj Parameterr zeros pose_tokenpose_token_ref feat0_token feat1_tokenrinput_transform pose_branch SequentialGELU ffeat_updaterr=r>rMultiheadAttentionself_attr cross_attrr dec_blockstrunkgammainitnormal_ _RESNET_MEAN _RESNET_STDregister_buffer FloatTensorview)rArrB num_headsrrrrr trunk_depthrrrrCinamevaluerDs r"r;zCameraPredictor.__init__se """"4  " "&7 7DO  " "&7 7DO LLDQ ,,{tQUV(kT__>\ab=bqu 7K8,,u{{1aK'HI ll5;;q!Q +LM<< Aq![(IJ<< Aq![(IJ"{Kfqxyz#[1_S^aeapapSpwx  ]]299[++NPRPWPWPYZ t~~. +yIRTRgRgh   _deieses_t uZ[^Ki9 U u --1X) bIY`acamamswYxDHOSOXOX Y) ]]{++yIRTRgRgh    T2+\:]Kdetachrrrrrrrcudaampautocastfloat32r)rA batch_sizeiters pos_encodinginterm_feature1interm_feature2rrrgb_feat_init1rgb_feat_init2rgb_featBSC pred_pose_enc rgb_feat_initpred_cameras_listiter_num pose_embeddeltadelta_pred_pose_enc delta_feat pred_camerass r"rLzCameraPredictor.forwards')(!__T-A-A.QRBS-TUq!__T-A-A.QRBS-TUq 66z>Sacoqvw!Q..1a Aq$//:==hooN  ( e  GH)002M7J 3J*,H!00A6!BQB$GHQrrTNzz(+H$$X.E"'->t->(>"? sDOO$556J))$))J*?@8KH),??M =0A5H((emm(L G6}Y`a $5$F! G G% G6!(** G Gs &H  H c|dk(r tjjddS|dk(r tjjddStd|dz* Load the backbone model. dinov2szfacebookresearch/dinov2dinov2_vits14_regdinov2bdinov2_vitb14_regz Backbone 'z' not implementedr hubloadNotImplementedErrorrAbackbones r" get_backbonezCameraPredictor.get_backbone"W y 99>>";=PQ Q  "99>>";=PQ Q% 8* (b s) p c)bsz(b s) p c -> b s p czb m p c -> b (m p) c)mpzb (m p) c -> b m p c)r catrrrreshaperjrpermuter ones_likerexpandrr>rrrr)rArrrrr rgb_feat0 rgb_feat1rCrrNrNrrPridxfeat_0 feat_otherss r"rz%CameraPredictor.get_2D_image_features0sg II~a0.2CD!LOOPUVY]YiYijkYlYoYopuYvv II~a0.2CD!LOOPUVY]YiYijkYlYoYopuYvv %I%%b>)//!"*=> %I%%b>)//!"*=> )tq))Y l[ 1$9$$ZJiooab6IJ H  R 7)//"#"6 7''1a IIaB # IIaAq !  %//!,//2LADDUK K IIaAq ! IIaB 99ZQ2^^ 1a__++AqsB; YY 3 3 : :1aR H*U[\]``afg 99j(3<^^ 1a( IC +AQ!LH)t}}S)(3H +AQ!LHad^F"1ab5/K#K1G1q5TUVK-$..-k6BK#K1G1q5TUVKyy(1ac6"2K!@aHH IAq!G$Aq  r$) rr8rrPrr8rNN) r|r}r~intr;r bfloat16rLrTensorrrrrs@r"rrs,  Md  MdMd^)*ddhrvDMM6+p P<5<<t5dD cgc]} t7|||tj8!c} |_ tjBjE|j(d||_#dtHfdtJffD]>\} } |jM| t%jN| jQd dd d d@ycc} wcc} wNrrrrFrrr(Trrrrrrrrrrrr))r:r;rhood_idxrrrrrr<rrrrrr time_projrr rrrrrrrr>rrrrr=cam_token_encoderrrrBrrrrrrAr$rBrrrrrrrrrCrrrDs r"r;zCameraPredictor_light.__init___sS   """4  " "&7 7DO  " "&7 7DO LLDQ 'kT__>\ab=bqu 7K81k2 ll5;;q!Q +LM#[1_S^aeapapSpwx  ]]299[++NPRPWPWPYZ]]{++yIRTRgRgh   !#q0#1:+y\ertsHsH1I0#"$ ++6&+\:]Kddl}|j|jddDcgc]%}||dz j|d|j'} }|jddDcgc]%}||dz j|d|j'} }t j | d} t j | d} t j | | gdj|} |jD] } | | | z} | ddddf} | j|d| jd} | j\} }}t j| ||jj| }| j}g}t|D]w}|j}|jt j |gj| d}|j#|}|j%|}| |z|z} |j&dddf| ddddfz| ddddf<|j)| } |j+| }|dd|jf}|d|jdf}|j-|j/|| z} ||z}| |zdz } t j0j2j5d t j6 5t9|j;d }||gz}dddz|| fScc}wcc}w#1swYxYw) rrNrrr(r')NN.Frrr)ipdb set_tracer$rrBr rrr&rjrrrr>rr%tensorrrrrrrrrrrrrfloat)rArrrrrrr)rrrrr&rrrrrrrpose_embed_timerrrrrs r"rLzCameraPredictor_light.forwards T^^%aeananopoqars\]/!A#.66z1dFVFVWssaeananopoqars\]/!A#.66z1dFVFVWss>q9>q999nn=1EHHO!%!7!7 > "3H"==H >AqrE?##JHNN24FG..1a Aq$//:==hG  ( e  GH)002M"nnU\\8*-E-H-H-RST^_O7J 3J*,>H!00A6!BQB$GHQrrTNzz(+H$$X.E"'->t->(>"? sDOO$556J))$))J*?@8KH),??M =0A5H((emm(L G6}7J7J7Lahi $5$F! G G' G,!(**Its@ G Gs*L#*L "LL c|dk(r tjjddS|dk(r tjjddStd|drrrs r"rz"CameraPredictor_light.get_backbonerr$rrc:||jz |jz SrFrrs r"rz-CameraPredictor_light._resnet_normalize_imagerr$ rrr8rrr8rNN r|r}r~r;r rrLrrrrrs@r"r!r!^sd,  ;dz)*4QU_ckpkyky.+` P<5<<\} } |jE| t#jF| jId dd d d@ycc} wr#)%r:r;rr$rrrrrr<rrrrrrrr rrrrrrrr>rrrrrrrrrrrr's r"r;zCameraPredictor_clean.__init__s   """4  " "&7 7DO  " "&7 7DOLLDQ 'kT__>\ab=bqu 7K8 ll5;;q!Q +LM#[1_S^aeapapSpwx  ]]299[++NPRPWPWPYZ]]{++yIRTRgRgh    ++6+\:]Krrrrrrrrrrrrrrr,)rArrrrrrrrrrrrrrrrrrrrrs r"rLzCameraPredictor_clean.forward s.),44Z_UWEXE^E^_aEbc(,44Z_UWEXE^E^_aEbc99nn=1EHHO..1a Aq$//:==hG  ( e  GH)002M7J 3J*,H!00A6!BQB$GHQrrTN"$**h7H$$X.E"'->t->(>"? sDOO$556J))$))J*?@8KH),??M =0A5H((emm(L G6}7J7J7Lahi $5$F! G G' G,!(** G Gs +"HH& c|dk(r tjjddS|dk(r tjjddStd|drrrs r"rz"CameraPredictor_clean.get_backbone3rr$rrc:||jz |jz SrFrrs r"rz-CameraPredictor_clean._resnet_normalize_image>rr$r0r1rs@r"r3r3sd,  ,d\)*4QU_ckpkyky&+P P<5<<)<logging collectionsr dataclassesrrmathnumpynpr torch.nnr<torch.nn.functional functionalrheinopsrrtypingrr r r r r r functoolsr models.blocksrmodulesrrrrrutil_vggrrrtorch.utils.checkpointr)pytorch3d.transforms.rotation_conversionsrrpytorch3d.transforms pytorch3d getLoggerr|loggerrrrr#r2Moduler4rr!r3r$r"rOs#(  $DDD&KKUU-`   8 $$ #  M M Mc"))cJA!biiA!Hz