0g! ddlZddlmZmZmZddlZddlZddlmZGddej Z Gddej Z de d e fd Z dd e de d e d efdZdejdejfdZ ddejdejdejdefdZdS)N)AnyOptionalTuple)nnceZdZdZ ddededeeffd Zd Z e j d Z e Z e j d Ze j d e jfd ZxZS)PositionEmbeddingSinez This is a more standard version of the position embedding, very similar to the one used by the Attention Is All You Need paper, generalized to work on images. 'TN temperature normalizescalect|dzdks Jd|dz|_||_||_||durt d|dt jz}||_i|_ dS)NrzExpecting even model widthFz+normalize should be True if scale is passed) super__init__ num_pos_featsr r ValueErrormathpir cache)selfrr r r __class__s CC:\codes\sam2\segment-anything-2\sam2\modeling\position_encoding.pyrzPositionEmbeddingSine.__init__s q A%%%'C%%%*a/&"  e!3!3JKK K =KE  ct|t|kr|j|jcxkrdksnJ||jz}||jz}tj|jtj|j}|jd|dzz|jz z}|dddf|z }|dddf|z }tj |dddddf |dddddf fd d}tj |dddddf |dddddf fd d}||fS)Ndtypedevicerrdim) lenndimr torcharangerfloat32rr stacksincosflatten)rxyx_embedy_embeddim_tpos_xpos_ys r _encode_xyz PositionEmbeddingSine._encode_xy*s1vvQAFaf$9$9$9$9$9$9$9$9$9$9dj.dj. T/u}QXVVV Q%1*%58J%JK4 5(4 5( 111add7^   ! !5ADqD>#5#5#7#7 8a   '!**  111add7^   ! !5ADqD>#5#5#7#7 8a   '!** e|rc|||\}}tj|||dddf|dddffd}|S)Nrr)r1r#cat)rr*r+whr/r0poss r encode_boxesz"PositionEmbeddingSine.encode_boxes=sSq!,, uiqDz1QQQW:>AFFF rc|j|j|jc\}}\}}\}} ||kr||kr ||kr|| ksJ|||\} } | ||d| ||d} } t j| | |dddddffd} | S)Nrr)shaper1r)reshaper#r3) rr*r+labelsbxnxbynyblnlr/r0r6s r encode_pointsz#PositionEmbeddingSine.encode_pointsEs'(w$R(2rHRRxxB"HHrbBhhhhqyy{{AIIKK@@ u}}RR00%--B2K2KuivaaaDj'9:BBB rr*c @|jd|jdf}||jvr4|j|d|jddddStjd|jddztj|jddd|jdd|jd}tjd|jddztj|jddd|jd|jdd}|jr@d}||ddddddf|zz |j z}||ddddddf|zz |j z}tj|j tj|j}|j d|dzz|j z z}|dddddddf|z }|dddddddf|z }tj |dddddddddf |dddddddddffd d }tj |dddddddddf |dddddddddffd d }tj||fd dd dd} | d|j|<| S) Nr9rrrgư>rr)r:rrepeatr#r$r%rviewr r rr r&r'r(r)r3permute) rr* cache_keyr-r,epsr.r/r0r6s rforwardzPositionEmbeddingSine.forwardNs7WR[!'"+.  " ":i(.55agaj!QJJ J LAGBK!O5= R R R T!R^^ VAGAJ172; / /  LAGBK!O5= R R R T!Q^^ VAGAJ Q / /  > HCBCC!3c!9:TZGGAAArss!3c!9:TZGG T/u}QXVVV Q%1*%58J%JK111aaa &.111aaa &. 111aaaADqD= ! % % ' 'qqq!!!QQQ1})=)A)A)C)C D!   '!**  111aaaADqD= ! % % ' 'qqq!!!QQQ1})=)A)A)C)C D!   '!** iA...66q!QBB #A 9 r)r TN)__name__ __module__ __qualname____doc__intboolrfloatrr1r#no_gradr7encoderCTensorrM __classcell__rs@rrrs!!%    (&U]___ FU]___U]__!!!!_!!!!!rrceZdZdZddedeeddffd Zdej dej fd Z d e eefdej fd Z d ej d e eefdej fdZ xZS)PositionEmbeddingRandomz? Positional encoding using random spatial frequencies. @Nrr returnct||dkrd}|d|tjd|fzdS)Ng?#positional_encoding_gaussian_matrixr)rrregister_bufferr#randn)rrr rs rrz PositionEmbeddingRandom.__init__xsg  =ESLLE  1 EKM 233 3     rcoordscd|zdz }||jz}dtjz|z}tjtj|tj|gdS)z8Positionally encode points that are normalized to [0,1].rrr9r)r`nprr#r3r'r()rrcs r _pe_encodingz$PositionEmbeddingRandom._pe_encodings\Va$BBRUV#y%)F++UYv->->?RHHHHrsizecf|\}}|jj}tj||f|tj}|ddz }|ddz }||z }||z }|tj||gd}|dddS)z>Generate positional encoding for a grid of the specified size.)rrrrg?rr9r) r`rr#onesr%cumsumrfr&rJ) rrgr5r4rgridr-r,pes rrMzPositionEmbeddingRandom.forwards1>Ez1a&u}EEE++!+$$s*++!+$$s*A+A+   u{GW+=2FFF G Gzz!Q"""r coords_input image_sizec|}|dddddf|dz |dddddf<|dddddf|dz |dddddf<||tjS)zz)reshape_for_broadcast..s- F F F41a!tax--QQQ F F Fr)r"r: enumeraterI)rr*r:r"s @rreshape_for_broadcastrs{ 6D ====D====== ?qwr{AGBK8 8 8 8 8 F F F F9QW3E3E F F FE 9>5 !!rFxqxkrepeat_freqs_kctj|jg|jddddR}|jddkrBtj|jg|jddddRnd}t ||}tj||zd}|/|| |j |fS|r|jd|jdz}|j r|j gdg|j dz z|dR}n@|ddd|dddd}tj||zd}|| |j || |j fS)Nr9rrErrGr)r#view_as_complexrTr;r:r view_as_realr)type_asrqris_cudarHr" unsqueezeexpand) rrrrxq_xk_xq_outrxk_outs rapply_rotary_encrs   2 2 IBHSbSM I2 Iq I I I J JC 8B<1   0bhhjj0G"(3B3-GGQGGGHHH  &i55I  i 0 0 8 8 ; ;F {~~b!!$$RY//33W IbMSYr] *   W( (MA3).12D+EMM1MMMII"++A..55b"aRHHPPQRTUVVI  i 0 0 8 8 ; ;F >>"    + +V^^B-?-?-B-B29-M-M MMr)r})F)rtypingrrrnumpyrer#rModulerr[rRr|rTrrWrrSrrrrrs '''''''''' `````BI```F+9+9+9+9+9bi+9+9+9hS 9 93 9s 93 9u 9 9 9 9"U\"el""""! NN N N|N NNNNNNr