o HQg @sddlZddlmZmZmZddlZddlZddlmZGdddej Z Gdddej Z de d e fd d Z dd e de d e defddZdejdejfddZ ddejdejdejdefddZdS)N)AnyOptionalTuple)nncs~eZdZdZ   ddededeeffdd Zd d Z e d d Z e Z e ddZe de jfddZZS)PositionEmbeddingSinez This is a more standard version of the position embedding, very similar to the one used by the Attention Is All You Need paper, generalized to work on images. 'TN temperature normalizescalecsnt|ddksJd|d|_||_||_|dur&|dur&td|dur/dtj}||_i|_ dS)NrzExpecting even model widthFz+normalize should be True if scale is passed) super__init__ num_pos_featsrr ValueErrormathpir cache)selfrrr r  __class__S/mnt/petrelfs/dingshuangrui/SAM2-Video-Predictor/sam2/modeling/position_encoding.pyr s    zPositionEmbeddingSine.__init__cCs*t|t|kr|j|jkrdksJJ||j}||j}tj|jtj|jd}|jd|d|j}|dddf|}|dddf|}tj |dddddf |dddddf fdd d}tj |dddddf |dddddf fdd d}||fS)Ndtypedevicer rdim) lenndimr torcharangerfloat32rrstacksincosflatten)rxyx_embedy_embeddim_tpos_xpos_yrrr _encode_xy*s$0  44z PositionEmbeddingSine._encode_xycCsB|||\}}tj|||dddf|dddffdd}|S)Nrr)r.r cat)rr'r(whr,r-posrrr encode_boxes=s.z"PositionEmbeddingSine.encode_boxesc Cs|j|j|j\}}\}}\}} ||kr!||kr!||kr!|| ks#J|||\} } | ||d| ||d} } tj| | |dddddffdd} | S)Nr r)shaper.r&reshaper r/) rr'r(labelsbxnxbynyblnlr,r-r2rrr encode_pointsEs "$&z#PositionEmbeddingSine.encode_pointsr'c Csn|jd|jdf}||jvr|j|d|jddddStjd|jddtj|jdddd|jdd|jd}tjd|jddtj|jdddd|jd|jdd}|jrd}||ddddddf||j }||ddddddf||j }tj|j tj|jd}|j d|d|j }|dddddddf|}|dddddddf|}tj |dddddddddf |dddddddddffdd d }tj |dddddddddf |dddddddddffdd d }tj||fd d dd dd} | d|j|<| S) Nr4rrrgư>r r)r5rrepeatr r!r"rviewr r rrr#r$r%r&r/permute) rr' cache_keyr*r)epsr+r,r-r2rrrforwardNs@    ((  LLzPositionEmbeddingSine.forward)rTN)__name__ __module__ __qualname____doc__intboolrfloatr r.r no_gradr3encoder>TensorrG __classcell__rrrrrs(  rcseZdZdZddedeeddffdd Zd ej dej fd d Z d e eefdej fd dZ dej de eefdej fddZ ZS)PositionEmbeddingRandomz? Positional encoding using random spatial frequencies. @Nrr returncs<t|dus |dkrd}|d|td|fdS)Ng?#positional_encoding_gaussian_matrixr )r r register_bufferr randn)rrr rrrr xs z PositionEmbeddingRandom.__init__coordscCsBd|d}||j}dtj|}tjt|t|gddS)z8Positionally encode points that are normalized to [0,1].r rr4r)rWnprr r/r$r%)rrZrrr _pe_encodings  z$PositionEmbeddingRandom._pe_encodingsizec Cs||\}}|jj}tj||f|tjd}|jddd}|jddd}||}||}|tj||gdd}|dddS)z>Generate positional encoding for a grid of the specified size.)rrrrg?rr4r ) rWrr onesr"cumsumr\r#rD) rr]r1r0rgridr*r)perrrrGszPositionEmbeddingRandom.forward coords_input image_sizecCsz|}|dddddf|d|dddddf<|dddddf|d|dddddf<||tjS)zs$z)reshape_for_broadcast..)rr5 enumeraterC)rvr'r5rrzrreshape_for_broadcasts  r}Fxqxkrepeat_freqs_kc CsBt|jg|jddddR}|jddkr3t|jg|jddddRnd}t||}t||d}|durS|| |j |fS|r|jd|jd}|j ru|j gdg|j d|dR}n|ddd|dddd}t||d}|| |j || |j fS)Nr4r r?rrAr)r view_as_complexrNr6r5r} view_as_realr&type_asreris_cudarBr unsqueezeexpand) r~rrvrZxq_Zxk_Zxq_outrZxk_outrrrapply_rotary_encs ,, & $r)rp)F)rtypingrrrnumpyr[r rModulerrSrLrorNrurQr}rMrrrrrs( c4