YhPbddlZddlZddlmZmZmZmZmZddlm Z ddl Z ddl Z ddl m Z ddl m ZddlmZmZmZddlmZmZddlmZmZd d lmZmZe j8Zej:ZeZd Z d e jBd e"de"de jBfdZ#Gdde jHZ% dEde jHde jBde jBde jBdee jBde&de&fdZ'dFdZ(Gdde jHZ)Gdde jHZ*d Z+dGd!e"fd"Z,dHd#e jBd$e"d!e"d%e"fd&Z-dId'Z. dJd(Z/d)e jBd*e"de jBfd+Z0Gd,d-e jHZ1Gd.d/e jHZ2Gd0d1e jHZ3Gd2d3e jHZ4Gd4d5e jHZ5d6e jBd7e jld8e"d9e7d:e"de jBf d;Z8Gd<d=eZ9Gd>d?e9Z:Gd@dAe9Z;dBe"d9e7fdCZ T:#5x @r#c veZdZ d dedededeedef fd Zdejdejfd Z xZ S) BLTMLPr& hidden_dim multiple_offfn_dim_multipliermp_sizec^t|td|zdz }|t||z}|||zdz |zz}||zdk(sJ||_||_t j ||d|_t j ||d|_t j ||d|_ y)Nr)rrrFbias) super__init__intr&r4nnLinear gate_projup_proj down_proj)selfr&r4r5r6r7 __class__s r!r<zBLTMLP.__init__2s Z!+,  )/*<=J Z+%=%Ak$QR G#q((($    yy       r#r$r'c|j|j|}|j|j|}|jt j ||z}|SN)r@view_asrArBrsilu)rCr$x1x3outputs r!forwardzBLTMLP.forwardUsM ^^AIIaL ) \\!))A, 'r R0 r#)r) __name__ __module__ __qualname__r=rrr<torchTensorrL __classcell__rDs@r!r3r31s] ! ! !  ! %UO !  ! F%,,r#r3modulequerykeyvalueattention_maskscalingdropoutc Tt||j}t||j} tj||j dd|z} |#|ddddddd|j df} | | z} t jj| dtjj|j} t jj| ||j} tj| | } | j ddj} | | fS)Nr)rrr)r&dtypeptrainingr)r1num_key_value_groupsrPmatmul transposer*r>r softmaxfloat32tor\rZr_ contiguous) rTrUrVrWrXrYrZr key_states value_states attn_weights causal_mask attn_outputs r!eager_attention_forwardrl\s3 ; ; > HE >EKKZ0b 9 > > HE == U]]1- --r#c eZdZdedeffd Z d dejdejdejdedef d Z xZ S) BLTSelfAttentionconfig layer_idxcxt|||_|j|_|j |_|j |_|j|_|j |jz|_|j|jz|_ |jdz|_ |j|_ ||_ tj|j |j|jzd|_tj|j |j|jzd|_tj|j |j|jzd|_tj|j|jz|j d|_y)NFr9)r;r<rnum_attention_heads num_headsrZ hidden_sizenum_key_value_headsr0r`rY rope_thetarr>r?q_projk_projv_projo_proj)rCrrrDs r!r<zBLTSelfAttention.__init__s>  33~~ !--#)#=#= **dnn< $(NNd6N6N$N!}},  ++"ii 0 0$..4==2PW\] ii 0 0$2J2JT]]2Zafg ii 0 0$2J2JT]]2Zafg ii >@P@PW\] r# hidden_statesrXposition_embeddingsoutput_attentions use_cachec L|j\} } } |j|} |j|} |j|}| j | | |j |j jdd} | j | | |j|j jdd} |j | | |j|j jdd}|\}}t| | ||\} } |'|||d}|j| ||j|\} }t}d}d|j_d|_|jjdk7rN|jjdk(r|rt j#dnt$|jj}||| | ||f|j&sdn |j(|jd |\}}|j+| | d j-}|j/|}|sd}|||fS) Nrr))rvrucache_positionFsdpaeager`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rZrYr)sizerrrrprr0rbrrupdaterrlr_attn_implementationrYlogger warning_oncerr_rZr,rfr)rCrrXrrrpast_key_valuerr bszq_len_ query_statesrgrhrurv cache_kwargsattention_interfacerkris r!rLzBLTSelfAttention.forwards&**, UA{{=1 [[/ {{=1 #((eT^^T]]S]]^_abc __S%1I1I4==Yccdeghi #((eT5M5Mt}}]gghiklm &S#7 jRUWZ#[ j  %#&snUL'5'<'O##L '>dkk>^>^&_#$7      % $}}C$,,LL %  % ! \"))#ub9DDF kk+.  LL.88r#)FFNN) rMrNrOrr=r<rPrQboolrLrRrSs@r!rrsh^y^S^.',#<9 <<<9"LL<9"' <9 $ <9  <9r#rceZdZd fd Z d dej deedeej deej deej deedeejd ej fd Z xZ S) BLTTransformerLayerct ||}|}|j}|j}|j}|j }|j } |j} |xs||z|_|xs||z|_|xs |j|_||_ t|||_ t|d|z|| |_ t|| |_t|| |_y)N)rr)r&r4r5r6eps)r;r<r0r/rr5r6norm_epsn_headsrr self_attnr3mlpRMSNorminput_layernormpost_attention_layernorm) rCr&rrrr0r/rr5r6rrDs r!r<zBLTTransformerLayer.__init__s ??&& && (( #66?? 2C7N 1#/ $4  )9M3w#1   's9(/(B%r#rrrrXlayer_head_maskrrr'c |}|j|} |j| ||||||\}} } ||z}|j|} ||j| z}|S)N)rrrXrrrr)rrrr) rCrrrrXrrrresidualnorm_hidden_statesself_attn_weightspresent_key_valuenormalized_hidden_statess r!rLzBLTTransformerLayer.forward s!!11-@?Cnn,))+/) 3?M ? ; (*;!=0 #'#@#@#O %1I(JJ r#r)NNNNFN) rMrNrOr<rPrQrr LongTensorrLrRrSs@r!rrsCB*.6:1526,159||!&ell3  !. "%,,/$D>!!1!12 r#rc|dk(}tjtj|jddtj|j |ddddfgd}|dk7|z}|j S)Nrrr\devicerrn)rPcatzerosr*rrany)tensor zero_mask shifted_masknon_zero_after_zeros r!check_non_zero_after_zeror*su! I99 KK Q%**V]] S a"f    L"Q;,6  " " $$r# hash_func_nbc&gd}tj||tj|j}tjt |j dDcgc]}||z c}}tj||zdScc}w)N) iʚ;l21Ai oYlvtl.l}glAul 0lTlAK l|rrrn)rPrint64rrqranger*sum) token_tensorrprimesprimei prime_powerss r!rolling_polynomial_hashr6s{ F LL -U[[I\I\ ]E;;% 8J8J28N2OP2OQq2OPQL 99\L0b 99 Qs! B token_ids group_sizemax_hashcftj5|j\}}tj||dz tj|j }tj ||gd}|jd|d}t||}||z} dddd _ | S#1swYxYw)ah Returns a hash of the input token_ids and maps it to a value in the range [0, max_hash]. expects: token_ids of shape (batch_size, seq_len) with values as ids in the token vocab. returns a tensor of shape (batch_size, seq_len) with values in the range [0, max_hash]. Note: max hash can make a big difference on the number of collisions. rrrnNF) rPno_gradr*rrrrunfoldr requires_grad) rrrrr-seq_lenprefixwindowshasheshash_values_ranges r!byte_group_hash_functionrIs 'oo GZau{{S\ScScdIIvy1q9 ""1j!4(,?"X- ',#  s BB''B0c |j\}}|ss|jdj|||}tj||j jdjdj|||}nr|jdj|||}tj||j jdjdj|||}|||k(}|S||k|||zkz}|S)ab Creates a tensor of shape [batch_size, seq_len, num_patches] where each element at position (i, j, k) is True if the patch id at position (i, j) is less than or equal to k. Args: patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids. num_patches (int): Total number of patches. window (int): If not None, only considers patches within a window of size window. patches_as_queries (bool): If True, the patches are used as queries Returns: torch.Tensor: Tensor of shape [batch_size, q_len, kv_len] with the desired mask. rrrr)r*ror+rParanger) patch_ids num_patcheswindowpatches_as_queriesr-rq_idskv_idsmasks r!create_patch_mask_from_idsr^s$//J ##B'..z7KP LLY-=-= > Yq\ Yq\ VJ 5 $$Q'..z;P LLY-=-= > Yq\ Yr] VJ W 5  ~ K%EFVO$;< Kr#c z |jd}tj5t||jd||j ||rdnd |r|jd|zn|}|r|n|jd|z} j||| fk(sJ jd||| fd}|r! fd} t | |d|| d }|cdddStj tjd tjtd jdcdddS#1swYyxYw) Nrr)rrrrnz != c|||fSrF)brq_idxkv_idx cross_masks r! patch_maskz#cross_attn_mask..patch_masks!!UF"233r#T)BHQ_LENKV_LEN_compilerz-inf) r*rPrrrepeat_interleaver whererrro) r patch_lengthsNr cross_attn_kr block_maskr-rkv_lenrrs @r!cross_attn_maskrsS#J /     "1   L3Ea2  N  :L ##A&5QR(m.A.A!.D|.S   $   B tZ$?#@ A  B   4+ J; >;;z5<<+YZdd? sB$D1AD11D:rmax_patch_lengthc||S|jd}g}d}|D]q}g}||dkDD]<}t|j|\}} |j|g|z| r| gngz>|j |t |t |}stj||f|j|j} t|D]D\} }|s tj||j|j| | dt |f<F| dk7jdjjdj!} | | j"dkr| ddd| j"d| z f} | S)Nrrr)rdivmoditemextendappendmaxlenrPrr\r enumeraterflipr=argmaxminr*) rrr- split_allmax_lenseqsplitslengthfullrempaddedr last_non_zeros r!process_patch_lengthsrsv##A&JIG#'lFv{{}.>?ID# MM+,t3uL M#  gs6{+[[*g.m6I6IR_RfRf gFy) 6 &+ll6ATAT]j]q]q&rF1ls6{l? #* q[&&q)--/66q9==?Mv||A&;FLLOm;;;< Mr#c^eZdZddeffd Zej edZxZ S)BLTRotaryEmbeddingrc^t||jd|_|j|_|j|_||_t|j|_ |j|j|\}|_ |jd|d|j|_ y)N rope_typeinv_freqF) persistent)r;r< rope_scalingr!max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrr rope_init_fnattention_scalingregister_bufferr"original_inv_freq)rCrrr"rDs r!r<zBLTRotaryEmbedding.__init__s ,,[9"("@"@$*$B$B! /?+/+<+>@GG HZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk ^^ U C&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C D vvAGGv$cff177f&;;; D Cs BF  FrF) rMrNrOrr<rPrrrLrRrSs@r!rrs3 /y /U]]_ < |j |jz|j|_ d|_!|jr|jtDjj!|_!|jr |jnd}t#|D]3}|jBjGtI|||j5yyycc}w)NrFr9rrrr)%r;r<rZdim_local_encodern_layers_local_encodern_heads_local_encoder vocab_sizepm_sizecross_attn_encodercross_attn_nheadscross_attn_all_layers_encodercross_attn_init_by_poolingrrsliding_windowr> ModuleListrrlayersr0max_encoder_seq_length max_seqlenr%r rotary_embencoder_dim_token_embr?token_embedding_projection_create_patch_projectionpatch_embedding_projection Embedding embed_tokenscross_attn_layersrPr BLTCrossAttention)rCrrencoder_config layers_to_addrrDs r!r<zBLTLocalEncoder.__init__s ~~ !'!9!9&,&C&C#%+%A%A" ++~~ "(";";!'!9!9-3-Q-Q**0*K*K'"// $33mmDEIE`E`a%bayz%89O9OQUQkQkms%ta%bc  "&"8"8Dr rZr_rPrrror+rLrrrIrCrrE patch_reducerPr,rr>rS)rCrWrXrYrrrrrZr-sequence_lengthrrrwridxlayerrcross_attention_outputencoder_cross_statess r!rLzBLTLocalEncoder.forwards'0oo# O  ,,Y7L'-- Aq$  --mt||VZVcVc-d ||IOOA$6|?R?RS]]^_`gghrtvw "oom\J -4<<$--X #DKK0JC!-EXimnM&&C3t{{3Ca3G,G4KmKm'#'#4#4]KQWYb#cL66B'+'F'F|'T '3';';J HZHZ[\H]`d`q`qHqswtJtJ(K #'#E#EC1 /Pt/E/Ei/P".+8#-&+##' 0,&1 ,.DD )1,04/F/F|D34e;;r#c@|jduxr|j|jk7}|jxr |jxs|jxr |j}|s|sy|j |j z}tj|j|dSNF) in_features out_featuresr:) encoder_dim_patch_embr>rCrFcross_attn_decoderrMrr>r?rCrdimension_mismatchcross_attn_conditions output_dims r!rOz(BLTLocalEncoder._create_patch_projectionPs#99EC&JfJfjpkCkCKC!'!:!:!`v?`?`!  % % K&*K*K #&;11F4G4GG yy44#  r#c*|j\}}}|jdjdd|jd}tj|||f|j |j }|j|d||d}|ddd|ddf}|S)a Reduce variable length patches to single embedding per patch Note: this works with variable number of patches for different sequences in the batch It handles variable length patches by assuming that patch_lengths will be 0 for any extra patches on the *right*. Since there can be a variable number of patches this function also return the number of patches for each sequence in the batch. Any embeddings on the right that are not allocated to a patch (i.e. if the sum(patch_lengths[i]) < seq_len for any i) will be sent to a dummy patch, which is trimmed before returning. rrrF)srcr&indexreduce include_selfN)r*ror+rPrr\rscatter_reduce) rCrmax_num_patches reductionrr-r embedding_dimreduced_embeddingss r!razBLTLocalEncoder.patch_reducebs.;-@-@* G]''+222r=;N;Nr;RS "[[*o})U]j]p]pzGzNzNO/>> ? 03CO3CQ0FG!!r#)NNNNNNN)rMrNrOrr<rPrQrrr3r=rrrLrOrarRrSs@r!r:r:s,y,b04/3@D-1%),0HL2<<<2<u||,2<u||, 2< u[%,,;<= 2< U\\* 2<c]2<ELL)2<U5<<s#BCDE2jCtE|||j5tj2|j|j d|_#ycc}w)Nr<Fr9rrr=)$r;r<dim_local_decodern_heads_local_decodern_layers_local_decoderrArrZrlrDcross_attn_all_layers_decoderrrGr>rHrrrIr0rJrKr%rrLdecoder_dim_token_embr?rNrOrPrnormrSrPr rTlm_head)rCrrdecoder_configrVrrDs r!r<zBLTLocalDecoder.__init__sv "(!9!9%+%A%A"&,&C&C# ++ ~~ "(";";!'!9!9-3-Q-Q*"//$33mmDEIE`E`a%bayz%89O9OQUQkQkms%ta%bc "&"8"8Dr?rms r!rOz(BLTLocalDecoder._create_patch_projections#..d:lv?P?PTZTlTl?l"(!:!:!`v?`?`!  % % K&*K*K #&;11F4G4GG yy))#  r#tokensembedsrYrr rrZc ~|j\}}|j\}} } |Jd|} |j]|Jd|j|}|j7|j||jd|jz|j}||j s| |z} t j|jd|jjdj|d} |j| | } tj| |j|j} t|j D]Q\}}|j r5|dk(s |j"r$|j$|| ||ddd \}} } | |z} || | d } S|j'|j)| }||fS) NzEmbeddings must be providedz Patch embeddings must be passed.rrrrr]Fr_r])r*rPrr,r~rlrPrrror+rLrrZr_rrIrrSrr)rCrrrYrrrZr-rb seq_lengthrrrwrrrdrelogitss r!rLzBLTLocalDecoder.forwards'-ll# O$*LL! J!@#@@!  * * 6+ O-O O+::<HL  ,+33J @R@RST@UX\XiXi@ikolBlB C  #D,C,C)L8M||FLLOFMMJTTUVW^^_ikmn "oom\J -4<<$--X !$++.HAu&&AFd6X6X/Ht/E/Ea/H"/+7#-&+##' 0,&1!.0F F !-EXimnM/dii 67u}r#NNNN)rMrNrOrr<rOrPrQrrr3rrr=rLrRrSs@r!r|r|~s0 y0 d .04@D-1HL- -&-u||, - u[%,,;<= - U\\* -U5<<s#BCDE-r#r|c6eZdZdZddededeeffd Z ddejdeejdee d eejd e d ee d eejd e ejeejee ejffdZxZS)rTzrrDrrr0r`rYrZr>r?rrrrrrq_normk_norm)rCrrrrDs r!r<zBLTCrossAttention.__init__se  "&B&*B*B11#)#;#; ((DNN: $(NNd6N6N$N! ~~ ii 0 0$..4==2PW\] ii 0 0$2J2JT]]2Zafg ii 0 0$2J2JT]]2Zafg ii >@P@PW\] jj!1!1vG jj!1!1vG r#rr`rrXrrrr'c |j\} } } |j|} |j| } |Y|j|}|j |} |j |}|n|j | ||jd|i\} }nJ|;|ddk7r3|j|j|j|j}} n | tdt}|jjdk7rN|jjdk(r|rtjdnt |jj}| j#| | |j$|j&j)dd }  j#| d |j*|j&j)dd } j#| d |j*|j&j)dd }||| | ||fd |j,d |\}}|j/| | d j1}|j3|}||z}|sd}|||fS) z#Input shape: Batch x Time x ChannelNrrzcCross attention layer can't find neither `cross_attention_states` nor cached values for key/values!rrrrr)rrr)rrrrrrrr key_cache value_cache ValueErrorrlrrrrrrprr0rbrrYr,rfr)rCrr`rrXrrrr rrrrrgrhrrkris r!rLzBLTCrossAttention.forward sY&**, UA{{=1 {{<0 ! -%)[[1G%H "%;L),:+@+@ dnn?OQ_>`,( L 'N1,=,B((8**4>>:%J &- y)@ ;; + +w 6{{//69>O##L '>dkk>^>^&_##((eT^^T]]S]]^_abc __S"d.F.F V``abdef #((b$2J2JDMMZddefhij $7      % LL %  % ! \"))#ub9DDF kk+. !M1  LL.88r#rF)NNNFNN)rMrNrO__doc__rr=rr<rPrQr rrrrLrRrSs@r!rTrTsFHyHSHxPS}H0:>*.15"'$(59H9||H9!) 6H9! H9 !. H9  H9D>H9!!1!12H9 u||Xell3XeELL>Q5RR SH9r#rTceZdZfdZ ddej deej deej deeeej e fdee e ej ej e ff dZ xZS) BLTGlobalTransformerct||j|_|j|_|j|_|j |_t j|_|j}|j|_ t|jD]<}|jjt|j|j|>||_ |}|j|jz|_t||_d|_|j$L|j$|jk7r2t j&|j$|jd|_yyy)Nr<Fr9)r;r<rn_heads_globaln_layers_globalrZr>rHrIr/n_kv_heads_globalrr rr0rrLrNglobal_dim_patch_embr?)rCroldr global_configrDs r!r<zBLTGlobalTransformer.__init__Xs+ !++$33%55~~ mmo "44t++,A KK  24??DDWDWY_` a- !%D4G4G!G ,MB*.'  & & 2v7R7RVZVeVe7e.0ii++/D +8f 2r#rWtok_idxrXrrZc|j\}}}|} |j-| jd|jk7r|j| } tj| |j|j } t j|jd|jjdj|d} |j| | } t|jD]\} } | | | d} | |fS)Nrr]rrrr])r*rNrrrZr_rPrrror+rLrrI)rCrWrrXrrZr-rrrrwrrrds r!rLzBLTGlobalTransformer.forwardus%1$6$6! J$  * * 6=;N;Nr;RVZVeVe;e ;;MJM -4<<$--X ||IOOA$6|?R?RS]]^_`gghrtvw "oom\J!$++.HAu!-EXimnM/e##r#r)rMrNrOr<rPrQrrr r3rrr=rLrRrSs@r!rrWs@+//3>BHL $<<$%,,'$u||, $ uY c9:; $ U5<<s#BCDE $r#rlocal_encoder_tokensencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsencoder_hash_byte_group_sizeencoder_hash_byte_group_vocabc|y|j|}d}t|D]-}|D]&} t|| ||} ||} || | z}|dz }(/|t|k(sJ|S)a Compute embeddings using hash token embeddings. Args: local_encoder_tokens: Input tokens tensor local_encoder: Encoder object with embed_tokens method encoder_hash_tok_embedding: ModuleList of hash token embeddings encoder_hash_byte_group_nb_functions: Number of hash functions encoder_hash_byte_group_size: List of byte group sizes encoder_hash_byte_group_vocab: Vocabulary size for hash embeddings Returns: torch.Tensor: Combined embeddings Nr)rrr)rRrrr) r local_encoderrrrrlocal_encoder_embedsrfunc_nbbyte_group_sizehash_idshash_tok_embeddings r!compute_hash_embeddingsrs,")(556JK A=>;O/$$6 H " #7:LX:V#V FA <? ./ // / r#c8eZdZeZdZdZgdZdgZdZ dZ dZ dZ y)BLTPreTrainedModelmodelT)rr:r|rpast_key_valuesFct|tjrt|d|jdz}tj j |jd|d|zd|z|j*tj j|jyyt|tjrOt|d|jdz}tj j |jd|d|zd|zyt|tr?|j2|jjdz}|jD] }||_ yyt|t"rj|j$(|jjdz|j$_|j&)|jj(dz|j&_yyt|t*rj|j$(|jj,dz|j$_|j&)|jj.dz|j&_yyt|t0r,|j$|j2dz|j$_yyt|t4r<|jj6dz}||j8_||j:_yy)N _custom_stdrrr)meanstdar)r1r>r?getattrriinit trunc_normal_weightr:zeros_rQryBLTModelrrr>rr:rNrPrkr|r~rr dim_token_emb BLTPatcherr&rRr)rCrTremb_stdr8s r! _init_weightsz BLTPreTrainedModel._init_weightssd fbii (&-1C1C1MNC GG ! ! s(c' " {{&v{{+' -&-1E1E$1OPC GG ! ! s(c' "  )00< --99dC!<rrrpatchereval parametersr)rCrparamrDs r!r<zBLTModel.__init__sx  !' 7 7#11 ++"(";"; & 7 7#)#=#= %55"(";";"(";";"//)/)I)I&)/)I)I&-3-Q-Q*mm "//,V4"6v">,V4*> $66)/)L)L+ '  %f-DL LL   002&+#3 DLr#rrc |j\}}||}}||jtjk(rO|j ||j d|j |j|j|j\}}}nX|j\}} | dz} tj|| f|j|j}t||j}|j||jd} d} |j r0t#| ||d|j$|j&|j(} t+||j,|j.|j0j2|j0j4|j0j6} |j-|| d| |jd| \\}}}|!|j9||jdd}n+|j,j;||jdd | }|j=|jd |jdj?|j@}tjB||jDk(\}}| ||f}|jD|||f<|jG|| \}}|}|j|ddddf|jd}|jHsGtjJ|d|jMdjOdd|jd}d}n2|}t#|||d |j$|jP|j(}|jS|||| \}}|S)NT)rinclude_next_token thresholdrrrrrr)rrrr)rrrrrr)rWrXrYrrrr^r)rXrWF)rrYrr)*r*rrentropyrrrrrrrPonesr\rr_patch_ids_from_lengthsrCrrrrrrrrrrrrpranewfill_rrrrrlgatherror+rr)rCrrr-rbrlocal_decoder_tokensrbatch_size_tokensrseq_len_next_tokrcross_attn_mask_encrencoder_hidden_statesrf cache_encoderglobal_hidden_states global_tokensrowscols eos_patch_idsdecoder_embedsdecoder_patch_idspatch_hidden_statescross_attn_mask_decrKs r!rLzBLTModel.forward#s'-ll# O5;V2  !!%5%=%==&*ll(#'+"55%)%:%:(,(@(@//'3'#=!.B-G-G*!7#*Q; % &(89AUA[A[dxdd! !6mTEZEZ [ 00@T@Z@Z[]@^_ #  " ""1#'!..55==#  7!5,,'+'F'F151a1a)-)Q)Q*.++*S*S  HLGYGY*-*%++A. HZH D5  4} +#7#<#-> dM)*"&"9"9-##:# a/!88q!"u9MOcOiOijlOmn&&"',,/CQHYHcHcdfHgHnHnoqsuxLxRxRSUxVIW#X "& #7 "1!#(!..55==# &&!,'* '   r#rr'c|j\}}tjtj|d|j|j |j dddddfgd}tj||j }|jd|jdjdk}|jddz }|S)u Convert patch lengths to patch IDs for each token position. For each token position in the sequence, determines which patch it belongs to. Args: patch_lengths: [batch_size, num_patches] - length of each patch seq_len: total sequence length Returns: patch_ids: [batch_size, seq_len] - patch index for each token position Example: patch_lengths = [[3, 2, 4, 1]] # 4 patches of lengths 3,2,4,1 seq_len = 10 Returns: [[0, 0, 0, 1, 1, 2, 2, 2, 2, 3]] # pos 0-2→patch 0, pos 3-4→patch 1, pos 5-8→patch 2, pos 9→patch 3 rrrrnNrr) r*rPrrr\rcumsumrror) rCrrr-r patch_startstoken_positionsposition_ge_patch_startrs r!rz BLTModel._patch_ids_from_lengthss$#0"5"5 Kyy J1D1D]MaMab$$$,QV4    ,,w}7K7KL#/"8"8";?X?XYZ?[?e?efh?i"i,//B/7!; r#rF) rMrNrOrr<rPrQrrLr=rrRrSs@r!rrsb$ y$ R15J J  -JX(U\\(C(TYT`T`(r#rceZdZfdZ ddej deededee deededee fd Z e d Z e d Ze d Ze dd ZxZS)rc0t||jt|j|_t j|_t|jjD]Z}|jjt|jj|jj|j\tj j!|jj"|jj|_t'|jj|jj(|_t j,|jj|jj"d|_y)Nr<rFr9)r;r<patcher_configrrrLr>rHrIrn_layersr rr&rrPrQrArRrrrr?r)rCrrrDs r!r<zBLTPatcher.__init__s ../,DKK@mmo t{{++,A KK  24;;??DKKDWDWY]YdYde f-"HH..t{{/E/Et{{WDKKOO1E1EF yy KKOO KK " "  r# token_valuesrrrrrrcg}g} |jj} | |z} tj|j | } | D]} | | j | zz | z}tj || j| jd}tj| |fd} | jd| } || j|} | j\}}|j| }|}|j\}}}tj| jd|jjdj!|d}|j#||}t%|j&D]\}}|||d}|j)|j+|}|jd|jdd| j |z ddf}| j-||j/|}|j-|tj|dj|j}tj| dj|jdd}|j\}}|r|dzn|}|(|j1|||| }|j3||}n.tj4||f|j|j }t7||}|||fS) NF)r\rrrrnrrrr])rrr)rrKrPsplitrnumelrr\rrr,rer*rRrror+rLrrIrrr rfind_entropy_patch_start_idspatch_lengths_from_start_idsrr) rCrrrrrrr entropies predictions max_length batch_numelrrpad_sizepadr-rbrXrrrwrrrdrprediction_entropiesconcat_entropiesconcat_predictionsrpatch_start_idsrs r!rLzBLTPatcher.forwards  [[++  #66 \113[AE"ekkmj&@AZOH++hekk%,,^cdCIIucl2EMM"j1E!(+0++ 'J,,U3L(M+11 J1 << A|?R?RS]]^_`gghrtvwL"&//-"N %dkk25 %mI\mq r 3\\$))M":;F^^B R(89:TEKKMH>|?Q?QR"YY{:BBrQrH)rrr embeddingsemb_dimrrs r!rrsz **2JG$*$H$H! 6>> ?-A    1 .@ == $$r#)rrrr:r|r)r)Nrr)r)ri0u)NF)FrNT)>loggingostypingrrrrr cache_utilsr rPtorch.nnr>r r!torch.nn.attention.flex_attentionr r r modeling_rope_utilsrrmodeling_utilsrrconfiguration_bltrrr getLoggerrflex_attention_compr"rQr=r1Moduler3rrlrrrrrrrrrrr:r|rTrrHlistrrrrr__all__rr#r!r?s 99 $ZZKF **    $  c   )RYY)d% %||%\\%|| % !. %  %%6.DO9ryyO9b>"))>@ %::& #Y\lo*!P +\QVQ]Q]><<1>1B[![|u#un%%#'%2 r#