Kh46 UddlmZddlmZmZmZmZmZddlZddl m Z ddlm Z ddl m Z mZmZddlZddlZddlZddlZddlm Z ddlmZddlZddlmZd Zd Zeed <d Zeed <dZeed<dZeed<dZeed<dZ eed<dZ!eed<e jDZ"ejFZ$d dl%m&Z&m'Z'm(Z(ddl)m*Z*ddl+mZ,eZ-dZ.ddddde/de/dzdedzdej`dzd edzf d!Z1d"Z2d#ej`d$ed%ed&ej`fd'Z3 dbd%ed(ed)e4d*e5fd+Z6d,ej`d#ej`d-efd.Z7d/ej`d0ej`d-ed,ej`d&eej`ej`ff d1Z8Gd2d3ejjrZ:Gd4d5e jrZ;Gd6d7e jrZ<Gd8d9e jrZ=d:Z>d;Z?d<Z@dcd=efd>ZA ddd#ej`d?ed=ed@efdAZB dedBZC dfdCZDdej`dDe5dEej`dFedGef dHZEGdIdJe jrZFGdKdLeFZGGdMdNeFZHGdOdPe jrZIGdQdRe jrZJdSej`dTe jdUedVeLdWed&ej`f dXZMGdYdZe*ZNGd[d\eNZOGd]d^eNZPd_edVeLfd`ZQgdaZRy)g)Enum)AnyListOptionalTupleUnionN)model_validator)nn)create_block_mask BlockMaskflex_attention) functional) nullcontext BOS_IDEOS_IDPAD_IDBOE_IDBPE_IDOFFSET BYTE_UNITS) BLTConfigPatchingModeEnum InitStdFactor)PreTrainedModel)loggingc ||k\SN)bhq_idxkv_idxs \/fsx/ita_zaporozhets/transformers/src/transformers/models/blt_wip/modeling_blt_wip_backup.py causal_maskr+/s F?)eos_idtokenssliding_window attn_implattn_bias_typer-r.r/c|dk(r@ttjjdd}|dk(ry|dk(ryt d|dk(rt t dd||Std|d |d ) NsdpaBLT_SUPPRESS_ATTN_ERRORrcausalrzSDPA attention being used, which doesn't have specialized attention implementations for block_causal and local_block_causal attention. To suppress this error and run the model anyway, set the environment variable BLT_SUPPRESS_ATTN_ERROR=1r z Attention z with z sliding window not implemented)intosenvironget ValueErrorr r+NotImplementedError)seqlenr0r1r-r.r/r4s r*create_causal_maskr=3sF"%bjjnn5NPQ&R"S X % "a 'A  & & dD&&II! 6.)99X Y  r,c tjtj|jdj d|jdfi|S)N)end_dimr)Fnll_loss log_softmaxflattenfloat)predtargetkwargss r* cross_entropyrIOsM :: dll2l.446;r"   r,xn_repdimreturnc|dk(sJd|j\}}}}|dk(r|S|dddddddddfj|||||j||||z|S)z0torch.repeat_interleave(x, dim=2, repeats=n_rep)rzAOnly dim=2 is supported. Check the implementation for other dims.rN)shapeexpandreshape)rJrKrLbsslen n_kv_headshead_dims r* repeat_kvrVWsq !8XXX8%&WW"Bj( z !Q4  D*eX 6 T:-x 8r,endthetarope_use_fp32_in_outer_productcd|tjd|dd|dzj|z zz }tj||j}|r|j tj }tj ||j}|j|j}}tj|| ||fdjg|jddS)a Precompute the frequency tensor for complex exponentials (cis) with given dimensions. This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64 data type. Args: dim (int): Dimension of the frequency tensor. end (int): End index for precomputing frequencies. theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. Returns: torch.Tensor: Precomputed frequency tensor with complex exponentials. ?rrNdevicerrL) torcharangerEr]tofloat32outercossinstackviewsize)rLrWrXrYfreqstrdres r*precompute_freqs_cisrkds* 5U\\!S!4\qBHHJSPQ RE S.A% DD  KK5 ! ' ' )Eyy{EIIKC :5;;cT3," 5 : : OEJJL O! OQ OOr, freqs_cisseq_dimc||j}d|cxkr|ksJJ|j|j||jdddfk(s Jd|j|jft|jddDcgc]\}}||k(s||dz k(r|ndc}}ddgz}|j|Scc}}w) a  Reshape frequency tensor for broadcasting it with another tensor. This function reshapes the frequency tensor to have the same shape as the target tensor 'x' for the purpose of broadcasting the frequency tensor during element-wise operations. Args: freqs_cis (torch.Tensor): Frequency tensor to be reshaped. x (torch.Tensor): Target tensor for broadcasting compatibility. seq_dim (int): Sequence dimension index. Returns: torch.Tensor: Reshaped frequency tensor. rrzfreqs_cis vs x: Nr?rr)ndimrO enumeraterg)rlrJrmrpidrOs r*reshape_for_broadcastrts 66D  $    ??      7 9??AGG456 7 ?HPSQS >U >UdaQ'\Q$(]1>U  A E 9>>5 !! sB8xqxkc|jg|jddddd}|jg|jddddd}t|||j}||zj dj d}||zj dj d}|j ||j |fS)Nrrrr)rQrOrtrEsumrDtype_as)rurvrmrlxq_xk_xq_outxk_outs r*apply_rotary_embrs "** .bhhsm .R . .A .C "** .bhhsm .R . .A .C%3 egIo " "1 % - -a 0FIo " "1 % - -a 0F >>" v~~b1 11r,c teZdZdZ d dedededeffd ZdZ d de ed e e jfd Z xZ S) RotaryEmbeddingz RotaryEmbedding Module rXrU max_seqlenrYc t|||_||_||_||_|j dt||||j dy)NrlrLrWrXrYF) persistent)super__init__rXrUrrYregister_bufferrk)selfrXrUrrY __class__s r*rzRotaryEmbedding.__init__sb    $.L+   /3/R/R     r,ct|j|j|j|j|j d<y)Nr.)rkrUrrXrYrl)rs r*reset_parametersz RotaryEmbedding.reset_parameterss32 **+/+N+N  sr,r<tok_idxcn|duxs|du}|sJd||j|S||jd|Sy)a} Return freqs_cis corresponding to consecutive seqlen positions or the corresponding tok_idx positions Args: seqlen (int): Contiguous sequence length tok_idx (torch.Tensor[int]): Position indices of each token this overrides seqlen Returns: Tuple(torch.Tensor, torch.Tensor): Embedded input tensor and freqs_cis Nz(Should provide atleast seqlen or tok_idxr)rl)rr<rtests r*forwardzRotaryEmbedding.forwardsWd"<t(;???t  >>'* *  >>!F+ + r,)iF)NN)__name__ __module__ __qualname____doc__rEr6boolrrrr_Tensorr __classcell__rs@r*rrsi/4      )-  2 OS,sm,5=ell5K,r,rc eZdZdededededef fd Z ddejdejd eejd ee e e fd e d ejf d Z ddZ xZS) BLTAttentionrLrUn_headsrT rope_thetact|||_||_||_||_||_|j |j z|_tj|||zd|_ tj|||zd|_ tj|||zd|_ tj||z|d|_ y)NFbias)rrrLrUrrrTheads_per_groupr Linearwqwkwvwo)rrLrUrrTrrs r*rzBLTAttention.__init__s   $ $#||t>))  h   ))   !  ))   !  )) h   r,rJfreq_cisrmaskr0rMcv|j\}}}|j|j|} |j|j|} |j |j|} | j} | j |||j |j} | j |||j|j} | j |||j|j} t| | d|d|\} } t|dr |jj| | |\} } t| |jd} t| |jd} |dk(rX|t|t sJt#d| | | f\} } } t%| | | |} | j'ddj)} n|d k(rt#d | | | f\} } } |"t|t*t,j.fsJt|t*r|d k(nd }t|t,j.r|j1| j2nd}t5j6| | | || } | j'ddj)} nt9d|d| j;| }|j=|} | S)Nrrkv_cacherr^r c&|jddSNrr transposees r*z&BLTAttention.forward..6q{{1a'8r,) block_maskr3c&|jddSrrrs r*rz&BLTAttention.forward..;rr,r5F is_causal attn_maskzAttention implementation z not supported)rOrview_asrrrgrrUrTrhasattrrupdaterVr isinstancer mapflex_attention_compr contiguousstrr_rrar]rAscaled_dot_product_attentionr;rQr)rrJrrrr0bszseq_lenrLrurvxv output_shapeoutputroutput_reshapeds r*rzBLTAttention.forwardskGGWc WWQYYq\ " WWQYYq\ " WWQYYq\ "xx WWS'4<< ? WWS'4??DMM B WWS'4??DMM B!"b!Xa-@AB 4 $]]))"b':FB r4//Q 7 r4//Q 7 ( (<:dI#> >>82r2,GJBB(REF%%a+668F & 82r2,GJBB<:dS%,,4G#H HH.8s.C)I)3D%,,)G477299%TD33# F%%a+668F%+I;nE !..6) r,c d|xs|jdz|z }|j|j|jfD]6}tj j |jd|d|zd|z8tj j |jjd|d|zd|zyNrormeanstdar&) rLrrrr init trunc_normal_weightr)rinit_stdfactorws r*rzBLTAttention.reset_parametersRs<T 2f<''477DGG,A GG ! !x-h, " -  GGNN8m(l  r,NNr3Nr[)rrrr6rErr_rrrr rrrrrs@r*rrs& & &  &  &  & X+/04 : <<:,,:%,,' : uY^,- :  : :x r,rc ~eZdZ d dedededeedef fd Zdejdejfd Z d d Z xZ S) BLTMLPrL hidden_dim multiple_offfn_dim_multipliermp_sizec^t|td|zdz }|t||z}|||zdz |zz}||zdk(sJ||_||_t j ||d|_t j ||d|_t j ||d|_ y)NrrrrFr) rrr6rLrr rw1w3w2)rrLrrrrrs r*rzBLTMLP.__init__hs Z!+,  )/*<=J Z+%=%Ak$QR G#q((($))    ))    ))   r,rJrMc|j|j|}|j|j|}|jt j ||z}|Sr$)rrrrrAsilu)rrJx1x3rs r*rzBLTMLP.forwardsM WWQYYq\ " WWQYYq\ "b) r,c|xs|jdz|z }|xs|jdz|z }tjj |j j d|d|zd|ztjj |jj d|d|zd|ztjj |jj d|d|zd|zyr) rLrr rrrrrr)rrr in_init_std out_init_stds r*rzBLTMLP.reset_parameterss?488#5"? GDOO$=#G   GGNN;+o    GGNN<,    GGNN;+o  r,)rr) rrrr6rrErr_rrrrrs@r*rrgsb ! ! !  ! %UO !  ! F%,, r,rc eZdZfdZ d dej dej deej deeee fde dej f dZ d d Z xZ S) BLTTransformerLayerct ||d}|d}|d}|d}|d}|d}|d}|d} | |Jd |xs||z|_|xs||z|_|xs |j|_||jzd k(sJ||zd k(sJt ||j|j|j| |_t|d |z|| |_t|| |_ t|| |_ y)NrLrrUrTrrrnorm_epsz+Should specify at least head_dim or n_headsr)rLrUrrTrr)rLrrreps) rrrUrrTr attentionr feed_forwardRMSNormattention_normffn_norm) rargsrLrrUrTrrrrrs r*rzBLTTransformerLayer.__init__s> 5ky/ #,' ,' =) !"67 #$   9 8 9 !2C7N 1#/ $4 (A---W}!!!%]]LL!  #3w#1   &cx82 r,rJrrrr0rMc|j|}|j|||||}||z}|j|} ||j| z} | S)Nrrr0)rrrr) rrJrrrr0norm_xattn_outr'h_normouts r*rzBLTTransformerLayer.forwardsi$$Q'>>   "  Lq!$##F++ r,c|jj|||jj|jj|||jjyr$)rrrrr)rrrs r* init_weightsz BLTTransformerLayer.init_weightssN ''&9 ,,. **8V< &&(r,rr) rrrrr_rrrr rrrrrs@r*rrs{%3V+/04  <<,,%,,'  uY^,-    *)r,rc,||g|t|z zzSr$)len)seqpad_idmax_lens r*rightpadrs &Ws3x/0 00r,c|dk(}tjtj|jddtj|j |ddddfgd}|dk7|z}|j S)Nrrdtyper]rr^)r_catzerosrOrr]any)tensor zero_mask shifted_masknon_zero_after_zeros r*check_non_zero_after_zeror su! I99 KK Q%**V]] S a"f    L"Q;,6  " " $$r,c|j\}}||zdk(r|S|||zz }|j||j|}tj||fdS)Nrrr^)rOnewfill_r_r)r. patch_sizefill_id batch_sizer remaining final_paddings r* fill_tokensrsb ,,Jq  :!55  :y9??H yy&-0a88r, hash_func_nbc&gd}tj||tj|j}tjt |j dDcgc]}||z c}}tj||zdScc}w)N) iʚ;l21Ai oYlvtl.l}glAul 0lTlAK l|rrr^)r_rint64r]rfrangerOry)rjrprimesprimerr prime_powerss r*rolling_polynomial_hashr sv F LL -U[[ RE;;% 2DE2DQq2DEFL 99Q%2 .. Fs! B group_sizemax_hashcftj5|j\}}tj||dz tj|j }tj ||gd}|jd|d}t||}||z} dddd _ | S#1swYxYw)aX Returns a hash of the input x and maps it to a value in the range [0, max_hash]. expects: x of shape (batch_size, seq_len) with values as ids in the token vocab. returns a tensor of shape (batch_size, seq_len) with values in the range [0, max_hash]. Note: max hash can make a big difference on the number of collisions. rrr^NF) r_no_gradrOrrr]runfoldr requires_grad) rJrrrrRrprefixwindowshasheshash_values_ranges r*byte_group_hash_functionr's gg GRau{{188T IIvqkq )((1j!,(,?"X- ',#  s BB''B0c |j\}}|ss|jdj|||}tj||j jdjdj|||}nr|jdj|||}tj||j jdjdj|||}|||k(}|S||k|||zkz}|S)aJ Creates a tensor of shape [bs, seq_len, num_patches] where each element at position (i, j, k) is True if the patch id at position (i, j) is less than or equal to k. Args: patch_ids (torch.Tensor): Tensor of shape [bs, seq_len] containing patch ids. num_patches (int): Total number of patches. window (int): If not None, only considers patches within a window of size window. patches_as_queries (bool): If True, the patches are used as queries Returns: torch.Tensor: Tensor of shape [bs, q_len, kv_len] with the desired mask. rr\rr)rO unsqueezerPr_r`r]) patch_ids num_patcheswindowpatches_as_queriesrRrq_idskv_idsrs r*create_patch_mask_from_idsr06s//KB ##B'..r7KH LLY-=-= > Yq\ Yq\ VB - $$Q'..r;H LLY-=-= > Yq\ Yr] VB W -  ~ K%EFVO$;< Kr,c z |jd}tj5t||jd||j ||rdnd |r|jd|zn|}|r|n|jd|z} j||| fk(sJ jd||| fd}|r! fd} t | |d|| d }|cdddStj tjd tjtd jdcdddS#1swYyxYw) Nrr)r,r-rr^ != c|||fSr$r%)r&r'r(r) cross_masks r* patch_maskz#cross_attn_mask..patch_maskxs!!UF"233r,T)BHQ_LENKV_LEN_compilerz-inf) rOr_r r0repeat_interleaver whererrEr)) r* patch_lengthsNr- cross_attn_kr,rrRq_lenkv_lenr5r4s @r*cross_attn_maskrB\sS  B /     "1   L3Ea2  N  :L ##A&5QR(m.A.A!.D|.S   $   : tR$7#8 9  :   4+ J; >;;ELL-u||E&M/JiC sB$D1AD11D:enforce_patch_size_multiplenb_boerboe_idc|j\}}|}|}|dkDr:|j||j|} tj| |fd}|r"|jd|zdk7r t |||}|d|fS)a This function returns X_et, X_gt and X_dt, the encoder, global, and decoder tokens respectively. Consider the input and target sequences: X=[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13] Y=[4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13,14] with patch_size=4 Note 1: that there will be no special tokens introduced at the patch level. Note 2: X_e needs to be trimmed to be passed to Global Current without boe: X_et = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]] X_g = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]] # remove last glob patch X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]] Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]] --> lag fix: X_et = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11] [12,13,pad,pad]] X_g = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11]] X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]] Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]] Dynamic (current): X = [3,4,5,6,7,eos,bos,8,9,10,eos,bos] Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11] entropy patching: input: 7, bos, 9, 10 pred (high entropy): eos, 8, 10, eos X_et = [[boe,3,4,5,6,7,eos,bos,8,9,10,eos,bos] X_g = [[boe], [3,4,5,6], [7,eos],[bos,8],[9], [10,eos]] X_dt = [[3,4,5,6], [7,eos], [bos,8],[9], [10,eos],[bos]] Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11] --> lag fix no boe (force single byte first patch): X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4,5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch X_dt = [[3,4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]] Y = [4,5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13] input: 4, 7, bos, 9, 10 pred (high entropy): 5, eos, 8, 10, eos X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]] Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13] Handle the last byte properly. patch_lengths = [1, 1, 3, 2, 2 1 2 2 1] X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # do not remove last global patch X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11] [12]] Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12, 13]] bpe delim X_et = [[3,4,5,6,7,,eos,bos,,8,9,,10,,eos,bos,11,12] X_g = [[3], [4,5,6,7,], [eos,bos,], .. X_dt = [[3,4,5,6,7], [,eos,bos], [,bos,8], .. Y = [4,5,6,7,, eos,bos, 8,9,, .. Note 1: that there will be no special tokens introduced at the patch level. Note 2: X_e needs to be trimmed to be passed to Global rrr^rN)rOr rr_rr) r.rCrDrrErrlocal_encoder_tokenslocal_decoder_tokens padded_patchs r* get_blt_inputrJsX!,,J!! zzz*f5;;FC $yy,8L)MSTU#';'A'A"'E 'RVW'W*+?VT '; ;;r,cJeZdZddedeffd ZdefdZdZdZd dZ xZ S) LocalModelBaseconfigcomponent_typec t|||_|dk(rk|j|_|j |_|j|_|jxs |j|_ d|_ |j|_ n~|dk(rk|j|_|j|_|j |_|jxs |j|_ d|_ |j|_ nt#d||j$|_|j&|j(z|_|j*|_|j,|_|j.|_|j0|_|j2|_t5|dd|_t5|dd|_t5|dd|_|j<|_t@|_!d|_"|j|j|jFt5|dd|jHt5|d d t5|d d|jJd }tMjNtQ|j Dcgc] }tS|c}|_*|j.s&tMjVd |j|_,n^t[|jH|jFxs|j|jz|j|j\|_/d|_,|dk(r#|j`|_1|jd|_3n'|dk(r"|jh|_1|jj|_3|jbE|jb|jk7r,tMjl|jb|jdnd|_7|jq||_9ycc}w)Nencoderlocal_block_causaldecoderzUnknown component_type: cross_attn_encodercross_attn_decoderr?rTrrrrLrrUrTrrrrirXrUrrYFr):rrrMdim_local_encoderrLn_layers_local_encodern_layersn_heads_local_encoderrmax_encoder_seq_lengthrr1local_attention_window_lenr/dim_local_decodern_layers_local_decodern_heads_local_decoderr:dropout vocab_sizepm_sizerr0use_ropeinit_std_factor init_base_stdgetattrrSrTr? eos_token_idr-rrEcross_attn_layersrUrrr ModuleListrrlayers Embeddingpos_embeddingsrrYropeencoder_dim_token_emb dim_token_embencoder_dim_patch_emb dim_patch_embdecoder_dim_token_emb dim_globalrtoken_embedding_projection_create_patch_projectionpatch_embedding_projection)rrMrN layer_params_rs r*rzLocalModelBase.__init__s7   Y &//DH"99DM!77DL$;;Pv?P?PDO"6D "("C"CD  y (//DH"99DM!77DL$;;Pv?P?PDO"6D "("C"CD 77GHI I~~  ++fnn< ++)) %55#11")&2F"M")&2F"M#FNDA))  "&88||!&,= ++"6=#>")&2F"M  mm8=dmm8L M8L1  .8L M  }}"$,,tTXX">D '''DDHH ,D??/5/T/T DI #'D  Y &!'!=!=D !'!=!=D  y (!'!=!=D !'!2!2D !!-$2D2D2P IId(($(( ? ' +/*G*G*O'9 Ns&Oc|jduxr|j|jk7}|jxr |jxs|jxr |j}|xs|Sr$)rqrLrScross_attn_init_by_poolingrT)rrMdimension_mismatchcross_attn_conditionss r*_should_create_patch_projectionz.LocalModelBase._should_create_patch_projectionDsl   d * Mt/A/ATXX/M   % % K&*K*K!O''MF,M,M ":%::r,c|j|sy|j|jxsdz}tj|j |dS)NrF) in_features out_featuresr)r}ror?r rrq)rrM output_dims r*ruz'LocalModelBase._create_patch_projectionPsN33F;''4+<+<+AB yy**#  r,c,||S|j|Sr$)tok_embeddingsrr.embedss r*apply_embeddingzLocalModelBase.apply_embedding\s  M&&v. .r,c|jjt|dr|jj|xs|jdz}t|dr>t j j|jjd|d|zd|z|j>t j j|jjd|d|zd|zt|jD]<\}}|jj|}|j|j |>t|dr>t j j|j"jd|d|zd|z|j$>t j j|j$jd|d|zd|z|j&M|j(dz}t j j|j&jd|d|zd|z|j*Kt|j*D]2\}}|jj|}|jd|4yy) Nnormrrrrorrr)rmrrrrLr rrrrrlrqrjrMget_init_std_factorrrerrtrvrqrh)rrdepthlayerr patch_emb_stds r*rzLocalModelBase.init_weightsbsF ""$ 4 II & & (3T 2 4) * GG ! !##**x-h, "     * GG ! !##**x-h, " &dkk2LE5[[44U;F   t116 :3 4 " GG ! ! ""x-h, "   * * 6 GG ! !//66x-h, "   * * 6 ..48M GG ! !//66!}$m# "   ! ! - )$*@*@ A u88?""40!B .r,)rPr$) rrrrrrr}rurrrrs@r*rLrLs8SPySP#SPj ;i ;  / :1r,rLcPeZdZdeffd ZdZ ddejdeejdeejdee deje fd eejd ee d eejd ee e ejeje ffd ZdZxZS) LocalEncoderrMc t||d|j|_|j|_|j du|_|j|_|j|_|j|_ |j|_ tj|j|j|_|jrt jj#|_|jr |j&nd}t)|D]j}|j$j+t-|j|j|jz|j|j|j.lyy)NrPrNrrLrUrrTr)rruse_local_encoder_transformerapply_transformerdownsampling_by_poolingencoder_hash_byte_group_sizeexpects_hash_embeddingsrScross_attn_all_layers_encoderrzcross_attn_nheadsr rkrarLrr_rirhrYrappendBLTCrossAttentionrrrM layers_to_addrxrs r*rzLocalEncoder.__init__s,  :!'!E!E'-'E'E$'-'J'JRV'V$"(";";-3-Q-Q**0*K*K'!'!9!9 ll4??DHHE  " "%*XX%8%8%:D "-1-O-ODMMUVM=)&&--% HH!%T-C-C!C $ 6 6#'#9#9!' * #r,cR||jsJd|S|j|S)Nz&Not expecting embeddings to be passed.)rrrs r*rzLocalEncoder.apply_embeddings7  ,, 87 8,M&&v. .r,r.r patch_embedsrr r4r+r*cachec |j\} } |/t| |jd|j||j}|j ||} |j r|j| nd} tj| |j|j} t|jD]\} }|| || |j} |js)| t|jdz k(s |jsQ|j rj|h|j#| |d|}|j$H|j%|}|j'| |jd|j(z|j*}|jr| nd }|j,||| | }||z}|jr|nd}| |f|fS) rNrQr/r.r-r<ptrainingrrr0ramaxrrJkvr)rOr=r0r/r-rrcrmrAr`rrqrjrSrrrz patch_reducervrQr?rLrh)rr.rrrr4r+r*rrRr<r'rlrrr layer_idxpatch_embeds_cross h_residuals r*rzLocalEncoder.forwards\\ F <%$#22{{ D   004 DIIVI,4 IIa4<<$-- @!$++.HAuadY$..QA&&S%))T-O-O22|7K#'#4#4Q VY#WL66B'+'F'F|'T '3';'; 2 21 58I8I I488( "&!C!CA %FT%;%;I%F"#&" ,.@@ +/.&*%<%<\$ :%%r,c*|j\}}}|jdjdd|jd}tj|||f|j |j }|j|d||d}|ddd|ddf}|S)a Reduce variable length patches to single embedding per patch Note: this works with variable number of patches for different sequences in the batch It handles variable length patches by assuming that patch_lengths will be 0 for any extra patches on the *right*. Since there can be a variable number of patches this function also return the number of patches for each sequence in the batch. Any embeddings on the right that are not allocated to a patch (i.e. if the sum(patch_lengths[i]) < seq_len for any i) will be sent to a dummy patch, which is trimmed before returning. rrrF)srcrLindexreduce include_selfN)rOr)rPr_rrr]scatter_reduce) rr'max_num_patches reductionr*rRremb_dim reduced_embss r*rzLocalEncoder.patch_reduces !wwGW''+222r1772;G {{ ' *!''!(( $22 3 $A'7'7$:; r,)NNNNNNN)rrrrrrr_rrrrr6rrrrrrs@r*rrsy6/*./3@D-1%),0HL4& 4&&4&u||, 4& u[%,,;<= 4& U\\* 4&c]4&ELL)4&U5<<s#BCDE4&pr,rceZdZdeffd Z d dej deej deej deedej e fdeej d ee e ej ej e ff d Z xZS) LocalDecoderrMc t||d|j|_|j|_|j|_|j |_t |j|j|_ |jrtjj|_ |jr |jnd}t|D]j}|jj!t#|j|j|j z|j |j |jltj$|j|j&d|_y)NrRrrrrFr)rrrTcross_attn_all_layers_decoderrzrrrLrrr_r rirhrYrrrrrarrs r*rzLocalDecoder.__init__s  :#)";";-3-Q-Q**0*K*K'!'!9!9DHH&//:  " "%*XX%8%8%:D "-1-O-ODMMUVM=)&&--% HH!%T-C-C!C $ 6 6#'#9#9!' *ii HH     r,r.rrrr r4rc|j\}}|Jd|/t||jd|j||j}|} |j ]|Jd|j |}|j 7|j||jd|j z|j}||js| |z} |jr|j|nd} tj| |j|j} t|j D]U\} } |jr.| dk(s |j"r|j$| | || } | | z} | | || |j } W|j'| }tj||j|j}|j)|}|j+}||fS) NzEmbeddings must be providedrQrz Patch embeddings must be passed.rrrrrr)rOr=r0r/r-rvr?rQrLrTrcrmrAr`rrqrjrrhrrrE)rr.rrrr4rrRr<r'rlrrrh_crossh_predss r*rzLocalDecoder.forward:s\\ F!@#@@! <%$#22{{ D   * * 6+ O-O O+::<HL  ,+33 **1-0A0AA488    #D,C,CL A04 DIIVI,4 IIa4<<$-- @!$++.HAu&&Q$<<4$003## KadY$..QA/))A,))Gt||dmmL++g&--/~r,NNNN)rrrrrr_rrrrrrr6rrrs@r*rrs y F04@D-1HL8 8&8u||, 8 u[%,,;<= 8 U\\* 8U5<<s#BCDE8r,rc eZdZdZdededededef fd Z ddejd ejd e e e e fd ejfd Z dd edefdZxZS)rzn BLTCrossAttention block to attend to the encoder states from the decoder. Rope is not supported. rLrUrrTrct|||_||_||_||_|j|j z|_tj|||_ t|||_ tj|||zd|_ tj|||zd|_ tj|||zd|_tj||z|d|_y)NrFr)rrrLrUrrTrr rcross_attn_norm_qcross_attn_norm_kvrrrrr)rrLrUrrTrrs r*rzBLTCrossAttention.__init__{s    $#||t>!#CX!>")#8"<))  h   ))   !  ))   !  )) h   r,rJrrrMc|j\}}}|j\}}}|j|}|j|}|j|} |j |} |j |} | j} | j |||j|j} | j |||j|j} | j |||j|j} t| |jd} t| |jd} td| | | f\} } } t|tr|dk(nd} t|tj r|nd}|j#| j$j#| j&}t)j*| | | | |}|j-ddj/}|j1|j3| }||zS) Nrr^c&|jddSrrrs r*rz+BLTCrossAttention.forward..s1;;q!#4r,r5F)rrr)rOrrrrrrgrrUrTrVrrrrr_rrarr]rArrrrrQ)rrJrrrrrxslen_kvx_normrurvrrrrs r*rzBLTCrossAttention.forwards''Wa 7A''*  $ $R ( WWV_ WWR[ WWR[xx WWS'4<< ? WWS'4??DMM B WWS'4??DMM B r4//Q 7 r4//Q 74r2rlC B*4T3*?TX%U !$ 5t4wwRXXw&))"))4//     !!!Q'224 566zr,base_stdrc|xs|jdz|z }tjj|jj d|d|zd|ztjj|j j d|d|zd|ztjj|jj d|d|zd|ztjj|jj d|d|zd|z|jj|jjyr) rLr rrrrrrrrrr)rrrrs r*rzBLTCrossAttention.init_weightss7488-7  GGNN3h#g    GGNN3h#g    GGNN3h#g    GGNN3h#g   //1 002r,r$)r[)rrrrr6rErr_rrrr rrrrrs@r*rrus ( ( (  (  (  ( \15 * <<* LL*uY^,- *  *X#3U#3E#3r,rceZdZfdZ d dej deej deej deeeej e fdee e ej ej e ff dZ dZxZS) GlobalTransformerc Pt|||_|j|_|j|_|j |_|j |_|j|_|j|_t|j|jxs|j|jz|j|j|_t|dt|dd|_|j|j|jt|dd|jt|ddt|dd|j"d }t%j&|_t+|j,D]&}|j(j/t1|(|j2|_|j4|_d|_|j4L|j4|jk7r2t%j8|j4|jd |_yyy) NrVr-rgrrTrrrrUFr)rrrMrLrer0r1rdrrrrUrrYrope_embeddingsrfr-rr rirjrrYrrr`rortr)rrMrwrxrs r*rzGlobalTransformer.__init__s  ::#11))$33%55 ++.##__D fnn(D((+1+P+P  fhPQ0RS 88~~!&,= ++"6=#>")&2F"M  mmo v'A KK  2<@ A(~~ #11*.'    +0D0D0P.0ii$$ /D +1Q +r,r.rrrrc|j\}}|}||n-t||j|j||j}|j -|jd|j k7r|j |}tj||j|j}|j|j|} t|jD]\} } | || |||j}||fS)N)r.r-rrr<rr)rOr=r0r1r-rtrLrAr`rrrrqrj) rr.rrrrrRr<r'rrrrs r*rzGlobalTransformer.forward)s\\ F  ###{{    * * 61772;$((;R//2A IIa4<<$-- @''t'P!$++.HAua7XA/%xr,c|jjt|jD]<\}}|jj |}|j |j|>|jdz}|j?tjj|jjd|d|zd|zyyr)rrrqrjrMrrrerortr rrr)rrrrrs r*rzGlobalTransformer.init_weightsMs --/%dkk2LE5[[44U;F   t116 :3   T*  * * 6 GG ! !//66s(c' "  7r,r)rrrrr_rrrr rrrr6rrrrs@r*rrs/h+/)->BHL " "%,,'"& " uY c9:; " U5<<s#BCDE "Hr,rrGencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsrencoder_hash_byte_group_vocabc|y|j|}d}t|D]-}|D]&} t|| ||} ||} || | z}|dz }(/|t|k(sJ|S)a Compute embeddings using hash token embeddings. Args: local_encoder_tokens: Input tokens tensor local_encoder: Encoder object with tok_embeddings method encoder_hash_tok_embedding: ModuleList of hash token embeddings encoder_hash_byte_group_nb_functions: Number of hash functions encoder_hash_byte_group_size: List of byte group sizes encoder_hash_byte_group_vocab: Vocabulary size for hash embeddings Returns: torch.Tensor: Combined embeddings Nr)rrr)rrr'r) rG local_encoderrrrrlocal_encoder_embedsrrfunc_nbbyte_group_sizehash_idshash_tok_embeddings r*compute_hash_embeddingsr^s,")(778LM A=>;O/$$6 H " #7:LX:V#V FA <? ./ // / r,c<eZdZdZeZdZdZgdZdgZ dZ dZ dZ dZ y) BLTPreTrainedModela An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained BLT models. This class provides the interface for model loading, saving, and weight initialization for all BLT model variants. It inherits from [`PreTrainedModel`] which provides the core functionality for working with HuggingFace models. Args: config ([`BLTConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. modelT)rrrrpast_key_valuesFcy)z]Initialize the weights - this is called by PreTrainedModel but we delegate to our custom initNr%)rmodules r* _init_weightsz BLTPreTrainedModel._init_weightss r,N)rrrrr config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_classrr%r,r*rrs? L&*#d#4"5"N! r,rceZdZdZdeffd ZdejdedejfdZ dejdededejfd Z d d ejde ejfd Z d Z xZS)BLTModelav The BLTModel (BLT) is a byte-level language model architecture that processes byte sequences by dynamically segmenting them into patches. It uses a combination of local encoders, global transformers, and local decoders to efficiently encode and decode byte sequences, leveraging patch-based processing for improved performance and inference efficiency. rMct ||||_t||_t |di|j }|j|_|j|_ |j|_ |j|_|j|_t#||_t'||_t+||jj|j,|_|j0rg|j2rS|j4F|j4}t6j8j;|st=d|t6j8j?|d}t6j8j;|st=d|tA|5}tCjD|jG}dddtIjJtHjLd}tNjQd|d|_)|d|_*|d |_+|d |_,|d |_-|d |_.d |_/d|_0d|_1te||_3t6j8j?|d}tIjhtHjjjmrdnd} |jfjotIjp|| dd|jfjs| |jfju|_3|jfjwD] } d| _< nd|_3|j{y#1swYxYw)N)local_encoder_dimrz.Entropy model checkpoint directory not found: z params.jsonzparams.json not found in: entropy_modelzFUpdate checkpoint to load attn and sliding window args from checkpointrLrYrrrrarQr3izconsolidated.pthcudacpu) map_locationrF)strictr%)>rrrMrrtypeto_dictrsrLn_layers_globalrYn_heads_globalrn_kv_heads_globalrTglobal_dim_patch_embrorglobal_transformerr local_decoderinit_hash_embeddingsrrpatch_in_forwardrealtime_patchingentropy_model_checkpoint_dirr7pathexistsFileNotFoundErrorjoinopenjsonloadsreadr_set_default_dtypebfloat16loggerwarning patcher_dimpatcher_n_layerspatcher_n_headspatcher_max_seqlenpatcher_ffn_dim_multiplierpatcher_vocab_sizepatcher_attn_bias_typepatcher_attn_implpatcher_sliding_window BLTPatcherpatcherr]r is_availableload_state_dictloadraeval parametersr" post_init) rrM global_configr params_pathfrreloaded model_params state_pathr]paramrs r*rzBLTModel.__init__s   *&1%V 8v~~'78 "-- !'!7!7  & 5 5 #)#;#;  &,&A&A #"3M"B)&1+? "0044)/)L)L+ '  " "''F,O,O,[/5/R/R,ww~~&BC+.\]y\z,{||!ggll+GW ww~~k2+.HIeHf,ghh+&"#zz"'')4H'''7'8 \ &2%%8"*6z*B')5i)@&,8,F)4@AU4V1,8,F)0D-+1(03- *&1 WW\\02D  0G0G0IfuU ,,JJz?HQV- '#||002 !\\446E*/E'7 $  S'&s %$M22M<r=rrMc|j\}}tjtj|d|j|j |j dddddfgd}tj||j }|jd|jdjdk}|jddz }|S)u Convert patch lengths to patch IDs for each token position. For each token position in the sequence, determines which patch it belongs to. Args: patch_lengths: [batch_size, num_patches] - length of each patch seq_len: total sequence length Returns: patch_ids: [batch_size, seq_len] - patch index for each token position Example: patch_lengths = [[3, 2, 4, 1]] # 4 patches of lengths 3,2,4,1 seq_len = 10 Returns: [[0, 0, 0, 1, 1, 2, 2, 2, 2, 3]] # pos 0-2→patch 0, pos 3-4→patch 1, pos 5-8→patch 2, pos 9→patch 3 rrrr^Nr\r) rOr_rrrr]cumsumr`r)ry) rr=rrr+ patch_startstoken_positionsposition_ge_patch_startr*s r*_patch_ids_from_lengthsz BLTModel._patch_ids_from_lengthss&#0"5"5 Kyy KK A]-@-@I]I] ^  R (CRC 0"   ,,w}7K7KL#/"8"8";?X?XYZ?[?e?efh?i"i,//B/7!; r,rDc<|ddddf}|j||S)a< Create decoder patch IDs by skipping the first encoder patch. The decoder starts after the first patch (which contains BOE tokens), so we need to map decoder positions to the remaining patches. Args: patch_lengths: [batch_size, num_patches] from encoder nb_boe: number of beginning-of-example tokens in first patch seq_len: decoder sequence length Returns: decoder_patch_ids: [batch_size, seq_len] mapping decoder positions to patch indices Nr)r))rr=rDrdecoder_patch_lengthss r*_decoder_patch_ids_from_lengthsz(BLTModel._decoder_patch_ids_from_lengths+s+ !.ae 4++,A7KKr,r.c |j\}}t|jjdk7rdn|jjdz }t |d||jjt \}}}|D|jjtjk(r|j||jjd|jj|jj|jj|jj|jj|jj \}}}n|j\}} | dz} t!j"|| f|j$|j&}|jj|j)D cgc],} t*j-| |jj.}} t/|D cgc] } t1| c} } |D cgc]} t3| d| }} t!j4||j$|j&}t7|rJ|dk7j9dg jj;d j=} |ddd|jd| z f}n|dkDr|dddfxx|z cc<t!j<|dk\sJ|j?||jd }t!j.|dzt!j.|dk7jAd ksHJt!j.|dzd t!j.|dk7jAd d}|jjBrNtE|||d|jjF|jjH|jjJ}tM||jN|jP|jjR|jjT|jjV}|jO||d||jd|\\}}}|jY||jdd }|j[|jd|jdj]t }t!j^||jj`k(\}}|||f}|jj`|||f<|jc||\}}|dd|||zddf}|je|||jd }t!j.|dz|jdks/Jt!j.|dzd |jd|jd|jdk(s&J|jdd|jd|jjfsdt!jh|d|jkd jmd d |jd }d}|j|jdd k(sPJtE|||d|jjF|jjn|jjJ}|jq||||\}}|Scc} wcc} wcc} w)NrrF)r.rCrDrrET)rinclude_next_token threshold threshold_add monotonicitymax_patch_lengthpatching_batch_sizer]rrdimsr^rz > )r-r?r,r)rGrrrrr)r.rrr4r+r*)rr.r2)rrr.r4)9rOr6rM patching_moderrJrrentropyrpatching_thresholdpatching_threshold_addr2r3r4patching_devicer_onesrr]tolistrsplit_large_numbersmaxrrrr flipargmaxminr)ryrSrBr?cross_attn_window_encodercross_attn_use_flex_attentionrrrrrrrgr rr<rgrr,rTgatherr)rPcross_attn_window_decoderr)rr.r=rRr>rDrGrxrHrseq_len_next_tokplrlast_non_zero_col_reversedr*cross_attn_mask_encr h_encoderr cache_encoderr' global_tokensrowscols eos_patch_ids dec_embedsdecoder_patch_idscross_attn_mask_decrs r*rzBLTModel.forwardBs A$++33r9Qt{{?U?UXY?YZ8E(-{{-- 9 5a!5  {{((,<,D,DD&*ll(#{{55'+"kk<<"&++"D"D!%!9!9%)[[%A%A(, (G(G;;66'3 '#=!388 G#*Q; % )*2F2L2LUiUpUp! ;;//;#0"6"6"8%"8B#66r4;;;W;WX"8"%"]"C]r3r7]"CDGP]$^P]"Xb!W%EP]M$^$)LL%-A-G-GPdPkPk%M5]CCC#a'--A3-7;;=DDDKOOQ+!.L,,Q/2LLLL! zad#v-#yy'1,,,00 /55b9 yy#a'599 a  $ $ $ ,,   Yii "Q& 's599mq6H5M5MRT5M5U+V*W X Y # ;; ) )"1#'![[55{{<<;;DD#  7!5,,'+'F'F151a1a)-)Q)Q*.++*S*S  /3.@.@''*%++A. /A/ +Gm LL]003R 8 1771:qwwqz:@@H [[!59Q9Q!QR d!$*- -1[[-E-E dM)*&& ' 1 q&6A:"5q89 !@@ 6#7#=#=b#A  II' (1 , : @ii)*Q. /s1771:, ? @ :  # #A &**:*:1*= = D%%a( )j.>.>q.A-B C D ={{-- 1'11"5<5\\c v|jj|jj|jj|j_|jj dz}|jD]6}t jj|jd|d|zd|z8yyr) rrrrrrLr rrr)remb_stdembs r*rzBLTModel.init_weightss '') ,,. '')  * * 6((,,6G66%%JJ7l'k &7 7r,r$)rrrrrrr_rr6r)r,rrrrrs@r*rrsRyRh&U\\&C&TYT`T`&PLU\\LSVLadLiniuiuL415k k  -kZr,rcveZdZfdZ ddej deej deej deeeej e fde dzdee d e d ee d ee d e d ee de dee de fdZ ddZdZedZedZedZe ddZedZxZS)rc tt||||_|j|_|j |_|j|_|j|_ |j|_ |j|_|j}|j }|j"}|j$}|j&}|j(}|j*}|j,} |j.} |j0} t3||xs|j|z|j||_| |_t9|dr t;|ddn t;|dd} t9|dr t;|ddn t;|dd} t9|dr t;|ddn t;|d d}|j||| || ||d }t=j>|_ tC|D]&}|j@jEtG|(| |_$| |_%|d kDsJtLj<jO||j|_(tS|j| |_*t=jV|j|d |_,|jHr&|jPjZ|jX_-yy)NrVr patcher_n_kv_headsrTpatcher_multiple_ofrrrrrUrrFr).rrrMr rLpatcher_init_base_stdrerr0rr1patcher_init_std_factorrdrrr rpatcher_head_dimpatcher_rope_theta&patcher_rope_use_fp32_in_outer_productpatcher_norm_epsrpatcher_weight_tyingrpatcher_eos_token_idrrr-rrfr rirjrrr weight_tyingr/r_rkrrrrrr)rrMrYrrUrrYrrardr/rgrTrrrwrxrs r*rzBLTPatcher.__init__s   %%#9911$;;%== 33**((**.. )/)V)V&**.. 22 6622 .4W!4+I  # ELFTaDbWV%94@hopvyEGKiL ELVUbEcgf&;SAipqwzGILjM T[\bdqTrWV-I4PxAGI]_cyd88 $$&"4   mmo xA KK  2<@ A!),A~~#hh00TXXFDHH(3 ii HH     !%!4!4!;!;DKK  r,N token_valuesrGrrr0rr/r0r1r2r3r4r] enable_gradc | |jn|}g}g}tt|dd|j}|| z}t j |j |}|D]}||j|zz |z}t j||j|jd}t j||fd}|jd|}| |j| }|j\}}|j|}t!|||j"|j$||j&}|j)|d }t+|j,D]\}}|||d|| }|j/|j1|}|jd|jdd|j|z ddf}|j3||j5|}|j3|t j|d} | j|j} t j|d}!|!j|jdd}!|j\}"}#|r|#d zn|#}$|*|j7| |||| | }%|j9|%|$}&n.t j:|"|$f|j|j }&| |&j=D'cgc]}'|j?|'| }&}'tA|&D'cgc] }'tC|'c}'}(|&D'cgc]}'tE|'d|(}&}'t jF|&|j|j }&tI|&rJ|&dk7jKd gjMjOd j})|&ddd|&jd |)z f}&| |&|!fScc}'wcc}'wcc}'w)N max_lengthi F)rr]r"rr^rrrrr)r/r0r1r2rr5r6)(r0rCrfrr_splitrDnumelrrr]rrQrarOrr=r1r/r-rrqrjrrrr9find_entropy_patch_start_idspatch_lengths_from_start_idsr=r>r?r@rrrr rAr6rB)*rrerGrrr0rr/r0r1r2r3r4r]rf entropiespredsrh batch_numelsplitsripad_sizepadrr<r' chunk_maskrrrrrFpred_entropiesconcat_entropies concat_predsrRrrHpatch_start_idsr=rIrrJs* r*rzBLTPatcher.forwardJs"'0&7DNNY  |T:DOOL  #66 \113[AE"ekkmj&@AZOH++ ELLPUCIIucl2EMM"j1E!( ++KC##E*A+###22{{ J++64+HH%dkk25!Xt*PYZ3;;tyy|,D<<DJJrN3*%++-(**A-D LL !\\$/N   ^ ,AD!99YA6+33L4F4FGyyA. #++L,>,>q,A2F #(( G*<7Q;'  !"?? #5#+) @O!==!1M "JJ%&l.@.@I\I\M  '(..00B((-=>0 ];]r3r7];|js?tjj |jj d|d|zd|zyyr)rrLr rrrrrrqrjrMrrrerdr)rrrrrs r*rzBLTPatcher.init_weightss 88%     & &8m(l   --/%dkk2LE5[[44U;F   t116 :3  GG ! ! ""x-h, " !r,ctj|d}tj|}||z}|j d }|S)z scores: [bs, seq_len, vocab] returns [bs, seq_len] Computes the entropy for each token in the batch. Note: uses natural log. rr^)rArCr_expry)scores log_probsprobsp_log_pr9s r*r9zBLTPatcher.entropysDMM&b1  )$e#;;2;&&r,c`|j\}}|jdj}|dk(r5tj||f|tj |j }|Stj||j jdj|d}tj||f|tj |j }tj||fd}tj||fd}||j||ddd|f}|S)Nrr^rrr\) rOryr@r_fulllongr]r`r)repeatrrQ) patch_start_maskrR trunc_seq_len max_patchesrwr*extra_patch_ids all_patch_idspatch_start_mask_paddeds r*%patch_start_ids_from_patch_start_maskz0BLTPatcher.patch_start_ids_from_patch_start_masks+,22M&**q*1557 ! #jj]#jj'.. O2# ]3C3J3JK1A  $jj]#jj'.. O "IIy/&BJM&+ii!$4#451' #,,CDLLM+oOr,c tj|ddddf|dz }tj|ddddfdz |fd}||z dz}tj|dk\sJ|t |rJ||S)aI Calculate patch lengths from start ids. start ids: ex: [0, 1, 7, 7, 7, 7, 7], it has the start ids of the patches (here 0, 1), and then the rest are filled to the seq len. seq_len: ex: 7 length of the sequence returns the patch lengths: [1, 6] for the above example. Nrr^r)r_ full_likerallr )rwrlast_ids patch_end_idsr=s r*rlz'BLTPatcher.patch_lengths_from_start_ids s???1bqb5#97Q;G ?1ab5#9A#=x"HaP %7!; yy!+,@@,,];O O;r,c|jdd\}}tjddgtj|jj dj |d}|jd} |ddddf}|B||z} |j| dz dj} | jdj} n'||kD} |s | ddddf} tj| } tj|| | zfd} | S)az Use entropies to find the start ids of each patch. Use patch_size or threshold to figure out the total number of patches to allocate. When threshold is not None the number of patches is not constant between different sequences, but patches can be identified incrementally rather than decided globally using the entire sequence. Nrrrrr^r)rOr_rrr]r)rtopkindicessortvaluesrrr) rmrr0r1r2r/rRr first_idspreds_truncation_lenr+rwrs r*rkz'BLTPatcher.find_entropy_patch_start_idss" oobq) G LL!Quzz):J:J K Yq\ VB]  )  ae$  !Z/K'nn[1_!nDLLO-22q29@@O(94 %#3AssF#; (NNO_`O)) *>> ?Q r,cg}|D]K}||kDr3||kDr|j|||z}||kDr|j|;|j|Mt|t|k(sJt|dt||S)Nr2)rry)lstmnew_lstrrs r*r?zBLTPatcher.split_large_numbersGsA1u!eNN1%FA!eq!q!7|s3x'HCL>c#hZ)HH'r,) NNNNNTNNFNrNFr$)NNNFT)rrrrr_rrrr rr6rrErrr staticmethodr9rrlrkr?rrs@r*rrsGB $$(#'%))-"*.#$ $!p=llp=&p=%,,' p= uY c9:; p= : p=SMp=!p=E?p= p=p=#3-p=!p= p=p=d%2   >" ))V  r,rrc|jyg}|}|j}t|jD].}|D]'}|j t j ||)0t j|S)z;Initialize hash-based token embeddings for the BLT encoder.N)rrrrrr rkri)rMrr embeddingsrrrxs r*rrVsz **2JG$*$H$H! 6>> ?-A    1 .@ == $$r,)rrrrrr)g@F)r)rri0u)NF)FrNT)Senumrtypingrrrrrr_pydanticr r !torch.nn.attention.flex_attentionr r r rr"torch.nnrrAr7 contextlibrSEPrr6__annotations__rrrrrrr getLoggerr configuration_bltrrr modeling_utilsr!utilstransformers_loggingrr+rrr=rIrVrErrkrtrModulerrrrrr rrr'r0rBrJrLrrrrrilistrrrrr__all__r%r,r*rs44 $ZZ  $ "  C **     .4$"&!%  $J $J  LL4   $J 8  c    +0 P P P P%) PB"U\\"ell"S": 2  2  2 2|| 2  5<< %& 2"7,ehhoo7,tw 299w tC RYYC LB)"))B)J1 %9/S/&RW ||!$8;KN0=B#T -`_< LL_xvX>Xv3 3De eN) ,,) !# ) +. ) #' ) $' )  \\) X  :V!Vr S#Sl %%#'%2 r,