-@h86UddlmZmZddlmZmZmZmZmZddl Z ddl m Z ddl m Z ddl mZddlmZmZmZddlmZddlZddlZddl Z ddlZ ddlmZdd l mZdd lmZddlZddlZddlZdd lmZdd l m Z d Z!dZ"e#e$d<dZ%e#e$d<dZ&e#e$d<dZ'e#e$d<dZ(e#e$d<dZ)e#e$d<dZ*e#e$d<ejVZ+ejXZ-ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4e#ejjjmdddk(re jneZ8ndZ8dZ9dZ:d e jvd!e#fd"ZGd)d*e=eZ?d+Z@d,e jvd-e#d.e#d/e jvfd0ZA dd.e#d1e#d2eBd3eCfd4ZDd5e jvd,e jvd6e#fd7ZEd8e jvd9e jvd6e#d5e jvd/ee jve jvff d:ZFGd;de jvd/eIe jvfd?ZJGd@dAejZKGdBdCejZLGdDdEejZMGdFdGejZOGdHdIejeOZPGdJdKePe dLdMdNdOdPe4dQdRfiS ZQGdTdUe=eZRGdVdWe ZSdXZTdYZUddZZVd[ZWGd\d]ZXdd^ZYd_ZZd`Z[daZ\dbZ]dce=dzd/e^e#e#ffddZ_deZ`dfZagdgZbddhe#fdiZc dd,e jvdje#dhe#dke#fdlZd ddmZe ddnZfd&e jvdoeCdpe jvdqe#dre#f dsZgdtZhdue0fdvZiGdwdxejZjGdydzejZkGd{d|ejZlGd}d~ejZmGddePZnGddeZo ddeode#deIfdZpde jvdejde#deIde#d/e jvf dZrGddejeOe dLdMdNdOdPe0ddfiS Zsy))Enumauto)AnyListOptionalTupleUnionN)PyTorchModelHubMixin)model_validator)nn)create_block_mask BlockMaskflex_attention)Self) ConfigDict) functional) defaultdict) BaseModel BOS_IDEOS_IDPAD_IDBOE_IDBPE_IDOFFSET BYTE_UNITS)BaseTransformerArgsByteLatentTransformerArgsGlobalTransformerArgsLocalDecoderArgsLocalModelArgsLMTransformerArgs BLT_ALLOW_MISSING_FLEX_ATTENTIONFc*|j\}}}|jdjdd|jd}tj|||f|j |j }|j|d||d}|ddd|ddf}|S)ab Reduce variable length patches to single embedding per patch Note: this works with variable number of patches for different sequences in the batch It handles variable length patches by assuming that patch_lengths will be 0 for any extra patches on the *right*. Since there can be a variable number of patches this function also return the number of patches for each sequence in the batch. Any embeddings on the right that are not allocated to a patch (i.e. if the sum(patch_lengths[i]) < seq_len for any i) will be sent to a dummy patch, which is trimmed before returning. rdtypedevicerF)srcdimindexreduce include_selfN)shape unsqueezeexpandtorchzerosr,r-scatter_reduce)hmax_num_patches reduction patch_idsbsseq_lenemb_dim reduced_embss 0/fsx/ita_zaporozhets/blt/blt_wip/blt_one_file.py patch_reducerB;s77B##B'..r2qwwr{CI;; _g&aggahhL ..  /L #3O#3Q 67L c ||k\SN)br9q_idxkv_idxs rA causal_maskrJYs F?rCbatcheos_idc||k(}d|dddf<tj|\}}|dd|ddz |dd|ddz |jdzz}t|dj dzg|j zS)zL 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 -> 4 4 3 2 4 5 TNrrr)r6wherer3intitemtolist)rKrLmaskrowcolseqlenss rAtokens_to_seqlenrV]s F?DDBK{{4 HC12wSb!c!"gCR&8DJJqM%IIG A ! " #gnn&6 66rC)rLtokenssliding_window attn_implattn_bias_typerWrXc|dk(r@ttjjdd}|dk(ry|dk(ryt d|dk(rt t dd||Std|d |d ) NsdpaBLT_SUPPRESS_ATTN_ERRORrcausalrzSDPA attention being used, which doesn't have specialized attention implementations for block_causal and local_block_causal attention. To suppress this error and run the model anyway, set the environment variable BLT_SUPPRESS_ATTN_ERROR=1rz Attention z with z sliding window not implemented)rOosenvironget ValueErrorr rJNotImplementedError)seqlenrYrZrLrWrXr]s rAcreate_causal_maskreqsF"%bjjnn5NPQ&R"S X % "a 'A  & & dD&&II! 6.)99X Y  rCceZdZdZdZdZdZy) InitStdFactordisabled global_depth current_depth dim_ratioN)__name__ __module__ __qualname__DISABLED GLOBAL_DEPTH CURRENT_DEPTH DIM_RATIOrFrCrArgrgsH!L#MIrCrgc tjtj|jdj d|jdfi|S)N)end_dimr)Fnll_loss log_softmaxflattenfloat)predtargetkwargss rA cross_entropyr~sM :: dll2l.446;r"   rCxn_repr/returnc|dk(sJd|j\}}}}|dk(r|S|dddddddddfj|||||j||||z|S)z0torch.repeat_interleave(x, dim=2, repeats=n_rep)rzAOnly dim=2 is supported. Check the implementation for other dims.rN)r3r5reshape)rrr/r=slen n_kv_headshead_dims rA repeat_kvrsq !8XXX8%&WW"Bj( z !Q4  D*eX 6 T:-x 8rCendthetarope_use_fp32_in_outer_productcd|tjd|dd|dzj|z zz }tj||j}|r|j tj }tj ||j}|j|j}}tj|| ||fdjg|jddS)a Precompute the frequency tensor for complex exponentials (cis) with given dimensions. This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64 data type. Args: dim (int): Dimension of the frequency tensor. end (int): End index for precomputing frequencies. theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. Returns: torch.Tensor: Precomputed frequency tensor with complex exponentials. ?rrNr-rr/) r6arangerzr-tofloat32outercossinstackviewsize)r/rrrfreqstrrs rAprecompute_freqs_cisrs* 5U\\!S!4\qBHHJSPQ RE S.A% DD  KK5 ! ' ' )Eyy{EIIKC :5;;cT3," 5 : : OEJJL O! OQ OOrC freqs_cisseq_dimc||j}d|cxkr|ksJJ|j|j||jdddfk(s Jd|j|jft|jddDcgc]\}}||k(s||dz k(r|ndc}}ddgz}|j|Scc}}w) a  Reshape frequency tensor for broadcasting it with another tensor. This function reshapes the frequency tensor to have the same shape as the target tensor 'x' for the purpose of broadcasting the frequency tensor during element-wise operations. Args: freqs_cis (torch.Tensor): Frequency tensor to be reshaped. x (torch.Tensor): Target tensor for broadcasting compatibility. seq_dim (int): Sequence dimension index. Returns: torch.Tensor: Reshaped frequency tensor. rrzfreqs_cis vs x: Nrtrr)ndimr3 enumerater)rrrridr3s rAreshape_for_broadcastrs 66D  $    ??      7 9??AGG456 7 ?HPSQS >U >UdaQ'\Q$(]1>U  A E 9>>5 !! sB8xqxkc|jg|jddddd}|jg|jddddd}t|||j}||zj dj d}||zj dj d}|j ||j |fS)Nrrrr)rr3rrzsumrytype_as)rrrrxq_xk_xq_outxk_outs rAapply_rotary_embrs "** .bhhsm .R . .A .C "** .bhhsm .R . .A .C%3 egIo " "1 % - -a 0FIo " "1 % - -a 0F >>" v~~b1 11rCc teZdZdZ d dedededeffd ZdZ d de ed e e jfd Z xZ S) RotaryEmbeddingz RotaryEmbedding Module rr max_seqlenrc t|||_||_||_||_|j dt||||j dy)Nrr/rrrF) persistent)super__init__rrrrregister_bufferr)selfrrrr __class__s rArzRotaryEmbedding.__init__sb    $.L+   /3/R/R     rCct|j|j|j|j|j d<y)Nr.)rrrrrrrs rAreset_parametersz RotaryEmbedding.reset_parameterss32 **+/+N+N  srCrdtok_idxcn|duxs|du}|sJd||j|S||jd|Sy)a} Return freqs_cis corresponding to consecutive seqlen positions or the corresponding tok_idx positions Args: seqlen (int): Contiguous sequence length tok_idx (torch.Tensor[int]): Position indices of each token this overrides seqlen Returns: Tuple(torch.Tensor, torch.Tensor): Embedded input tensor and freqs_cis Nz(Should provide atleast seqlen or tok_idxr)r)rrdrtests rAforwardzRotaryEmbedding.forwardsWd"<t(;???t  >>'* *  >>!F+ + rC)iFNN)rlrmrn__doc__rzrOboolrrrr6Tensorr __classcell__rs@rArrsi/4      )-  2 OS,sm,5=ell5K,rCr attn_biastensorsc t|}t|r.|Dcgc]#}|jddg|jdd%}}|Scc}w)Nrrr)list isinstancerr3)rr to_transformrs rA_reshape_for_attn_biasr2sU=L)AMM 1  !R6!''!"+6 M Ns(Ac eZdZdededededef fd Z ddejdejd eejd ee e e fd e d ejf d Z ddZ xZS) Attentionr/rn_headsr rope_thetact|||_||_||_||_||_|j |j z|_tj|||zd|_ tj|||zd|_ tj|||zd|_ tj||z|d|_ yNFbias)rrr/rrrrheads_per_groupr Linearwqwkwvwo)rr/rrrrrs rArzAttention.__init__?s   $ $#||t>))  h   ))   !  ))   !  )) h   rCrfreq_cisrrRrYrcD|j\}}}|j|j|} |j|j|} |j |j|} | j} | j |||j |j} | j |||j|j} | j |||j|j} t| | d|d|\} } t|dr |jj| | |\} } t| |jd} t| |jd} |dk(rX|t|t sJt#d| | | f\} } } t%| | | |} | j'ddj)} n|d k(rt#d | | | f\} } } |"t|t*t,j.fsJt|t*r|d k(nd }t|t,j.r|nd}t1j2| | | || } | j'ddj)} nt5d|d| j7| }|j9|} | S)Nrrkv_cacherrrc&|jddSNrr transposees rAz#Attention.forward..q{{1a'8rC block_maskr\c&|jddSrrrs rArz#Attention.forward..rrCr^F) is_causal attn_maskzAttention implementation z not supported)r3rview_asrrrrrrrhasattrrupdaterrrrmapflex_attention_compr contiguousstrr6rrvscaled_dot_product_attentionrcrr)rrrrrRrYbszr>r/rrxv output_shapeoutputroutput_reshapeds rArzAttention.forwardgs^GGWc WWQYYq\ " WWQYYq\ " WWQYYq\ "xx WWS'4<< ? WWS'4??DMM B WWS'4??DMM B!"b!Xa-@AB 4 $]]))"b':FB r4//Q 7 r4//Q 7 ( (<:dI#> >>82r2,GJBB(REF%%a+668F & 82r2,GJBB<:dS%,,4G#H HH.8s.C)I%dELL94tD33# F%%a+668F%+I;nE !..6) rCc d|xs|jdz|z }|j|j|jfD]6}tj j |jd|d|zd|z8tj j |jjd|d|zd|zyNrrmeanstdarG) r/rrrr init trunc_normal_weightr)rinit_stdfactorws rArzAttention.reset_parameterss<T 2f<''477DGG,A GG ! !x-h, " -  GGNN8m(l  rCNNr\Nr)rlrmrnrOrzrr6rrr rrrrrrs@rArr>s& & &  &  &  & X+/04 : <<:,,:%,,' : uY^,- :  : :x rCrc ~eZdZ d dedededeedef fd Zdejdejfd Z d d Z xZ S) FeedForwardr/ hidden_dim multiple_offfn_dim_multipliermp_sizec^t|td|zdz }|t||z}|||zdz |zz}||zdk(sJ||_||_t j ||d|_t j ||d|_t j ||d|_ y)NrrrrFr) rrrOr/r r rw1w3w2)rr/r r r rrs rArzFeedForward.__init__s Z!+,  )/*<=J Z+%=%Ak$QR G#q((($))    ))    ))   rCrrc|j|j|}|j|j|}|jt j ||z}|SrE)rrrrrvsilu)rrx1x3rs rArzFeedForward.forwardsM WWQYYq\ " WWQYYq\ "b) rCc|xs|jdz|z }|xs|jdz|z }tjj |j j d|d|zd|ztjj |jj d|d|zd|ztjj |jj d|d|zd|zyr) r/r r rrrrrr)rrr in_init_std out_init_stds rArzFeedForward.reset_parameterss?488#5"? GDOO$=#G   GGNN;+o    GGNN<,    GGNN;+o  rC)rr) rlrmrnrOrrzrr6rrrrrs@rAr r sb ! ! !  ! %UO !  ! F%,, rCr c eZdZdeffd Z d dej dej deej deee e fde dej f d Z d d Z xZ S) TransformerBlockargscjt||j|jJd|jxs|j|jz|_|jxs|j|jz|_|j xs |j|_|j|j zdk(sJ|j|jzdk(sJt |j|j|j|j |j|_t|jd|jz|j|j|_ t|j|j|_t|j|j|_y)Nz+Should specify at least head_dim or n_headsr)r/rrrrr)r/r r r eps)rrrrr/rrr attentionr r r  feed_forwardRMSNormnorm_epsattention_normffn_normrrrs rArzTransformerBlock.__init__sK  ) LL $ 9 8 9  AT\\)A ||@txx4=='@ //9T\\||doo-222xx$,,&!+++"]]LL  (488|((#66   &dhhDMMBdmm< rCrrrrRrYrc|j|}|j|||||}||z}|j|} ||j| z} | S)NrrRrY)r$r r%r!) rrrrrRrYnorm_xattn_outr9h_normouts rArzTransformerBlock.forwardsi$$Q'>>   "  Lq!$##F++ rCc|jj|||jj|jj|||jjyrE)r rr$r!r%)rrrs rA init_weightszTransformerBlock.init_weights1sN ''&9 ,,. **8V< &&(rCrr)rlrmrnr#rr6rrr rrrr.rrs@rArrs=0=B+/04  <<,,%,,'  uY^,-    *)rCrc6eZdZejdefdZy)SequenceModelWithOutputrcyrErFrs rAget_output_seq_lenz*SequenceModelWithOutput.get_output_seq_len:s rCN)rlrmrnabcabstractmethodrOr2rFrCrAr0r09s! C  rCr0cveZdZdeffd ZdZ d deejdee e e fde fdZ dZ xZS) BaseTransformerrc~t||j|_|j|_|j|_|j |_t |j|_|j|_t|j|jxs|j|jz|j|j|_|j|_t!j"|_t'|j(D]&}|j$j+t-|(y)Nrrrr)rrr/ init_base_stdrYrZrginit_std_factorrrrrrrrope_embeddingsrLr ModuleListlayersrangen_layersappendrrr_rs rArzBaseTransformer.__init__@s 88!//"11,T-A-AB//.//]]>dhh$,,&>+/+N+N   kk mmo t}}%A KK  /5 6&rCc|jSrErrs rAr2z"BaseTransformer.get_output_seq_lenT rCrrRrYc|j|j|}t|jD]\}}||||||}|S)N)rdrr()r;rrr=)rr9rrRrYrrlayers rArzBaseTransformer.forwardWsK''t'P!$++.HAua7SA/rCc |jjt|jD]\}}tj d|dzzdztj dt|jdzzdztj|jdz tjdi|j}|j|j|y)Nrr?r)r;rrr=rgrqrplenrrr/ror:r.r9)rdepthrGrs rAr.zBaseTransformer.init_weightses --/%dkk2LE5++a519o#-E**Q#dkk2BQ2F-GC,O''D&&  "" $F   t116 :3rCr)rlrmrnr#rr2rr6rr rrrr.rrs@rAr6r6?s_707( +/04 %,,' uY^,-    ;rCr6c eZdZdeffd ZdZ d dejdeejdeejdee e eje fd e dzf fd Z dd Z fd ZxZS) LMTransformerrc"t|||j|_|j|_|jdkDsJt j j|j|j|_ t|j|j|_ t j|j|jd|_|jr0|jjj |j_yy)NrrFr)rr weight_tyingrX vocab_sizer6r Embeddingr/tok_embeddingsr"r#normrr embeddingsrr&s rArzLMTransformer.__init__s  --"11"""#hh00$((KDHH$--8 ii HH OO    !%!?!?!F!FDKK  rCctdNzFor meta authors: Do not push BLT weights with this, save weights with save_pretrained() then push them manually to HF hub to ensure the repository metadata is correct.rbrrr}s rA push_to_hubzLMTransformer.push_to_hub w  rCN token_valuesr|rrRrYcL| |j}|j\}}|j|}||n.t|||j|j ||j }t |!||||}|j|j|} | t| |S| S)NrXrWrLr() rYr3rSrerZrXrLrrrrTr~) rr\r|rrRrYrrdr9logitsrs rArzLMTransformer.forwards  I"(( V    - ####22#{{   GOAwTYO OTYYq\*   0 0MrCc8|jjyrE)rTr)rrs rArzLMTransformer.reset_parameterss ""$rCcr|j|jdz}tjj |j j d|d|zd|zt|!|js?tjj |jj d|d|zd|zyyr) rr/r rrrSrrr.rPr)rrrs rAr.zLMTransformer.init_weightss 88%     & &8m(l     GG ! ! ""x-h, " !rCNNNNrE)rlrmrnr(rrZr6rrr rrrrr.rrs@rArNrNrs G.G( *.*.>B $ ll & %,,'  uY c9:;  : D%rCrNz'https://github.com/facebookresearch/bltztext-generationotherz#fair-noncommercial-research-licensez5https://huggingface.co/facebook/blt/blob/main/LICENSEc&d|jiSNr model_dumprs rArr}vq||~.rCctdi|SNrF)r(datas rArr~s *2T2rC)repo_url pipeline_taglicense license_name license_linkcodersc$eZdZdZdZdZdZdZdZy)PatchingModeEnumentropybpe bpe_patcherspacestaticbyteN) rlrmrnrvrwrxryrzr{rFrCrArurus G CK E F DrCruceZdZUejZeed<dZeed<dZ edzed<dZ e ed<dZ e ed <dZe dzed <dZedzed <d Ze ed <dZeed<dZeed<dZe ed<dZe ed<ddZy) PatcherArgs patching_modecudapatching_deviceNentropy_model_checkpoint_dirFrealtime_patchingg]? threshold threshold_addmax_patch_lengthg@ patch_sizerpatching_batch_sizer- monotonicitylog_timect|SrE)Patcherrs rAbuildzPatcherArgs.builds t}rC)rr)rlrmrnrurvr~__annotations__rrrrrrrzrrrOrrr-rrrrFrCrAr}r}s&6&>&>M#>!OS!/3 #*3#t#(Iu("&M54<&#'cDj'J  FCL$HdrCr}c,||g|t|z zzSrE)rK)seqpad_idmax_lens rArightpadrs &Ws3x/0 00rCc|dk(}tjtj|jddtj|j |ddddfgd}|dk7|z}|j S)Nrrr+rr)r6catr7r3rr-any)tensor zero_mask shifted_masknon_zero_after_zeros rAcheck_non_zero_after_zerorsu! I99 KK Q%**V]] S a"f    L"Q;,6  " " $$rCcT|dk(rt}d|}|j|}||fS)Nrzcuda:)get_local_rankr) entropy_modelr-ranks rA to_devicers8 !$$V,M &  rCcg}|D]K}||kDr3||kDr|j|||z}||kDr|j|;|j|Mt|t|k(sJt|dt||S)N != )r@r)lstmnew_lstrs rAsplit_large_numbersr sG  q5a%q!Qa% NN1  NN1  w<3s8 #DG ~T#c(%DD # NrCc eZdZdefdZ d dej dedej dzdej dzded ej f d Z y) r patcher_argscX||_|j|_|j|_|jr|jJdtj j |jd}tj j|r|}n*tj j |jd}t|j|\}}|j|j}||_ nd|_ |j|_ |j|_ |j|_|j|_|j |_|j"|_|j$|_|j&|_|j&rt)t*|_yy)NzDCannot require realtime patching without an entropy model checkpointzconsolidated/consolidated.pthzconsolidated.pth)rr~rrr_pathjoinexistsload_entropy_modelrrrrrrrrr-rrrrzlog)rrmaybe_consolidated state_pathrrBs rArzPatcher.__init__sk()77!-!?!?  ! !99E VU VE!#99/" ww~~01/ WW\\ ==?Q  299  M1 *,,\-I-IJM!.D !%D %//)77 , = =&11#/#C#C ")) (55$-- =="5)DH rCNrWinclude_next_tokenpreds entropiesrrc|j\}}|r|dzn|}d} |jtjk(r/t j ||f|j |j} ntd|j|j| jD cgc]} t| |j} } t| D cgc] } t| c} } | D cgc]} t| d| } } t j| |j |j} t!| rJ| dk7j#dgj%j'dj)} | ddd| jd| z f} t j*| |j-||jdzzk(s@Jt j*| d |j-||jdzz|j.rc|j0d xxt3j2t4z z cc<|j0d xx| j+j7z cc<| | fScc} wcc} wcc} w) a tokens: 2D tensor of shape [batch_size, seq_len] that needs to be patched Returns patch lengths and optionally scores associated with the tokens (i.e. entropies, logprobs etc.) -> output tensor: [batch_size, max_num_patches] each tensor is processed independently and gets right padded with zeros. Patching with the following modes: 1. patching_mode = None: static patch size 2. patching_mode = "entropy": calculate entropy of each token, allocate patches so that the total number of patches is the same as static patching but choose to begin patches on tokens where the model is most uncertain (highest entropy). When threshold is provided, it uses the threshold to decide when to start a new patch. 3. patching_mode = "space": use space like tokens to define the patches. 4. patching_mode = "bpe": use bpe delim tokens to define the patches. To correctly patch the last token, it may be necessary to include the next token in the patch lengths calculations. This is controlled by the include_next_token argument. rNr+zself.patching_mode r)r)dimsrrpostprocessing_patch_lengthsrW)r3r~rur{r6onesr,r-rcrrQrmaxrKrrrfliprOargmaxminrnumelrrtimesrP)rrWrrrrr=r>seq_len_next_tokscores patch_lengthsplrlast_non_zero_col_reverseds rApatchz Patcher.patchBsr>ll G*<7Q;'   !1!6!6 6!JJ%&fll6==M&(;D0 ];]r3r7]; remaining final_paddings rA fill_tokensrsb ,,Jq  :!55  :y9??H yy&-0a88rCc|d}tj||dddfk(sJd||z dk(sJd|d|d|ddddf}|j|dz|jdzz|jk(s>J|j|dz|jdzzd|jtj|dk\sJ|t || }|S) N)rrrzRfirst patch should always be the same size (1 for dynamic, patch_size for static).rzFirst patch (patch length: z+) should have one non-boe token (boe toks: )r)rr>)r6allrr3patch_ids_from_lengths)rnb_boer>first_patch_lengthdecoder_patch_lengthsdecoder_patch_idss rAdecoder_patch_ids_from_lengthsr sG&t, 99mAqD11 \[\  V#q(n $%7$88cdjckklmn( *!QR%0!!#vz]5H5H5K&KK     i # # %!}7J7J17M(M M NdS`SdSdSfRghi  99*a/ 0L5J4KL 0.+W rC) iʚ;l21Ai oYlvtl.l}glAul 0lTlAK l| hash_func_nbc&tjt|tj|j}tj t |jdDcgc]}||z c}}tj||zdScc}w)Nr+rr) r6rprimesint64r-rr>r3r)rrprimer prime_powerss rArolling_polynomial_hashrsm LL -U[[ RE;;% 2DE2DQq2DEFL 99Q%2 .. Fs! B group_sizemax_hashcftj5|j\}}tj||dz tj|j }tj ||gd}|jd|d}t||}||z} dddd _ | S#1swYxYw)aX Returns a hash of the input x and maps it to a value in the range [0, max_hash]. expects: x of shape (batch_size, seq_len) with values as ids in the token vocab. returns a tensor of shape (batch_size, seq_len) with values in the range [0, max_hash]. Note: max hash can make a big difference on the number of collisions. rr+rNF) r6no_gradr3r7rr-runfoldrr) rrrrr=r>prefixwindowshasheshash_values_ranges rAbyte_group_hash_functionr s gg GRau{{188T IIvqkq )((1j!,(,?"X-  ',# # s BB''B0c |j\}}|ss|jdj|||}tj||j jdjdj|||}nr|jdj|||}tj||j jdjdj|||}|||k(}|S||k|||zkz}|S)aJ Creates a tensor of shape [bs, seq_len, num_patches] where each element at position (i, j, k) is True if the patch id at position (i, j) is less than or equal to k. Args: patch_ids (torch.Tensor): Tensor of shape [bs, seq_len] containing patch ids. num_patches (int): Total number of patches. window (int): If not None, only considers patches within a window of size window. patches_as_queries (bool): If True, the patches are used as queries Returns: torch.Tensor: Tensor of shape [bs, q_len, kv_len] with the desired mask. rrrr)r3r4r5r6rr-) r< num_patcheswindowpatches_as_queriesr=r>q_idskv_idsrRs rAcreate_patch_mask_from_idsr%?s//KB ##B'..r7KH LLY-=-= > Yq\ Yq\ VB - $$Q'..r;H LLY-=-= > Yq\ Yr] VB W -  ~ K%EFVO$;< KrCc v |jd}tj5t||jd||j ||rdnd |r|jd|zn|}|r|n|jd|z} j||| fk(sJ jd||| f|r! fd} t | |d|| d }|cdddStj tjd tjtd jdcdddS#1swYyxYw) Nrr)r!r"rrrc|||fSrErF)rGr9rHrI cross_masks rA patch_maskz#cross_attn_mask..patch_masks!!UF"233rCT)BHQ_LENKV_LEN_compilerz-inf) r3r6rr%repeat_interleaver rNrrzr4) r<rNr"rr!rr=q_lenkv_lenr)r(s @rAcross_attn_maskr3esN  B /     "1   L3Ea2  N  :L ##A&5QR(m.A.A!.D|.S   $   : tR$7#8 9  :  4+ J9 <;;ELL-u||E&M/JiA sB"D/AD//D8enforce_patch_size_multipler rboe_idc|j\}}|}|}|dkDr:|j||j|} tj| |fd}|r"|jd|zdk7r t |||}|d|fS)a This function returns X_et, X_gt and X_dt, the encoder, global, and decoder tokens respectively. Consider the input and target sequences: X=[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13] Y=[4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13,14] with patch_size=4 Note 1: that there will be no special tokens introduced at the patch level. Note 2: X_e needs to be trimmed to be passed to Global Current without boe: X_et = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]] X_g = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]] # remove last glob patch X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]] Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]] --> lag fix: X_et = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11] [12,13,pad,pad]] X_g = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11]] X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]] Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]] Dynamic (current): X = [3,4,5,6,7,eos,bos,8,9,10,eos,bos] Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11] entropy patching: input: 7, bos, 9, 10 pred (high entropy): eos, 8, 10, eos X_et = [[boe,3,4,5,6,7,eos,bos,8,9,10,eos,bos] X_g = [[boe], [3,4,5,6], [7,eos],[bos,8],[9], [10,eos]] X_dt = [[3,4,5,6], [7,eos], [bos,8],[9], [10,eos],[bos]] Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11] --> lag fix no boe (force single byte first patch): X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4,5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch X_dt = [[3,4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]] Y = [4,5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13] input: 4, 7, bos, 9, 10 pred (high entropy): 5, eos, 8, 10, eos X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]] Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13] Handle the last byte properly. patch_lengths = [1, 1, 3, 2, 2 1 2 2 1] X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12] X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # do not remove last global patch X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11] [12]] Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12, 13]] bpe delim X_et = [[3,4,5,6,7,,eos,bos,,8,9,,10,,eos,bos,11,12] X_g = [[3], [4,5,6,7,], [eos,bos,], .. X_dt = [[3,4,5,6,7], [,eos,bos], [,bos,8], .. Y = [4,5,6,7,, eos,bos, 8,9,, .. Note 1: that there will be no special tokens introduced at the patch level. Note 2: X_e needs to be trimmed to be passed to Global rrrrN)r3rrr6rr) rWr4r rr5rr>local_encoder_tokenslocal_decoder_tokens padded_patchs rA get_blt_inputr:sX!,,J!! zzz*f5;;FC $yy,8L)MSTU#';'A'A"'E 'RVW'W*+?VT '; ;;rCcX|j\}}tjtj|d|j|j |j dgd}|jdtj||j kjddz }tj||jdkDstj|dkrCJtj|d|jdd tj|d |S) Nrr+rrrrtr > z or z < 0) r3r6rr7r,r-cumsumr4rrrr)rr>r=r cum_dr<s rArrs#))OB II KKA]%8%8AUAU V  R (    E$ WU\\(RRWW X  I  )}22266%))I:NQR:R[ ))I  s=#6#6r#:";4 )@T?UUYZ[  rCrc|jdt|j|j|j|j dt |ddd }t|S)NTF) r/r?rrlocal_attention_window_lenrrrcross_attn_decoder)deepr) model_copydictrn_layers_globaln_heads_globaln_kv_heads_globalrGlobalTransformer)r global_argss rAcreate_global_transformerrJ sc// ))''--'+248$$  " K [ ))rCcDeZdZdeffd ZdefdZdZdZddZxZ S)LocalModelBaserct||j|_|j|_|j|_|j |_|j |_|j|_|j|_|j|_ |j|_ t|dd|_ t|dd|_ t|dd|_|j|_t |_t%j&t)|j*Dcgc] }t-|c}|_|js0t%j0|j2|j|_n^t7|j8|j:xs|j|j<z|j>|j@|_!d|_tE|drE|jF|jk7r,t%jH|jF|jdnd|_%|jM||_'ycc}w)NrrArr8rFr)(rrr/dropoutrQrrrYrXuse_roper:getattrrrArrLrr5r r<r>r?rr=rR max_lengthpos_embeddingsrrrrrrroperrrtoken_embedding_projection_create_patch_projectionpatch_embedding_projectionrAs rArzLocalModelBase.__init__s 88|| ////!//"11  #33")$0Dd"K")$0Dd"K#D.$?kk  mm-24==-A B-A d #-A B  }}"$,,t"ID 'ooB$((dll*B??/3/R/R DI #'D t_-$2D2D2P IId(($(( ? ' +/*G*G*M') Cs!Ict|dxr|j|jk7}|jxr |jxs|j xr |j}|xs|S)Nr)rPrr/rrrA)rrdimension_mismatchcross_attn_conditionss rA_should_create_patch_projectionz.LocalModelBase._should_create_patch_projectionIsh D/ * Mt/A/ATXX/M   # # G(G(G!K%%I$*I*I ":%::rCc|j|sy|j|jxsdz}tj|j |dS)NrF) in_features out_featuresr)rZrrr rr)rr output_dims rArUz'LocalModelBase._create_patch_projectionUsN33D9''4+<+<+AB yy**#  rCc,||S|j|SrE)rSrrWembedss rAapply_embeddingzLocalModelBase.apply_embeddingas  M&&v. .rCc |jjt|dr|jj|xs|jdz}t|dr>t j j|jjd|d|zd|z|j>t j j|jjd|d|zd|zt|jD]\}}tjd|d zzd ztjdt!|jd zzd ztj"|jd z tj$d i|j&}|j)d|t|d r>t j j|j*jd|d|zd|z|j,>t j j|j,jd|d|zd|z|j.M|j0dz}t j j|j.jd|d|zd|z|j2t|j2D]\}}tjd|d zzd ztjdt!|jd zzd ztj"|jd z tj$d i|j&}|j)d|yy)NrTrrSrrrrrrrIrJrr)rSrrrTr/r rrrSrrRrr=rgrqrprKrrror:r.rrTrVrcross_attn_layers)rrrLrGr patch_emb_stds rAr.zLocalModelBase.init_weightsgs ""$ 4 II & & (3T 2 4) * GG ! !##**x-h, "     * GG ! !##**x-h, " &dkk2LE5++a519o#-E**Q#dkk2BQ2F-GC,O''D&&  "" $F   tV ,3 4 " GG ! ! ""x-h, "   * * 6 GG ! !//66x-h, "   * * 6 ..48M GG ! !//66!}$m# "   ! ! - )$*@*@ A u!//!uqy/c1I!..c$++6F6J1KPS0S!++TXX_!**C  && (""40!B .rCrE) rlrmrnr'rrZrUrbr.rrs@rArLrLs.)N^)NV ;N ;  / F1rCrLcPeZdZdeffd ZdZ ddejdeejdeejdee deje fd eejd ee d eejd ee e ejeje ffd ZdZxZS) LocalEncoderrc t|||j|_|j|_|j du|_|j|_|j|_|j|_ |j|_ tj|j|j|_|jrt jj#|_|jr |j&nd}t)|D]j}|j$j+t-|j|j|jz|j|j|j.lyy)Nrr/rrrr#)rrrapply_transformerrencoder_hash_byte_group_sizeexpects_hash_embeddingsrcross_attn_all_layers_encoderrcross_attn_nheadsr rRrQr/rSr6r<rdr?r>r@CrossAttentionr#rr layers_to_addrBrs rArzLocalEncoder.__init__s' !%!C!C'+'C'C$'+'H'HPT'T$"&"9"9-1-O-O**.*I*I'!%!7!7 ll4??DHHE  " "%*XX%8%8%:D "-1-O-ODMMUVM=)&&--" HH!%T-C-C!C $ 6 6#'#9#9!% * #rCcR||jsJd|S|j|S)Nz&Not expecting embeddings to be passed.)rlrSr`s rArbzLocalEncoder.apply_embeddings7  ,, 87 8,M&&v. .rCrWra patch_embedsrRrr(r r<cachec h|j\} } |/t| |jd|j||j}|j ||} |j r|j| nd} tj| |j|j} t|jD]g\} }|| || |j} |js)| t|jdz k(s |jsQ|j!| || | |||}i|jr|nd}| |f|fS)rNrr^rdptrainingrRrrYr)r3rerYrXrLrbrOrSrvrNryrr=rrKrmapply_cross_attention)rrWrarsrRr(r r<rtr=rdr9rrrG h_residuals rArzLocalEncoder.forwards\\ F <%$#22{{ D   004 DIIVI,4 IIa4<<$-- @!$++.HAuadY$..QA&&S%))T-O-O#99|QKJ  /&*%<%<\$ :%%rCc<|jrd|bt||d|}|jH|j|}|j||jd|j z|j }|jr|nd}|j||||}||zS)NamaxrrrkvrR) rrBrVrr3rr/rmrd) rr9rs layer_idxr=r r<r(patch_embeds_crosss rAr{z"LocalEncoder.apply_cross_attentions  * *|/C(; JL..:#>>|L +33 **1-0A0AA488  "&!C!CI >T33I>  000rC)NNNNNNN)rlrmrnr'rrbr6rrr rrOrrrr{rrs@rArgrgs^6/*./3@D-1%),0HL'& '&&'&u||, '& u[%,,;<= '& U\\* '&c]'&ELL)'&U5<<s#BCDE'&R1rCrgceZdZdeffd Z d dej deej deej deedej e fdeej d ee e ej ej e ff d Z xZS) LocalDecoderrc t|||j|_|j|_|j|_|j |_t |j|j|_ |jrtjj|_ |jr |jnd}t|D]j}|jj!t#|j|j|j z|j |j |jltj$|j|j&d|_y)NrrriFr)rrrAcross_attn_all_layers_decoderrrnr"r/r#rTr6r r<rdr?r>r@rorrQrrps rArzLocalDecoder.__init__s #'"9"9-1-O-O**.*I*I'!%!7!7DHH$--8  " "%*XX%8%8%:D "-1-O-ODMMUVM=)&&--" HH!%T-C-C!C $ 6 6#'#9#9!% *ii HH OO  rCrWrarsrRrr(rtc|j\}}|Jd|/t||jd|j||j}|} |j ]|Jd|j |}|j 7|j||jd|j z|j}||js| |z} |jr|j|nd} tj| |j|j} t|j D]U\} } |jr.| dk(s |j"r|j$| | || } | | z} | | || |j } W|j'| }tj||j|j}|j)|}|j+}||fS) NzEmbeddings must be providedrr^z Patch embeddings must be passed.rrvrwrrrz)r3rerYrXrLrVrrr/rArOrSrvrNryrr=rrdrTrrz)rrWrarsrRr(rtr=rdr9rrrGh_crossh_predss rArzLocalDecoder.forward:s\\ F!@#@@! <%$#22{{ D   * * 6+ O-O O+::<HL  ,+33 **1-0A0AA488    #D,C,CL A04 DIIVI,4 IIa4<<$-- @!$++.HAu&&Q$<<4$003## KadY$..QA/))A,))Gt||dmmL++g&--/~rCrb)rlrmrnr'rr6rrr rrrrOrrrs@rArrs ^ F04@D-1HL8 8&8u||, 8 u[%,,;<= 8 U\\* 8U5<<s#BCDE8rCrc eZdZdZdededededef fd Z ddejd ejd e e e e fd ejfd Z dd edefdZxZS)rozk CrossAttention block to attend to the encoder states from the decoder. Rope is not supported. r/rrrr#ct|||_||_||_||_|j|j z|_tj|||_ t|||_ tj|||zd|_ tj|||zd|_ tj|||zd|_tj||z|d|_y)NrFr)rrr/rrrrr r"cross_attn_norm_qcross_attn_norm_kvrrrrr)rr/rrrr#rs rArzCrossAttention.__init__{s    $#||t>!#CX!>")#8"<))  h   ))   !  ))   !  )) h   rCrrrRrcL|j\}}}|j\}}}|j|}|j|}|j|} |j |} |j |} | j} | j |||j|j} | j |||j|j} | j |||j|j} t| |jd} t| |jd} |t|tsJtd| | | f\} } } t| | | |} | j!ddj#} |j%| j'| } || zS)Nrrc&|jddSrrrs rArz(CrossAttention.forward..s1;;q!#4rCrr)r3rrrrrrrrrrrrrrrrrrr)rrrrRrr>rBslen_kvx_normrrrrrs rArzCrossAttention.forwardsg''Wa 7A''*  $ $R ( WWV_ WWR[ WWR[xx WWS'4<< ? WWS'4??DMM B WWS'4??DMM B r4//Q 7 r4//Q 7|z$ :::4r2rlC B$RRDA!!!Q'224 566zrCbase_stdrc|xs|jdz|z }tjj|jj d|d|zd|ztjj|j j d|d|zd|ztjj|jj d|d|zd|ztjj|jj d|d|zd|z|jj|jjyr) r/r rrrrrrrrrr)rrrrs rAr.zCrossAttention.init_weightss7488-7  GGNN3h#g    GGNN3h#g    GGNN3h#g    GGNN3h#g   //1 002rCrE)r)rlrmrnrrOrzrr6rrr rrrr.rrs@rArorous ( ( (  (  (  ( \15  <<  LL uY^,-   D#3U#3E#3rCroceZdZdeffd Z d dej deej deej deee ej e fdee e ej ej e ff fd Zfd ZxZS) rHrcHt|||j|_|j|_|j|_d|_|jL|j|j k7r2tj|j|j d|_yyyr) rrrNrLrrTr/r rr&s rArzGlobalTransformer.__init__s || kk !//*.'    )d.@.@DHH.L.0ii""/D +/M )rCrWrrarRrtc|j\}}|}||n-t||j|j||j}|j -|jd|j k7r|j |}tj||j|j}t |-||||j}||fS)z Similar to BaseTransformer.forward, but with an additional embeds argument and projection to the token space. )rWrLrrwr() r3rerYrZrLrTr/rvrNryrr) rrWrrarRrtr=rdr9rs rArzGlobalTransformer.forwards\\ F  ###{{    * * 61772;$((;R//2A IIa4<<$-- @ GOAwTT^^O T%xrCct||jdz}|j?tj j |jjd|d|zd|zyyr)rr.rrTr rrr)rrrs rAr.zGlobalTransformer.init_weights si   T*  * * 6 GG ! !//66s(c' "  7rCrb)rlrmrnr#rr6rrr rrrrrOrr.rrs@rArHrHs 0 "+/)->BHL " "%,,'"& " uY c9:; " U5<<s#BCDE "H  rCrHc(eZdZeZeZy) EmbeddingTypeN)rlrmrnrHASH_TOKNGRAMrFrCrArr,svH FErCrembedding_typelocal_encoder_dimrkcR|tjk(r |jy|tjk(r |jyg}|tjk(rU|}|j }t |jD].}|D]'}|jtj||)0ni|tjk(rVt|j}|}d} |jD]*} |jtj| | z|,tj|S)Nr)rrrkrencoder_ngram_to_size_strencoder_hash_byte_group_vocabr>$encoder_hash_byte_group_nb_functionsr@r rRrvaluesr<) rrrrkrUr?rrBencoder_ngram_to_sizer ngram_vocab_sizes rAinit_embeddingsr1s -000  - - 5,,,1O1O1WJ///#(,(J(J%t@@AA1!!LL52B =.. . 3D4R4R S# 5 < < >    bll+;f+DgN O!? == $$rCr7encoder_hash_tok_embeddingrrc|y|j|}d}t|D]-}|D]&} t|| ||} ||} || | z}|dz }(/|t|k(sJ|S)a Compute embeddings using hash token embeddings. Args: local_encoder_tokens: Input tokens tensor local_encoder: Encoder object with tok_embeddings method encoder_hash_tok_embedding: ModuleList of hash token embeddings encoder_hash_byte_group_nb_functions: Number of hash functions encoder_hash_byte_group_size: List of byte group sizes encoder_hash_byte_group_vocab: Vocabulary size for hash embeddings Returns: torch.Tensor: Combined embeddings Nr)rrr)rSr>rrK) r7 local_encoderrrrkrlocal_encoder_embedsrfunc_nbbyte_group_sizehash_idshash_tok_embeddings rAcompute_hash_embeddingsrWs,")(778LM A=>;O/$$6 H " #7:LX:V#V FA <? ./ // / rCceZdZdZdeffd ZdZdZ d dejde ejde ejfd Z d Z xZ S) ByteLatentTransformera The ByteLatentTransformer (BLT) is a byte-level language model architecture that processes byte sequences by dynamically segmenting them into patches. It uses a combination of local encoders, global transformers, and local decoders to efficiently encode and decode byte sequences, leveraging patch-based processing for improved performance and inference efficiency. rc  t||j|_|j|_|j|_t t ttf\|_ |_ |_ |_ |j|_ |j|_|j|_|j |_t#|j$|_|j&|_|j(|_|j*|_|j,|_|j.|_|j0|_|j2|_|j4|_|j6|_|j8|_t;d)id|j<d|j>d|j@dtC|dtE|d|j(ddd |j(r |j,ndd |jFd |jHd |jJd |jLd|jN|jPzd|jRd|jd|jTd|jVd|jXd|jZd|j d|j$d|j\d|j^ddd|j`d|jbd|jd|jdd|jd |j4d!|jfd"|jhd#|jjd$|j}tm||_7t;d)id|jpd|jrd|jtdtw|d|jxddd|j*d dd |j*r |j,ndd |jHd |jJd |jLd|jN|jPzd|jRd|jd|jTd|jVd|jXd|jZd|j d|j$d|j\d|j^ddd|j`d|jbd|jd|jdd|jd |j4d!|jfd"|jhd#|jjd$|j}t{||_>t||_@t|tj|jnj|j4%|_Dt|tj|jnjd%|_Fd|_F|jrtj|_F|jJt|j|_M|jnj}|jjD]8}|jjtj|tz|:|jNd&kDsJd'|jr\tt|j|j|j|j|j|j(|_Xyy)*Nr/r?rrrrrAFrrrrrNrQr#rrXrOrrr9r:rrYrZrr r r~rrrkrmrrnrL)rrkrz!vocab_size must be greater than 0)rr~patching_thresholdpatching_threshold_addrrrF)YrrrPrr~rrrrr5bos_idrrLrrr/r9rgr:rrrArcross_attn_window_encodercross_attn_window_decodercross_attn_use_flex_attentionrkrrr'rn_layers_local_encodern_heads_local_encoderrrrrmax_encoder_seq_lengthrNrQpm_sizer#r@rOrrrrYr r rrmrrnrgrrn_layers_local_decodern_heads_local_decoderrrrJglobal_transformerr local_decoderrrrrrencoder_ngram_embeddingencoder_enable_byte_ngramsr r<ngram_vocab_sizesrrrrr@rRr patch_in_forwardrr}rrrpatcher)rrlocal_encoder_argslocal_decoder_args ngram_emb_dimrrs rArzByteLatentTransformer.__init__s !--//!//     > : T[$+t{ (,'C'C$"&"9"988!//,T-A-AB//#'"9"9"&"9"9 --)-)G)G&)-)G)G&-1-O-O*-1,M,M)-1-O-O*  5 5 1 ,% &&% 00% .. % 4D9 % 4D9 %  $66%  %% /3.E.E**4% (,'F'F% ]]% 22% LL% 5% ]]!% "#% $ ::%% &]]'% ()% *,0+N+N+% ,,,-% .!00/% 01% 2nn3% 405% 6((7% 8 $669% :,,;% <+/*L*L=% >%)$@$@?% @*.)J)JA% B+/*L*LC% D+/*L*LE% F#44G% H;;I% L**<=+$ &&$ 00$ ..$ 4D9 $ // $ % $  $66$ (-$ /3.E.E**4$ ]]$ 22$ LL$ 5$ ]]$ !$ " ::#$ $]]%$ &'$ (,0+N+N)$ *,,+$ ,!00-$ ./$ 0nn1$ 203$ 4((5$ 6 $667$ 8,,9$ :+/*L*L;$ <%)$@$@=$ >*.)J)J?$ @+/*L*LA$ B+/*L*LC$ D#44E$ F;;G$ L#$>$E$E$G ,,33LL!1F!:MJ%H "G$GG"  "#"&"4"4'+'>'>+/+F+F!%!2!2%)%:%:  DL !rCctdrWrXrYs rArZz!ByteLatentTransformer.push_to_hub9r[rCc|jSrErDrs rAr2z(ByteLatentTransformer.get_output_seq_len>rErCrWr ngram_idsc F t|tjs|Jdt||j\}}t |j dk7rdn|jdz }t|d||j|j\}}} |Jt|ddJd|jj|d |jj \}} n|dkDr|dddfxx|z cc<tj|dk\sJt||jd } tj | dztj |dk7j#d ksHJtj | dzd tj |dk7j#d d} |j$r0t'| ||d |j(|j*|j,} t/||j0|j2|j4|j6|j8} |j:|Jd| |j0j=|} t?|t?|j:k(s:Jd|jddt?|j:d|jtA|jdD]h}|j:|}|||}| j|jk(s.Jd| jd|jd|j| |z} j|j1|| d| |jd| \\}}}|jC||jdd }|jE|jd|jdjG|j}tjH||jJk(\}}| ||f}|jJ|||f<|jM||\}}|dd|||zddf}tO||| jd }tj |dz|jdks/Jtj |dzd |jd|jd|jdk(s&J|jdd|jd|jPsdtjR|d|jUd jWd d |jd }d}| j|jdd k(s2Jt'|||d|j(|jX|j,}|j[||| |\}}|S)Nz-ngram_ids must be a tensor or None, but was: rrF)rWr4r rr5rz0Patcher not defined and no patch_lengths passed.T)rrrrr<)r"rr!r)r7rrrrkrzngram_ids must be providedzngram_ids.shape[0]=z% versus len(encoder_ngram_embedding)=z, ngram_ids.shape=zShape mismatch: z vs )rWrarsr(r r<)rarWr)rarsrWr().rr6rtyper3rOr~rr:r5rPrrrrrrrrr3rrrrrrrrkrrrSrKr>rrrrNrLrr rAgatherr4r5rr)rrWrrr=r0r r7rBr8 tok_scoresr<cross_attn_mask_encrrngram_embedding ngram_embeds h_encoderr cache_encoderr9 global_tokensrowscols eos_patch_ids dec_embedsr cross_attn_mask_decrs rArzByteLatentTransformer.forwardAs3 y%,, /93D M :4 ?:K L M D A$,,2Q!8KL8E(-;; 9 5a!5  i.: BA B:(, (:(:$#',,00);) %M: zad#v-#yy'1,,,+ /55b9 yy#a'599 a  $ $ $ ,,   Yii "Q& 's599mq6H5M5MRT5M5U+V*W X Y #  " ""1#'!..55==#  7!5,,'+'F'F151Z1Z)-)J)J*.*L*L    ' ' 3( F*F F(#+'+'9'9'H'H(($y>S,,& e$Y__Q%7$88]^abfb~b~^^AAST]TcTcSde e9??1-."&">">q"A.y|< (..,2D2DD~%&:&@&@%AlFXFXEYYklul{l{k|}~D';l'J$ //3.@.@''*%++A. /A/ +Gm LL]003R 8 1771:qwwqz:@@M [[!5!DE d!$*- -1[[ dM)*&& ' 1 q&6A:"5q89 ; 6#7#=#=b#A  II' (1 , : @ii)*Q. /s1771:, ? @ :  # #A &**:*:1*= = D%%a( )j.>.>q.A-B C D =&& 1'11"5<rsY44 0$ZZ"  $ #  C **    rzz~~8%@AQF'%--7<7ELL7#72"&!%  $J $J  LL4   $J :C   c    +0 P P P P%) PB"U\\"ell"S": 2  2  2 2|| 2  5<< %& 2"7,ehhoo7,t  ll  %,, w  w tC "))C L8)ryy8)v cgg 0;bii!80;fa 6" 6H . 2  aPsD)"1 %! m%m%b-B( 3: $sCx. 9. /S/ RW ||!$8;KN@=B#T ,^_< LL_g1TX>Xvu3RYYu3p==~D*. #%!#%#%#' #%L) ,,) !# ) +. ) #' ) $' )  \\) XdII 6" 6H! . :$  drC