'hL ddlZddlmZddlmZddlmZ ddlZddlmZdZ dZ dd d d d d ddZ dZ ejeZGddeZy#e $rdZ Y5wxYw)N)copy)Path) Tokenizer)load_tiktoken_bpeTFzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+)<|begin_of_text|><|end_of_text|>z<|fim_prefix|>z<|fim_middle|>z<|fim_end_fill|>z <|fim_pad|>z<|fim_suffix|>ic eZdZdeddfdZdefdZdededefd Zd e efd Z dd ed e edzde e ee effd Z y)TikTokenTokenizer model_pathreturnNct|}tt}tt dt|j z }|D] }||d|d< |D]}||xxt |z cc<tjjt|jt|||_ |jjd|_|jjd|_|jj"|_t&j)d|j$d|jd |j y) Nz<|reserved_special_token_z|>)namepat_strmergeable_ranksspecial_tokensr rz#words: z - BOS ID: z - EOS ID: )rrDEFAULT_TIKTOKEN_SPECIAL_TOKENSsetrangevalueslentiktokencoreEncodingrstemDEFAULT_TIKTOKEN_PATTERN tkt_modelencode_single_tokenbos_ideos_idn_vocabn_wordsloggerinfo)selfrrall_special_tokens_with_ids missing_idsidrs D/fsx/ita_zaporozhets/blt/bytelatent/tokenizers/tiktoken_tokenizer.py__init__zTikTokenTokenizer.__init__s+J7&*+J&K#%*o,G,N,N,P(QQ BNP '*CB4r(J K/D ' -_1E E -0"//j!&&,+6 0  >>==>QR >>==>OP  NN22  t||nK }K } U c|jSN)r()r+s r/get_vocab_sizez TikTokenTokenizer.get_vocab_size8s ||r1sadd_bosadd_eosc.t|tsJg}tdt|tD]}|j |||tz|j g|zt|jj|gz|jg|zzS)Nr)start) isinstancestrrrTIKTOKEN_MAX_ENCODE_CHARSappendr%sumr#encode_ordinary_batchr&)r+r5r6r7subsis r/encodezTikTokenTokenizer.encode;s!S!!!q#a&";z6TikTokenTokenizer.get_token_offsets..WsEu!DA4D4DAus r)r#decode_tokens_bytesrBr=maxr>zip) r+rFrC token_bytestext_lenoffsetstokenr5esubstrss r/get_token_offsetsz#TikTokenTokenizer.get_token_offsetsJs  ..<ris>/LV#$   8 $; ; #Ls AAA