%1gZddlmZmZddlmZmZddlmZddlZej e Z ddZ GddZ Gd d e ZGd d e ZGd de ZGddeZGdde ZGdde ZGdde ZGddZdS))DictList)get_tokenizer_model_type ints2bytes) AutoTokenizerNFctd|jjtdt ||sXd|jjvsd|jjvs`d|jjvs@d|jjvs d|jjvrt |Sd|jjvrt|Sd |jjvrt|Sd |jjvrt|Std |jjd|jjvrt|Std |jj) Nztokenizer type: ztokenizer model type: gpt2bloompretrainedtokenizercodegengptneoxt5llamaxglmzUnknown tokenizer type: zUnicode mapping for ) logdebug __class____name__rlower BBPEMapping BPEMappingLlamaBPEMappingUniGramMapping ValueErrorUnicodeBBPEMappingNotImplementedError) tokenizerunicodes T/mnt/d/dev/semgus/TinyLlama_v1.1_GAD/../transformers-GAD/transformers_gad/mapping.py get_mappingr sII?!4!=??@@@IIL'? 'J'JLLMMM  i)288:: : :)-6<<>>>>$ (;(D(J(J(L(LLLI/8>>@@@@I/8>>@@@@y)) ) Y(17799 9 9i(( (  +4::<< < <"9-- - y*399;; ; ;!),, ,V 8K8TVVWW W Y(17799 9 9%i00 0%Ey':'CEE c<eZdZdZdZdedefdZd dedefdZ dS) Mappingc\|j|_|j|_||_|j|_dSN) eos_token_id bos_token_idrall_special_idsspecialselfrs r__init__zMapping.__init__(s-%2%2" 0 r!cNt|jSr%)lenr get_vocab)r+s r__len__zMapping.__len__.s4>++--...r!token_idreturnc||jvrdSt|dr|}|j|}|S)Nitem)r)hasattrr5rconvert_ids_to_tokens)r+r1 raw_tokens r_mapz Mapping._map1sN t| # #2 8V $ $ '}}HN88BB r!Fc||}|r td|d|t|dS)N token_id: z , token: utf-8)r9rrbytes)r+r1verbosetokens rmapz Mapping.map;sN (##  ? II=8==e== > > >UG$$$r!NF) r __module__ __qualname__r,r0intstrr9r=r@r!rr#r#'s{111 ///SS%%C%5%%%%%%r!r#c2eZdZfdZdedeffd ZxZS)rc:tj|i|dSr%superr,r+argskwargsrs rr,zBBPEMapping.__init__Cs%$)&)))))r!r1r2ct|}|dr|dd}|S)NuĠ )rJr9 startswithreplace)r+r1r8rs rr9zBBPEMapping._mapFsIGGLL**    % % 5!))$44Ir!rrBrCr,rDrEr9 __classcell__rs@rrrBsd*****SSr!rc\eZdZfdZddedeffd ZddedefdZe dZ xZ S) rctj|i|t|j|_dSr%)rJr,rget_intermediate_encodingrintermediate_encodingrKs rr,zUnicodeBBPEMapping.__init__NsB$)&)))%7%Q%Q N& & """r!Fr1r2cJt|}|Sr%)rJr9)r+r1r>r8rs rr9zUnicodeBBPEMapping._mapTs GGLL** r!c|||}|r td|d||j|S)Nr;z , raw_token: )r9rrrX token2bytes)r+r1r>r8s rr@zUnicodeBBPEMapping.mapZsXIIh00  G IIE8EE)EE F F F)55i@@@r!cdd|jjvrt|SdS)Nr )rrr ByteEncoding)rs rrWz,UnicodeBBPEMapping.get_intermediate_encoding`s2 Y(17799 9 9 ** *4r!rA) rrBrCr,rDrEr9r=r@ staticmethodrWrSrTs@rrrMs     SC AACA5AAAA \r!rc2eZdZfdZdedeffd ZxZS)rcXt|d|_dSr%)rJr, last_token_idr+rrs rr,zBPEMapping.__init__is) ###!r!r1r2ct|}d}|j|j|jkrd}||_|dr"|dd}|r |dd}|S)NFTu▁rO)rJr9rar'rPrQ)r+r1r8at_bosrs rr9zBPEMapping._mapmsGGLL**    )d.@DDU.U.UF%    & & *!))%55I *%abbM r!rRrTs@rrrhsd"""""SSr!rc2eZdZfdZdedeffd ZxZS)rcJt|dSr%rIrbs rr,zLlamaBPEMapping.__init__! #####r!r1r2ct|}|dr'|dd}tt |d}|S)Nz<0x)rJr9rPchrrD)r+r1r8 hex_valuers rr9zLlamaBPEMapping._mapsZGGLL**     & & 0!!B$IC 2..//Ir!rRrTs@rrr~sd$$$$$SSr!rc.eZdZfdZdedefdZxZS)WordPieceMappingcJt|dSr%rIrbs rr,zWordPieceMapping.__init__rhr!r1r2c||jvrtSt|j|gddSNF)clean_up_tokenization_spacesr<r)r=rdecoder+r1s rr@zWordPieceMapping.mapG t| # #77N N ! !8*5 ! Q Q    r!rrBrCr,rDr=r@rSrTs@rrprpZ$$$$$ C E        r!rpc.eZdZfdZdedefdZxZS)rcJt|dSr%rIrbs rr,zUniGramMapping.__init__rhr!r1r2c||jvrtSt|j|gddSrsrurws rr@zUniGramMapping.maprxr!ryrTs@rrrrzr!rceZdZfdZxZS)XGLMUniGramMappingcpt||j|_d|_dSr%)rJr,r&r'rbs rr,zXGLMUniGramMapping.__init__s4 ###%2 r!)rrBrCr,rSrTs@rrrs8!!!!!!!!!r!rc`eZdZdZdedefdZdeedefdZdedefdZ d e defd Z d S) r]c(|jrtj|jd}||_|j|_|j|_d|j D|_ d|j D|_ dS)NF)use_fastc4i|]\}}t||SrF)ord).0cbs r z)ByteEncoding.__init__..s$(V(V(Vtq!Q(V(V(Vr!ci|]\}}|| SrFrF)rkvs rrz)ByteEncoding.__init__..s(P(P(P$!QA(P(P(Pr!) is_fastrfrom_pretrained name_or_pathr byte_encoder byte2char byte_decoder char2byteitemscdp2bytebyte2cdpr*s rr,zByteEncoding.__init__s   %5&I#)2)?)2)?(V(Vt~?S?S?U?U(V(V(V (P(P$-:M:M:O:O(P(P(P r!byter2cpd|cxkrdksnJd|dt|j|S)Nrzbyte: z is not in the range [0, 256))rr)r+rs rr@zByteEncoding.mapsDD3 L L L L4>$'(((r! token_idscj|}fd|D}fd|D}tt|gS)Nc4g|]}|jjvrdn|S)r4)rr(rr?r+s r z0ByteEncoding.token_ids2bytes..s9   IN%4>999BBu   r!c:g|]}|SrF)r[rs rrz0ByteEncoding.token_ids2bytes..s'!N!N!Ne$"2"25"9"9!N!N!Nr!)rr7rsum)r+rtokensr=s` rtoken_ids2byteszByteEncoding.token_ids2bytessr N@@KK    RX   "O!N!N!Nv!N!N!N#eR..)))r!r1c`|j|}||Sr%)rr7r[)r+r1r?s rtoken_id2byteszByteEncoding.token_id2bytess+^99(CC&&&r!r?c>fd|D}t|S)Nc*g|]}j|SrF)r)rrr+s rrz,ByteEncoding.token2bytes..s AAAaq 1AAAr!)r=)r+r? bytes_seqs` rr[zByteEncoding.token2bytess*AAAA5AAA Yr!N) rrBrCr,rDr@rr=rrrEr[rFr!rr]r]s Q Q Q)))))) *c *u * * * *'s'u''''        r!r]rA)typingrrtransformers_gad.utilsrr transformersrlogging getLoggerrrr r#rrrrrprrr]rFr!rrsGGGGGGGG&&&&&&g!!:%%%%%%%%6'6,     j         w          W    !!!!!!!!$ $ $ $ $ $ $ $ $ $ r!