%1g JddlmZddlmZddlmZeGddZddlmZmZddlmZedd ed ed eee effd Z d ed e fd Z ded e fdZedkrdZedZedksJe e\ZZedeedeeeZedeeddZeZe ee\ZZedeedejejdSdS)) dataclass)Tuplec<eZdZUdZdZeed<dZeed<dZdZ dS) PartialUTF8a A data class representing the state of a partially decoded UTF-8 sequence. Attributes: - value (int): The current accumulated value of the partially decoded Unicode code point. This attribute stores the bits that have been decoded so far. For a fully decoded character or before any partial decoding has started, this would typically be `0`. - n_remain (int): The number of bytes remaining to complete the current UTF-8 encoded character. A value of `-1` indicates that there is no ongoing partial decoding, i.e., either decoding has not started, or the last character was fully decoded. This class is used to handle situations where UTF-8 encoded data may end in the middle of a character sequence, allowing for the decoding process to be resumed when more data becomes available. rvaluen_remainc8t|j|jfSN)hashrr )selfs W/mnt/d/dev/semgus/TinyLlama_v1.1_GAD/../transformers-GAD/transformers_gad/utf8_utils.py__hash__zPartialUTF8.__hash__sTZ/000czt|tstS|j|jko|j|jkSr ) isinstancerNotImplementedrr )r others r__eq__zPartialUTF8.__eq__!s7%-- "! !zU[(LT]en-LLrN) __name__ __module__ __qualname____doc__rint__annotations__r rrrrrrsk E3NNN  c111MMMMMrr)Listr) lru_cachei-)maxsizesrc partial_startreturnc<gd}d}g}|j}|j}|t|krZ|dkrT||}|dz dkrdg}|tddfS|dz|dzz}|dz }|dz}|t|kr|dkT|jdkr|dkr|||t|kr||}|dz } || dz }|dkrdg}|tddfSdd |z zdz } || z}|dz }|t|kr<|dkr6||}|dz|dzz}|dz }|dz}|t|kr|dk6|dkr|||t|k|dkrd}d}|t||fS) N)r$r$r$r$r$r$r$rrrrr%rr%r?r$r')rr lenrappend) r r!lookuppos code_pointsrr next_byte first_bytehighbitsmasks r decode_utf8r4+s > = =F CK  E%H C..X\\H Nq #K Ar 2 22 2! D 01 qA  C..X\\!!h!mm5!!! C..X ?(#a' a<<#K Ar 2 22 2a(l#q(T! qCHHnnACIaZI$45E 1HC MH CHHnnA q==   u % % %5 C..>1}}  E844 44rc|d}|dz }gd}||}|d|}|d}t|}||d}||fS)Nrr')r$r$r$r$r$r$r$r$r$r$r$r$r%r%r&r'utf-8)decodeord) r r1r2r-char_lenutf8_char_byteschar code_pointremaining_bytess rdecode_utf8_leading_charr>pstQJQH = = =FhH)8)nO  ! !' * *DTJ())nO  &&r utf8_bytesc`g}|r)t|\}}|||)|Sr )r>r,)r?r/r<s rdecode_utf8_stringrAsGK '!9*!E!E J:&&& ' r__main__u€Hellor6s€Helloz Code Point: zRemaining Bytes: z Code Points: z2--------------------------------------------------z Code Points:zRemaining UTF-8 State:N) dataclassesrtypingrrr functoolsrbytesrr4tupler>listrAr my_stringencoder?r<r=printr/r! partial_utf8rr rrrrMsh!!!!!!!!!!!! MMMMMMM M> 7A5 A5*A5 49k !"A5A5A5A5H'%'E''''*5T zI!!'**J - - - - -#;"::"F"FJ E % % %&&& E /o / /000%$Z00K E% %% E(OOO&JKMMM + J F FK E.+&&& E "L$6 8MNNNNN=r