4g*6ddlZddlZddlZddlZddlZddlZddlZddlZddl m Z ddl m Z GddZ GddZdS)N) load_dataset) defaultdictcBeZdZ ddZdZdZd Zd Zdd Zdd Z dS)DatasetWrapperlmsys/lmsys-chat-1mTjson/conversations_index.json2cN||_||_dd|ji|_||_||_||_d|j}||}|d|dD|_|jrtd|jD]n} t| | | } t| jd} t| dd d | d o t|d d 5} tj| |_dddn #1swxYwYn#t tjf$rtd|dt%jt$j|d||t|d d 5} tj| |_dddn #1swxYwYYnwxYw t/jd|_tdt5|jd|j|jd|_n>#t t:f$r*t/j|_tdYnwxYw|jjsQ tA|jj!d|_"dS#tF$r} td| Yd} ~ dSd} ~ wwxYwd|_"dS)N AuthorizationzBearer z7https://datasets-server.huggingface.co/parquet?dataset=cg|] }|d S)url).0files YC:\Users\david\Documents\git\chatbot-arena-dataset-wrapper\./src\lmsys_dataset_wrapper.py z+DatasetWrapper.__init__..s Z Z Ze Z Z Z parquet_filesz Parquet URLs:zContent-Length/: z bytesrutf-8encodingzEConversations index file not found or invalid. Creating a new one at .T)exist_ok)output_index_filezpkl/cached_chats.pklzLoaded z cached chats)dropzNo cached chats foundrNo conversations available: )$hf_token dataset_nameheaderstimeout cache_sizeverbose _safe_getjson parquet_urlsprint _safe_headintsplitopenloadconversations_indexFileNotFoundErrorJSONDecodeErrorosmakedirspathdirnamecreate_conversations_indexpd read_pickle active_dflensample reset_index ValueError DataFrameempty Conversationilocactive_conversation Exception)selfr"r#r'r1r&request_timeoutparquet_list_urlresponser head_response file_sizefes r__init__zDatasetWrapper.__init__ s  (')B4=)B)BC & $ hUYUfhh>>"233   Z Z9Y Z Z ZD | F'(((,FFC#JJJ$(OOC$8$8M #M$9:J$K L LISYYs^^B/DD9DDDEEEE 8)3AAA 8Q+/9Q<<( 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8!4#78 8 8 8 pZmppp q q q K(;<Q + R R R)3AAA 8Q+/9Q<<( 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8  8 +^,BCCDN >C//>>> ? ? ?!^224?CCOOUYOZZDNN!:. + + +\^^DN ) * * * * * +~# , :+78KA8N+O+O((( : : :8Q88999999999 :(,D $ $ $s EE8 EEE E EBHG:. H:G> >HG> HH A6J8J>=J>$K44 L>LLc(|jdkrtddS tj||j|j}|jdkrt d|d|j|S#tjj$rtd|dYdSwxYw) Nrz*Timeout is set to 0. Skipping GET request.)r$r%zFailed to retrieve z. Status code: Timeout occurred for GET . Skipping.) r%r+requestsgetr$ status_coder? exceptionsTimeoutrFrrIs rr(zDatasetWrapper._safe_get?s <1   > ? ? ?4 #<T\4<XXX'3..$%e3%e%exOc%e%efff&.   B#BBBCCCtt sAA&&'BBc|jdkrtddS tj|d|j|j}|S#tjj$rtd|dYdSwxYw)Nrz+Timeout is set to 0. Skipping HEAD request.T)allow_redirectsr$r%rQrR)r%r+rSheadr$rVrWrXs rr,zDatasetWrapper._safe_headMs <1   ? @ @ @4 }S$ ^b^jkkkHO"*    >c>>> ? ? ?44 s#A'A-,A-c2tj|j}td|||}|td|d|jSt jdd5}||j |j }dddn #1swxYwY tj d|d| }||_ t|jd |_n)#t"$r}td |Yd}~nd}~wwxYwt$j|rt%j|n9#t$j|rt%j|wwxYw|S) NzSampling conversations from rQz. Skipping sample extraction..parquetFsuffixdeletezSELECT * FROM read_parquet('z') USING SAMPLE rr!)randomchoicer*r+r(r;tempfileNamedTemporaryFilewritecontentnameduckdbquerydfrBrCrDrEr4r6existsunlink)rF n_samplesrrtmptmp_path query_resultrMs rextract_sample_conversationsz+DatasetWrapper.extract_sample_conversationsXsmD-.. 2S22333 NN3   9 PcPPP Q Q Q> !  ( 5 I I I S IIai xH                $!<(lx(l(laj(l(lmmpprrL)DN :+7 8I!8L+M+M(( : : :8Q8899999999 :w~~h'' $ (###w~~h'' $ (#### $sH3"B!!B%(B%-3E!DE D' D"E"D''E6Fc d|jD}tt}|D]1}||jvr&||j||2t j}|D]\}}||vrtd|d||}td|dt|d | |} | dkrtd|d|d |tj d d 5} | | j| j} dddn #1swxYwY d |} d| d| d} t#j| }t(j| rt)j| n9#t(j| rt)j| wwxYw|js:tdt|d|t j||gd}#t4$r }td|d|Yd}~d}~wwxYw||_ t9|jjd|_n)#t4$r}td|Yd}~nd}~wwxYw|S)NcFi|]}|dd|S)rr)r.)rrs r z8DatasetWrapper.extract_conversations..vs)MMMC #r*CMMMrzFile z! not found in URL list, skipping.Querying file: z for z conversationsrQ. Skipping file rr]Fr^z', 'z5 SELECT * FROM read_parquet('z7') WHERE conversation_id IN ('z') Found z conversations in T ignore_indexzError processing rrr!)r*rlistr1appendr9r@itemsr+r<r(rcrdrerfrgjoinrhrirjr4r6rkrlrAconcatrEr;rBrCrD)rFconversation_ids file_url_mapfile_to_conversationsconvid result_df file_nameconv_idsfile_urlrrnro conv_id_list query_strrjrMs rextract_conversationsz$DatasetWrapper.extract_conversationsss~NM4;LMMM !,D 1 1& W WF111%d&>v&FGNNvVVVLNN #8#>#>#@#@! <! < Ix ,,JiJJJKKK#I.H QIQQCMMQQQ R R R <NN8,,99\h\\PY\\\]]]0 5QQQ(UXIIai((("xH((((((((((((((( ,#);;x#8#8L!5=!!4@!!!I i003355Bw~~h//, (+++w~~h//, (++++,xNI3r77IIiIIJJJ " 9b/ M M MI < < <:)::q::;;;;;;;; <# 6'3DN4G4J'K'KD $ $ 6 6 6 444 5 5 5 5 5 5 5 5 6st 1I=I"E5 IE IE I AG4I6G;;AI I+ I&&I+6$J K%J<<Kc ||dkr|d}|j}tj|t j}|D]}td|||}|dkrtd|d|dGtj dd 5}| |j |j }dddn #1swxYwY d |d |d } tj| } t"j|rt#j|n9#t"j|rt#j|wwxYwtd t+| d|ddt+| dkrt j|| gd}t+||krnt+|dkrEtdddddddddgddddd} t j| g}t|||_ t3|jjd|_n)#t8$r} td| Yd} ~ nd} ~ wwxYw|S)Nr rurQrvrr]Fr^z1 SELECT * FROM read_parquet('zT') WHERE contains(lower(cast(conversation as VARCHAR)), lower('z')) rwz result(s) in rrrTrxz,No results found. Returning empty DataFrame.zNo result found-user)rfrole assistantz[{'-': '-', '-': '-'}])conversation_idmodel conversationturnlanguageopenai_moderationredactedr!)rqr*copyrashuffler9r@r+r(rcrdrerfrgrhrirjr4r6rkrlr<r.r~r;rBrCrDrE) rF filter_str min_resultsrurlsrrrnrorrjplaceholder_rowrMs rliteral_text_searchz"DatasetWrapper.literal_text_searchsD   99"==I %%''tLNN   C )C)) * * *s##ADyyM#MMsMMMNNN,JuMMM $QT !)$$$8 $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ (19Q[ \),,//117>>(++(Ih'''7>>(++(Ih''''( F3r77FF#))C..2DFF G G G2ww{{Iy"oDIII 9~~,,- y>>Q   @ A A A2C(+,/ @ @,/ E E0!(++.4L+. 1 1O o%677I )   " 6'3DN4G4J'K'KD $ $ 6 6 6 444 5 5 5 5 5 5 5 5 6s6;"C))C- 0C- 5/E6F.$J J9J44J9ci}|jD]}|dd}td| tj||j}t jdd5}||j |j }dddn #1swxYwY d |d }tj | } tj|rtj|n9#tj|rtj|wwxYw| D]\} } ||| d <\#t&$r } td |d | Yd} ~ d} ~ wwxYwt)|dd5} t+j|| ddddn #1swxYwY|S)z Builds an index of conversation IDs from a list of Parquet file URLs. Stores the index as a JSON mapping conversation IDs to their respective file names. rrzIndexing file: )r$r]Fr^Nz*SELECT conversation_id FROM read_parquet('z')rzError indexing rwrr)indent)r*r.r+rSrTr$rcrdrerfrgrhrito_dfr4r6rkrliterrowsrEr/r)dump)rFrindexrrrrnrorirj_rowrMrLs rr8z)DatasetWrapper.create_conversations_indexs} $ : :C #r*I /I// 0 0 0 :Ldl;;;0 5QQQ(UXIIai((("xH(((((((((((((((,UUUUEe,,2244Bw~~h//, (+++w~~h//, (++++,!kkmm>>FAs4=E#/011> : : :8 88Q8899999999 :#S7 ; ; ; *q IeQq ) ) ) ) * * * * * * * * * * * * * * *! se1E&,"B E&B E&!B "E&&,D4E&6D<<(E&& F0F  F%G  G G N)rTrr r )r)r) __name__ __module__ __qualname__rNr(r,rqrrr8rrrrr sMQeg0,0,0,0,d      6666p7777r$!$!$!$!$!$!rrc"eZdZdZdZddZdS)rBc6t|tjtfrii|_t|tjr|n|D]\}}|dkr||_||j|<dS||_i|_dS)a Initialize a conversation object either from conversation data directly or from a DataFrame row. Parameters: - data: Can be either a list of conversation messages or a pandas Series/dict containing conversation data rN) isinstancer9Seriesdictconversation_metadatar|conversation_data)rFdatakeyvalues rrNzConversation.__init__ s dRY- . . ,)+D &/9$ /J/J\tzz|||PTPZPZP\P\ < < U.((-2D**6;D.s33  < <&*D ")+D & & &rcTd}|jD]}|ddkr|dz }||d<|jS)z Adds a 'turn' key to each dictionary in the conversation, identifying the turn (pair of user and assistant messages). Returns: - list: The updated conversation with 'turn' keys added. rrrrr)r)rF turn_countermessages r add_turnszConversation.add_turns!sH - + +Gv&((! *GFOO%%rPc tj||jD]o}|ddkr|}n|ddkr|}n!dfd|dD}t |d|dpd S) a5 Prints the conversation with specified prefixes and wrapped text. Parameters: - user_prefix (str): Prefix to prepend to user messages. - assistant_prefix (str): Prefix to prepend to assistant messages. - width (int): Maximum characters per line for wrapping. )widthrrr c3BK|]}|VdS)N)fill)rlinewrappers r z,Conversation.pretty_print..Ds@(('+ T""((((((rrf N)textwrap TextWrapperrr} splitlinesr+)rF user_prefixassistant_prefixrrprefixwrapped_contentrs @r pretty_printzConversation.pretty_print0s&U333- 3 3Gv&(($K//)#ii((((/6y/A/L/L/N/N(((O V11o111 2 2 2 2 3 3rN)r)rrrrNrrrrrrBrB sF,,,* & & &333333rrB)r4pandasr9rrarhrSr)rcdatasetsr collectionsrrrBrrrrs   !!!!!!######|!|!|!|!|!|!|!|!~<3<3<3<3<3<3<3<3<3<3r