"""Streamlit page for browsing Arena-Lite results: Elo ratings, tournament
brackets, per-match details, and LLM Judge bias diagnostics."""

import pandas as pd
import streamlit as st

import analysis_utils as au
from analysis_utils import number_breakdown_from_df
from app import load_and_cache_data

# from app import VA_ROOT
from query_comp import QueryWrapper, get_base_url
from varco_arena.varco_arena_core.prompts import load_prompt
from view_utils import (
    default_page_setting,
    escape_markdown,
    set_nav_bar,
    show_linebreak_in_md,
)

# Shared Plotly layout options applied to every chart on this page
DEFAULT_LAYOUT_DICT = {
    "title": {"font": {"size": 20, "family": "Gothic A1"}},
    "font": {"size": 16, "family": "Gothic A1"},
    "xaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "yaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "legend": {"font": {"size": 12, "family": "Gothic A1"}},
}


def navigate(t, source, key, val):
    # Step the selection stored at st.session_state[key] by `val` positions
    # within the option list `t`, then rerun the app to apply it.
    if source is None:
        return
    target_index = t.index(source) + val
    if 0 <= target_index < len(t):
        st.session_state[key] = t[target_index]
        st.rerun()


def main():
    sidebar_placeholder = default_page_setting(layout="wide")
    set_nav_bar(
        False,
        sidebar_placeholder=sidebar_placeholder,
        toggle_hashstr="see_results_init",
    )

    # Load cached results, preferring the most recent run if one is recorded
    most_recent_run = st.session_state.get("result_file_path", None)
    most_recent_run = str(most_recent_run) if most_recent_run is not None else None
    (
        st.session_state["all_result_dict"],
        st.session_state["df_dict"],
    ) = load_and_cache_data(result_file_path=most_recent_run)

    # Sidebar: pick which experiment's results to inspect
    st.sidebar.title("Select Result:")
    result_select = QueryWrapper("expname")(
        st.sidebar.selectbox,
        list(st.session_state["all_result_dict"].keys()),
    )

    if result_select is None:
        if st.session_state.korean:
            st.markdown("결과를 확인하려면 먼저 **🔥Arena-Lite를 구동**하셔야 합니다")
        else:
            st.markdown("You should **🔥Run Arena-Lite** first to see results")
        st.image("streamlit_app_local/page_result_1.png")
        st.image("streamlit_app_local/page_result_2.png")
        st.image("streamlit_app_local/page_result_3.png")
        st.stop()

    eval_prompt_name = result_select.split("/")[-1].strip()

    if st.sidebar.button("Clear Cache"):
        st.cache_data.clear()
        st.cache_resource.clear()
        st.rerun()

    if result_select:
        if "alpha2names" in st.session_state:
            del st.session_state["alpha2names"]

        fig_dict_per_task = st.session_state["all_result_dict"][result_select]
        task_list = list(fig_dict_per_task.keys())
        elo_rating_by_task = fig_dict_per_task["Overall"]["elo_rating_by_task"]
        # tabs = st.tabs(task_list)

        df_dict_per_task = st.session_state["df_dict"][result_select]
        default_layout_dict = DEFAULT_LAYOUT_DICT

        task = QueryWrapper("task", "Select Task")(st.selectbox, task_list)
        if task is None:
            st.stop()

        figure_dict = fig_dict_per_task[task]
        judgename = figure_dict["judgename"]
        df = df_dict_per_task[task]
        interpretation, n_models, size_testset = number_breakdown_from_df(df)

        if st.session_state.korean:
            st.markdown(f"## 결과 ({task})")
            st.markdown(f"##### Judge 모델: {judgename} / 평가프롬: {eval_prompt_name}")
            st.markdown(f"##### 테스트셋 사이즈: {int(size_testset)} 행")
        else:
            st.markdown(f"## Results ({task})")
            st.markdown(f"##### Judge Model: {judgename} / prompt: {eval_prompt_name}")
            st.markdown(f"##### Size of Testset: {int(size_testset)} rows")

        # Ratings table (left) and per-task Elo chart (right)
        col1, col2 = st.columns(2)
        with col1:
            with st.container(border=True):
                st.markdown(f"#### Ratings ({task})")
                st.table(figure_dict["elo_rating"])
                st.write(show_linebreak_in_md(escape_markdown(interpretation)))
        with col2:
            with st.container(border=True):
                st.plotly_chart(
                    elo_rating_by_task.update_layout(**default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_elo_rating_by_task",
                )
        st.divider()

        # Tournament (test scenario) browser
        if st.session_state.korean:
            st.markdown("### 토너먼트 (테스트 시나리오) 별로 보기")
        else:
            st.markdown("### Tournament Results by Test Scenario")

        d = list(df.idx_inst_src.unique())
        default_idx = st.session_state.get("selected_tournament", None)
        cols = st.columns((1, 18, 1))
        with cols[0]:
            if st.button("◀", key="prev_tournament"):
                navigate(d, default_idx, "selected_tournament", -1)
        with cols[1]:
            tournament_prm_select = QueryWrapper("tournament", "Select Tournament")(
                st.selectbox,
                d,
                default_idx,
                key=f"{task}_tournament_select",
                on_change=lambda: st.session_state.update(
                    selected_tournament=st.session_state.get(
                        f"{task}_tournament_select"
                    ),
                    selected_match=None,
                ),
                label_visibility="collapsed",
            )
        with cols[2]:
            if st.button("▶", key="next_tournament"):
                navigate(d, default_idx, "selected_tournament", 1)

        st.session_state["selected_tournament"] = tournament_prm_select

        df_now_processed = None
        if tournament_prm_select:
            df_now = df[df.idx_inst_src == tournament_prm_select]
            df_now_processed, _alpha2names = au.init_tournament_dataframe(
                df_now,
                alpha2names=st.session_state["alpha2names"]
                if "alpha2names" in st.session_state.keys()
                else None,
            )
            if "alpha2names" not in st.session_state:
                st.session_state["alpha2names"] = _alpha2names

            try:
                # Tournament bracket drawing plus a legend mapping letters to models
                bracket_drawing = au.draw(
                    df_now_processed,
                    alpha2names=st.session_state["alpha2names"],
                )
                legend = au.make_legend_str(
                    df_now_processed, st.session_state["alpha2names"]
                )
                st.code(bracket_drawing + legend)

                # Match browser within the selected tournament
                m = list(df_now_processed.human_readable_idx)
                default_idx = st.session_state.get("selected_match", None)
                cols = st.columns((1, 18, 1))
                with cols[0]:
                    if st.button("◀", key="prev_match"):
                        navigate(m, default_idx, "selected_match", -1)
                with cols[1]:
                    match_idx_human = QueryWrapper("match", "Select Match")(
                        st.selectbox,
                        m,
                        default_idx,
                        key=f"{task}_match_select",
                        label_visibility="collapsed",
                    )
                with cols[2]:
                    if st.button("▶", key="next_match"):
                        navigate(m, default_idx, "selected_match", 1)

                st.session_state["selected_match"] = match_idx_human

                if match_idx_human:
                    # Per-match detail: evaluation prompt, test scenario, and both generations
                    match_idx = int(match_idx_human.split(": ")[0])
                    row = df_now_processed.loc[match_idx]

                    st.markdown("#### Current Test Scenario:")

                    with st.expander(
                        f"### Evaluation Prompt (evalprompt: {eval_prompt_name}--{task})"
                    ):
                        prompt = load_prompt(eval_prompt_name, task=task)
                        kwargs = dict(
                            inst="{inst}",
                            src="{src}",
                            out_a="{out_a}",
                            out_b="{out_b}",
                            task=task,
                        )
                        if eval_prompt_name == "translation_pair":
                            kwargs["source_lang"] = "{source_lang}"
                            kwargs["target_lang"] = "{target_lang}"
                        prompt_cmpl = prompt.complete_prompt(**kwargs)
                        for msg in prompt_cmpl:
                            st.markdown(f"**{msg['role']}**")
                            st.info(
                                show_linebreak_in_md(escape_markdown(msg["content"]))
                            )

                    st.info(show_linebreak_in_md(tournament_prm_select))

                    winner = row.winner
                    col1, col2 = st.columns(2)

                    winnerbox = st.success
                    loserbox = st.error
                    with col1:
                        iswinner = winner == "model_a"
                        writemsg = winnerbox if iswinner else loserbox
                        st.markdown(
                            f"#### ({row.model_a}) {row.human_readable_model_a}"
                        )
                        writemsg(
                            show_linebreak_in_md(row.generated_a),
                            icon="✅" if iswinner else "❌",
                        )
                    with col2:
                        iswinner = winner == "model_b"
                        writemsg = winnerbox if iswinner else loserbox
                        st.markdown(
                            f"#### ({row.model_b}) {row.human_readable_model_b}"
                        )
                        writemsg(
                            show_linebreak_in_md(row.generated_b),
                            icon="✅" if iswinner else "❌",
                        )
            except Exception as e:
                import traceback

                traceback.print_exc()
                st.markdown(
                    "**Bug: 아래 표를 복사해서 이슈로 남겨주시면 개선에 도움이 됩니다. 감사합니다🙏**"
                    if st.session_state.korean
                    else "Bug: Please open an issue and attach the table output below to help me out. Thanks in advance. 🙏"
                )
                st.error(e)
                st.info(tournament_prm_select)
                st.table(
                    df_now_processed[
                        [
                            "depth",
                            "round",
                            "winner_nodes",
                            "winner_resolved",
                            "winner",
                            "model_a",
                            "model_b",
                        ]
                    ]
                )

        st.write("Sharable link")
        st.code(f"{get_base_url()}/see_results?{QueryWrapper.get_sharable_link()}")

        st.divider()
        if st.session_state.korean:
            st.markdown("### 매치 통계")
        else:
            st.markdown("### Match Stats.")

        # Match statistics: win fractions and match counts per model pair / per model
        col1, col2 = st.columns(2)
        with col1:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict[
                        "fraction_of_model_a_wins_for_all_a_vs_b_matches"
                    ].update_layout(autosize=True, **default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_fraction_of_model_a_wins_for_all_a_vs_b_matches",
                )
        with col2:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict[
                        "match_count_of_each_combination_of_models"
                    ].update_layout(autosize=True, **default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_match_count_of_each_combination_of_models",
                )
        with col1:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict["match_count_for_each_model"].update_layout(
                        **default_layout_dict
                    ),
                    use_container_width=True,
                    key=f"{task}_match_count_for_each_model",
                )
        with col2:
            pass

        # Reference material on known LLM Judge biases
        if st.session_state.korean:
            st.markdown("### 참고용 LLM Judge 편향 정보")
        else:
            st.markdown("### FYI: How biased is your LLM Judge?")
        with st.expander("펼쳐서 보기" if st.session_state.korean else "Expand to show"):
            st.info(
                """
Arena-Lite에서는 position bias의 영향을 최소화하기 위해 모든 모델이 A나 B 위치에 번갈아 위치하도록 하였습니다. 그러나 LLM Judge 혹은 Prompt의 성능이 부족하다고 느껴진다면, 아래의 알려진 LLM Judge bias가 참고가 될 겁니다.
* position bias (왼쪽)
* length bias (오른쪽)

결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (Arena-Lite를 활용하여 이를 수행해볼 수 있습니다!).""".strip()
                if st.session_state.korean
                else """
In Arena-Lite, to minimize the effect of position bias, every model alternates between position A and position B. However, if you feel the LLM Judge or prompt is underperforming, the known LLM Judge biases below may be a useful reference:
* position bias (left)
* length bias (right)

To determine whether result distortion was due to LLM Judge limitations, measure the binary classification accuracy of your LLM Judge and prompt (you can use Arena-Lite for this!).""".strip()
            )
            st.markdown(f"#### {judgename} + prompt = {eval_prompt_name}")
            col1, col2 = st.columns(2)
            with col1:
                with st.container(border=True):
                    st.plotly_chart(
                        figure_dict["counts_of_match_winners"].update_layout(
                            **default_layout_dict
                        ),
                        use_container_width=True,
                        key=f"{task}_counts_of_match_winners",
                    )
            with col2:
                with st.container(border=True):
                    st.plotly_chart(
                        figure_dict["length_bias"].update_layout(**default_layout_dict),
                        use_container_width=True,
                        key=f"{task}_length_bias",
                    )
                    st.table(
                        figure_dict["length_bias_df"].groupby("category").describe().T
                    )


if __name__ == "__main__":
    main()