rebrand: varco-arena -> arena-lite
- .vscode/launch.json +4 -2
- README.md +61 -24
- README_en.md +59 -8
- README_kr.md +57 -7
- app.py +8 -8
- eval_prompt_list.txt +1 -0
- guide_mds/input_jsonls_en.md +32 -31
- guide_mds/input_jsonls_kr.md +14 -17
- modules/nav.py +1 -1
- pages/brief_intro.py +3 -3
- pages/see_results.py +6 -6
- streamlit_app_local/README.md +1 -1
- streamlit_app_local/app.py +10 -10
- streamlit_app_local/modules/nav.py +1 -1
- streamlit_app_local/pages/brief_intro.py +3 -3
- streamlit_app_local/pages/see_results.py +4 -4
- streamlit_app_local/view_utils.py +2 -2
- view_utils.py +2 -2
.vscode/launch.json
CHANGED
@@ -13,13 +13,15 @@
             "console": "integratedTerminal",
             "args": [
                 "-i",
-                "rsc/inputs_for_dbg/dbg_llmbar_inputs/",
+                // "rsc/inputs_for_dbg/dbg_llmbar_inputs/",
+                "rsc/inputs_for_dbg/dbg_trans_inputs/",
                 "-o",
                 "DBGOUT",
                 "-e",
                 "gpt-4.1-mini",
                 "-p",
-                "llmbar",
+                // "llmbar",
+                "translation_pair",

             ]
         }
README.md
CHANGED
@@ -1,21 +1,8 @@
-
-
-emoji: 🔥
-colorFrom: pink
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.40.2
-app_file: app.py
-pinned: false
-license: cc-by-4.0
-short_description: VARCO Arena is a reference-free LLM benchmarking approach
----
-
-# Varco Arena
-Varco Arena conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs.
+# Arena-Lite
+Arena-Lite conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs.

 For more information, the followings may help understanding how it works.
-* [Paper](https://
+* [Paper](https://arxiv.org/abs/2411.01281)
 * [Blog Post (KR)](https://ncsoft.github.io/ncresearch/12cc62c1ea0d981971a8923401e8fe6a0f18563d)


@@ -42,7 +29,7 @@ python main.py -i "./some/dirpath/to/jsonl/files" -o SOME_REL_PATH_TO_CREATE -e

 # dbg lines
 ## openai api judge dbg
-python main.py -i "rsc/inputs_for_dbg/dbg_400_error_inputs/" -o SOME_WANTED_TARGET_DIR -e
+python main.py -i "rsc/inputs_for_dbg/dbg_400_error_inputs/" -o SOME_WANTED_TARGET_DIR -e gpt-4o-mini
 ## other testing lines
 python main.py -i "rsc/inputs_for_dbg/[SOME_DIRECTORY]/" -o SOME_WANTED_TARGET_DIR -e gpt-4o-mini
 ## dummy judge dbg (checking errors without api requests)
@@ -102,15 +89,66 @@ pre-commit install
 bash precommit.sh # black formatter will reformat the codes
 ```

+### Adding a Custom Prompt
+
+Here's how to add a new evaluation prompt. The process has been simplified recently, as the Judge logic now only relies on the `parsed_output` method.
+
+The easiest way is to copy `llmbar_brief.py` and `llmbar_brief.yaml` to create your own prompt.
+
+#### 1. Create Prompt `.py` and `.yaml` Files
+
+- Create files like `my_prompt.py` and `my_prompt.yaml` in the `varco_arena/varco_arena_core/prompts/` directory.
+- **`my_prompt.py`**:
+  - Define a class that inherits from `ComparisonPromptBase`.
+  - You **must** implement the `parsed_output(self, response)` method. This function should take the LLM Judge's `response` and return a decision token (e.g., `'a'`, `'b'`) indicating the winner.
+- **`my_prompt.yaml`**:
+  - Define necessary elements for your prompt, such as `sampling_parameters`, `decision_tokens`, and `prompt_template`.
+  - The strings in `prompt_template` are processed by `string.Template` and finalized in `eval_utils.py` via the `BasePrompt.complete_prompt()` function.
+  - Do not use `${task}`, `${generated}`, `${model_id}` in `prompt_template`. They are reserved for Arena-Lite.
+
+#### 2. Register the Prompt in `prompts/__init__.py`
+
+- Import your new prompt class:
+```python
+from .my_prompt import MyPrompt
+```
+- Add your new prompt's name and class instance to the `NAME2PROMPT_CLS` dictionary:
+```python
+NAME2PROMPT_CLS = dict(
+    # ... other prompts
+    my_prompt=MyPrompt(),
+)
+```
+- Add the new prompt name to the `Literal` type hint for the `promptname` argument in the `load_prompt` function:
+```python
+def load_prompt(
+    promptname: Literal[
+        # ... other prompt names
+        "my_prompt",
+    ],
+    # ...
+):
+```
+
+#### 3. Add the Prompt to `eval_prompt_list.txt`
+
+- Open the `eval_prompt_list.txt` file in the project root and add the name of your new prompt (`my_prompt`) on a new line.
+
+#### 4. (Recommended) Test and Debug
+
+- It is highly recommended to debug your prompt to ensure it works as expected.
+- In the `.vscode/launch.json` file, modify the `"VA"` configuration's `args`:
+  - Change `"-p", "translation_fortunecookie"` to `"-p", "my_prompt"`.
+  - If necessary, update the `"-i", "..."` argument to the path of your test data suitable for the new prompt.
+- Go to the `Run and Debug` tab in VS Code (Ctrl+Shift+D), select the "VA" configuration, and press F5 to run the debugger.
+- Find `result.json` inside the output directory you specified after `-o`. It will show every judge prompt used for each match.
+
+
 ## FAQ
-* I want to apply my custom judge prompt to run
+* I want to apply my custom judge prompt to run Arena-Lite
   * [`./varco_arena/prompts/`](./varco_arena/prompts/__init__.py) defines the prompts with `yaml` file and the class objects for those. Edit those as your need.
 * I want tailored judge prompts for each line of the test set row (i.e. ~100th row - `prompt1`, 101st~ - `prompt2`)
   * You could see `load_prompt` at the above link receives `promptname` + `task` as a parameters to load the prompt. The function is called at [`./varco_arena/manager.py:async_run`](./varco_arena/manager.py).
-* I want more fields for my llm outputs jsonl files for tailored use, i.e. want more fields beyond `instruction`, `source`, `generated`.
-  * It's going to get tricky but let me briefly guide you about this.
-  * You might have to edit `varco_arena/eval_utils.py`:`async_eval_w_prompt` (this part calls `PROMPT_OBJ.complete_prompt()`)
-  * And all the related codes will require revision.

 ## Special Thanks to (contributors)
 - Minho Lee (@Dialogue Model Team, NCSOFT) [github](https://github.com/minolee/)
@@ -122,10 +160,9 @@ bash precommit.sh # black formatter will reformat the codes

 ## Citation
 If you found our work helpful, consider citing our paper!
-[arxiv](https://arxiv.org/abs/2411.19103v1)
 ```
 @misc{son2024varcoarenatournamentapproach,
-      title={
+      title={VARCO Arena: A Tournament Approach to Reference-Free Benchmarking Large Language Models},
       author={Seonil Son and Ju-Min Oh and Heegon Jin and Cheolhun Jang and Jeongbeom Jeong and Kuntae Kim},
       year={2024},
       eprint={2411.01281},
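The custom-prompt walkthrough added to README.md above leaves the body of `my_prompt.py` to the reader. As a rough illustration only, a minimal class might look like the sketch below; the import path and the way the judge response is parsed are assumptions (they depend on the repo's actual base class and on an OpenAI-style response object), not Arena-Lite's real API.

```python
# Hypothetical my_prompt.py -- a sketch of step 1 above, not the repo's actual code.
from .comparison_prompt_base import ComparisonPromptBase  # assumed import path


class MyPrompt(ComparisonPromptBase):
    def parsed_output(self, response):
        """Map the judge LLM's raw response to a decision token ('a' or 'b')."""
        # Assumes an OpenAI-style chat completion object; adapt to the actual judge client.
        text = response.choices[0].message.content.strip().lower()
        # In the real prompt classes the decision tokens would come from my_prompt.yaml.
        for token in ("a", "b"):
            if text.startswith(token) or f"({token})" in text:
                return token
        raise ValueError(f"could not parse a decision from judge output: {text!r}")
```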
README_en.md
CHANGED
@@ -1,5 +1,5 @@
-#
-
+# Arena-Lite (former VARCO Arena)
+Arena-Lite conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs.

 For more information, the followings may help understanding how it works.
 * [Paper](https://arxiv.org/abs/2411.01281)
@@ -89,15 +89,66 @@ pre-commit install
 bash precommit.sh # black formatter will reformat the codes
 ```

+### Adding a Custom Prompt
+
+Here's how to add a new evaluation prompt. The process has been simplified recently, as the Judge logic now only relies on the `parsed_output` method.
+
+The easiest way is to copy `llmbar_brief.py` and `llmbar_brief.yaml` to create your own prompt.
+
+#### 1. Create Prompt `.py` and `.yaml` Files
+
+- Create files like `my_prompt.py` and `my_prompt.yaml` in the `varco_arena/varco_arena_core/prompts/` directory.
+- **`my_prompt.py`**:
+  - Define a class that inherits from `ComparisonPromptBase`.
+  - You **must** implement the `parsed_output(self, response)` method. This function should take the LLM Judge's `response` and return a decision token (e.g., `'a'`, `'b'`) indicating the winner.
+- **`my_prompt.yaml`**:
+  - Define necessary elements for your prompt, such as `sampling_parameters`, `decision_tokens`, and `prompt_template`.
+  - The strings in `prompt_template` are processed by `string.Template` and finalized in `eval_utils.py` via the `BasePrompt.complete_prompt()` function.
+  - Do not use `${task}` in `prompt_template`. It is a reserved keyword due to the llmbar prompt.
+
+#### 2. Register the Prompt in `prompts/__init__.py`
+
+- Import your new prompt class:
+```python
+from .my_prompt import MyPrompt
+```
+- Add your new prompt's name and class instance to the `NAME2PROMPT_CLS` dictionary:
+```python
+NAME2PROMPT_CLS = dict(
+    # ... other prompts
+    my_prompt=MyPrompt(),
+)
+```
+- Add the new prompt name to the `Literal` type hint for the `promptname` argument in the `load_prompt` function:
+```python
+def load_prompt(
+    promptname: Literal[
+        # ... other prompt names
+        "my_prompt",
+    ],
+    # ...
+):
+```
+
+#### 3. Add the Prompt to `eval_prompt_list.txt`
+
+- Open the `eval_prompt_list.txt` file in the project root and add the name of your new prompt (`my_prompt`) on a new line.
+
+#### 4. (Recommended) Test and Debug
+
+- It is highly recommended to debug your prompt to ensure it works as expected.
+- In the `.vscode/launch.json` file, modify the `"VA"` configuration's `args`:
+  - Change `"-p", "translation_fortunecookie"` to `"-p", "my_prompt"`.
+  - If necessary, update the `"-i", "..."` argument to the path of your test data suitable for the new prompt.
+- Go to the `Run and Debug` tab in VS Code (Ctrl+Shift+D), select the "VA" configuration, and press F5 to run the debugger.
+- Find `result.json` inside the output directory you specified after `-o`. It will show every judge prompt used for each match.
+
+
 ## FAQ
-* I want to apply my custom judge prompt to run
+* I want to apply my custom judge prompt to run Arena-Lite
   * [`./varco_arena/prompts/`](./varco_arena/prompts/__init__.py) defines the prompts with `yaml` file and the class objects for those. Edit those as your need.
 * I want tailored judge prompts for each line of the test set row (i.e. ~100th row - `prompt1`, 101st~ - `prompt2`)
   * You could see `load_prompt` at the above link receives `promptname` + `task` as a parameters to load the prompt. The function is called at [`./varco_arena/manager.py:async_run`](./varco_arena/manager.py).
-* I want more fields for my llm outputs jsonl files for tailored use, i.e. want more fields beyond `instruction`, `source`, `generated`.
-  * It's going to get tricky but let me briefly guide you about this.
-  * You might have to edit `varco_arena/eval_utils.py`:`async_eval_w_prompt` (this part calls `PROMPT_OBJ.complete_prompt()`)
-  * And all the related codes will require revision.

 ## Special Thanks to (contributors)
 - Minho Lee (@Dialogue Model Team, NCSOFT) [github](https://github.com/minolee/)
@@ -111,7 +162,7 @@ bash precommit.sh # black formatter will reformat the codes
 If you found our work helpful, consider citing our paper!
 ```
 @misc{son2024varcoarenatournamentapproach,
-      title={
+      title={VARCO Arena: A Tournament Approach to Reference-Free Benchmarking Large Language Models},
       author={Seonil Son and Ju-Min Oh and Heegon Jin and Cheolhun Jang and Jeongbeom Jeong and Kuntae Kim},
       year={2024},
       eprint={2411.01281},
README_kr.md
CHANGED
@@ -1,5 +1,5 @@
-#
-
+# Arena-Lite (구 VARCO Arena)
+아레나-라이트는 테스트셋 명령어별로 비교할 모델들의 토너먼트를 수행하여 정확하게 모델들의 순위를 매깁니다. 이것은 reference 아웃풋과 비교하여 승률을 매기는 방법보다 정확하며 조금 더 저렴합니다.

 더 자세한 내용에 대해서는 아래의 링크를 참조하시면 됩니다.
 * [논문](https://arxiv.org/abs/2411.01281)
@@ -91,16 +91,66 @@ pre-commit install
 bash precommit.sh # 이게 코드들을 다 리포맷해줄거야
 ```

+### 커스텀 프롬프트 추가하기
+
+새로운 평가 프롬프트를 추가하는 과정은 다음과 같습니다. 최근 Judge 로직이 `parsed_output` 메서드만 사용하도록 간소화되어 이전보다 쉽게 프롬프트를 추가할 수 있습니다.
+
+가장 간단한 방법은 `llmbar_brief.py`와 `llmbar_brief.yaml` 파일을 복사하여 자신만의 프롬프트를 만드는 것입니다.
+
+#### 1. 프롬프트 `.py` 및 `.yaml` 파일 생성
+
+- `varco_arena/varco_arena_core/prompts/` 경로에 `my_prompt.py`와 `my_prompt.yaml`처럼 파일을 생성합니다.
+- **`my_prompt.py`**:
+  - `ComparisonPromptBase`를 상속받는 클래스를 정의합니다.
+  - `parsed_output(self, response)` 메서드를 반드시 구현해야 합니다. 이 함수는 LLM Judge의 응답(`response`)을 받아, 승자를 나타내는 결정 토큰(예: `'a'`, `'b'`)을 반환해야 합니다.
+- **`my_prompt.yaml`**:
+  - `sampling_parameters`, `decision_tokens`, `prompt_template` 등 프롬프트에 필요한 요소들을 정의합니다.
+  - `prompt_template`에 들어가는 문자열은 `string.Template`으로 처리되며 `BasePrompt.complete_prompt()` 함수를 통해 `eval_utils.py`에서 최종 완성됩니다.
+  - `${task}`, `${generated}`, `${model_id}`를 `prompt_template`에 사용하지 마세요. 예약된 키워드들입니다.
+
+#### 2. `prompts/__init__.py`에 프롬프트 등록
+
+- 작성한 프롬프트 클래스를 `import` 합니다.
+```python
+from .my_prompt import MyPrompt
+```
+- `NAME2PROMPT_CLS` 딕셔너리에 새 프롬프트 이름과 클래스 객체를 추가합니다.
+```python
+NAME2PROMPT_CLS = dict(
+    # ... 기존 프롬프트들
+    my_prompt=MyPrompt(),
+)
+```
+- `load_prompt` 함수의 `promptname` 인자의 `Literal` 타입 힌트에 새 프롬프트 이름을 추가합니다.
+```python
+def load_prompt(
+    promptname: Literal[
+        # ... 기존 프롬프트 이름들
+        "my_prompt",
+    ],
+    # ...
+):
+```
+
+#### 3. `eval_prompt_list.txt`에 프롬프트 추가
+
+- 프로젝트 루트의 `eval_prompt_list.txt` 파일을 열고, 새 프롬프트의 이름(`my_prompt`)을 새 줄에 추가합니다.
+
+#### 4. (권장) 테스트 및 디버깅
+
+- 프롬프트가 의도대로 작동하는지 확인하기 위해 디버깅을 권장합니다.
+- `.vscode/launch.json` 파일의 `"VA"` 설정에서 `args`를 다음과 같이 수정합니다.
+  - `"-p", "translation_fortunecookie"` 부분을 `"-p", "my_prompt"`로 변경합니다.
+  - 필요시 `"-i", "..."` 부분에 새 프롬프트에 적합한 테스트 데이터 경로를 지정합니다.
+- VS Code의 `Run and Debug` 탭(Ctrl+Shift+D)으로 이동하여 "VA" 설정을 선택하고 F5 키를 눌러 디버거를 실행합니다.
+- `-o` 뒤에 명시한 output 디렉토리 안에서 `result.json`를 찾아서 원하는대로 동작했는지 확인해보세요. 모든 judge의 매치에 활용된 프롬프트 정보가 담겨있습니다.

 문의: 손선일
 * 내가 만든 프롬프트를 사용하고 싶어요
   * [`./varco_arena/prompts/`](./varco_arena_core/prompts/__init__.py) 에서 각종 프롬프트 클래스 및 `yaml` 파일 형태로 정의된 프롬프트를 로드합니다. 프리셋을 참조하여 작성하시면 됩니다.
 * 테스트셋별로 다른 평가 프롬프트를 사용하고 싶어요 (e.g. 작업에 따라 다른 프롬프트를 사용하고 싶어요)
   * 위 걸어드린 링크의 `load_prompt` 를 통해서 `promptname` + `task` 형태로 [`./varco_arena_core/manager.py:async_run`](./varco_arena_core/manager.py) 프롬프트가 로드되도록 해놨습니다.
-
-  * 조금 복잡해지는데 다음 부분을 고쳐주세요
-  * `varco_arena/eval_utils.py` 에서 `async_eval_w_prompt` 부분을 손대야할 수 있습니다 (여기서 PROMPT_OBJ.complete_prompt()을 호출함)
-  * 그 외 연관된 부분은 하나하나 고쳐주셔야...
+

 ## Special Thanks to (contributors)
 - 이민호 (@대화모델팀, NCSOFT) [github](https://github.com/minolee/)
@@ -113,7 +163,7 @@ bash precommit.sh # 이게 코드들을 다 리포맷해줄거야
 저희 작업물이 도움이 되었다면 저희도 도움을 받아볼 수 있을까요?
 ```
 @misc{son2024varcoarenatournamentapproach,
-      title={
+      title={VARCO Arena: A Tournament Approach to Reference-Free Benchmarking Large Language Models},
       author={Seonil Son and Ju-Min Oh and Heegon Jin and Cheolhun Jang and Jeongbeom Jeong and Kuntae Kim},
       year={2024},
       eprint={2411.01281},
app.py
CHANGED
@@ -253,18 +253,18 @@ def main():
         False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_init"
     )

-    st.title("⚖️
+    st.title("⚖️ Arena-Lite ⚖️")
     if st.session_state.korean:
         st.write(
-            """
+            """**Arena-Lite는 테스트셋 명령어별로 비교할 모델(생성문)의 토너먼트를 수행하고 결과들을 종합하여 모델들의 순위를 매기는 벤치마킹 시스템입니다. 이것은 reference 아웃풋과 비교하여 승률을 매기는 방법보다 정확하며 더 저렴합니다.**

 모범답안을 필요로 하지 않으므로 커스텀 테스트셋(50+ 행)을 활용하는 경우 편리한 벤치마킹이 가능합니다."""
         )
     else:
         st.write(
-            """**
+            """**Arena-Lite is an LLM benchmarking system that compares model responses across customized test scenarios (recommend >50 prompts) without requiring reference answers.**

-
+Arena-Lite conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs."""
         )

     st.divider()
@@ -389,9 +389,9 @@ def main():
     # Form for actual run
     with st.form("run_arena_form"):
         if st.session_state.korean:
-            st.write("### 3.
+            st.write("### 3. Arena-Lite 구동하기")
         else:
-            st.write("### 3. Run
+            st.write("### 3. Run Arena-Lite")
         api_key = st.text_input("Enter your OpenAI API Key", type="password")

         # demo exp name fixated
@@ -434,12 +434,12 @@ def main():
             )
             if return_code:
                 st.error(
-                    "❌ RuntimeError: An error occurred during
+                    "❌ RuntimeError: An error occurred during Arena-Lite run. Check the file and **restart from file upload!**"
                 )
                 purge_user_sub_data(data_path_to_purge=VA_ROOT)

             else:
-                st.success("✅
+                st.success("✅ Arena-Lite run completed successfully")
                 st.session_state.result_file_path = list(
                     result_file_path.glob("**/result.json")
                 )[-1]
eval_prompt_list.txt
CHANGED
@@ -1,4 +1,5 @@
 llmbar
+llmbar_brief
 translation_pair
 rag_pair_kr
 translation_fortunecookie
guide_mds/input_jsonls_en.md
CHANGED
@@ -1,37 +1,38 @@
-####
-
-*
-*
-*
-
-**Required Fields**
-*
-
-
-
-*
-*
-
-
-*
-
-
-
-**Example Format**
+#### \[EN\] Guide for Input .jsonl Files
+If you have five models to compare, upload five .jsonl files.
+* 🔥All `.jsonl` files must have the same number of rows.
+* 🔥The `model_id` field must be different for each file and unique within each file.
+* 🔥Each `.jsonl` file should have different `generated`, `model_id` from the other files. `instruction`, `task` should be the same.
+
+**Required `.jsonl` Fields**
+* Reserved Fields (Mandatory)
+  * `model_id`: The name of the model being evaluated. (Recommended to be short)
+  * `instruction`: The instruction given to the model. This corresponds to the test set prompt (not the evaluation prompt).
+  * `generated`: Enter the response generated by the model for the test set instruction.
+  * `task`: Used to group and display overall results as a subset. Can be utilized when you want to use different evaluation prompts per row.
+* Additional
+  * Depending on the evaluation prompt you use, you can utilize other additional fields. You can freely add them to your `.jsonl` files, avoiding the keywords
+  mentioned above.
+  * Example: For `translation_pair.yaml` and `translation_fortunecookie.yaml` prompts, the `source_lang` and `target_lang` fields are read from the `.jsonl` and
+  utilized.
+
+For example, when evaluating with the `translation_pair` prompt, each .jsonl file looks like this:
 ```python
 # model1.jsonl
-{"model_id": "
-{"model_id": "
+{"model_id": "모델1", "task": "한영", "instruction": "어디로 가야하오", "generated": "Where should I go", "source_lang": "Korean", "target_lang": "English"}
+{"model_id": "모델1", "task": "영한", "instruction": "1+1?", "generated": "1+1?", "source_lang": "English", "target_lang": "Korean"}

-# model2.jsonl
-{"model_id": "
-{"model_id": "
+# model2.jsonl -* model1.jsonl과 `instruction`은 같고 `generated`, `model_id` 는 다릅니다!
+{"model_id": "모델2", "task": "한영", "instruction": "어디로 가야하오", "generated": "글쎄다", "source_lang": "Korean", "target_lang": "English"}
+{"model_id": "모델2", "task": "영한", "instruction": "1+1?", "generated": "2", "source_lang": "English", "target_lang": "Korean"}
 ...
 ..
-
+
 ```
-
-
-
-
-
+On the other hand, when evaluating with the `llmbar` prompt, fields like source_lang and target_lang are not used, similar to translation evaluation, and naturally, you don't need to add them to your .jsonl.
+
+
+
+
+
+
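The constraints stated in the guide above (equal row counts, a distinct and file-internal-unique `model_id` per file, matching `instruction`/`task` row by row) are easy to check before uploading. Below is a minimal sketch of such a pre-upload check, assuming plain `.jsonl` files on disk; the `check_inputs` helper and the file names are illustrative, not part of Arena-Lite.

```python
# Hypothetical pre-upload validation for Arena-Lite input .jsonl files.
import json
from pathlib import Path


def check_inputs(jsonl_paths):
    per_file = []
    for path in jsonl_paths:
        lines = Path(path).read_text(encoding="utf-8").splitlines()
        per_file.append([json.loads(line) for line in lines if line.strip()])

    # every file must have the same number of rows
    lengths = {len(rows) for rows in per_file}
    assert len(lengths) == 1, f"row counts differ across files: {lengths}"

    # model_id must be constant within a file and differ between files
    model_ids = [{row["model_id"] for row in rows} for rows in per_file]
    assert all(len(ids) == 1 for ids in model_ids), "model_id must be constant within each file"
    assert len(set.union(*model_ids)) == len(per_file), "model_id must differ between files"

    # instruction / task must line up row by row across files
    for i, reference in enumerate(per_file[0]):
        for rows in per_file[1:]:
            assert rows[i]["instruction"] == reference["instruction"], f"instruction mismatch at row {i}"
            assert rows[i].get("task") == reference.get("task"), f"task mismatch at row {i}"


check_inputs(["model1.jsonl", "model2.jsonl"])
```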
guide_mds/input_jsonls_kr.md
CHANGED
@@ -2,33 +2,30 @@
 비교할 모델이 다섯 개라면 다섯 개의 .jsonl 파일을 업로드하세요.
 * 🔥모든 jsonl 은 같은 수의 행을 가져야합니다.
 * 🔥`model_id` 필드는 파일마다 달라야하며 파일 내에서는 유일해야합니다.
+* 🔥각 jsonl 파일이 서로 다른 generated 를 가집니다. `instruction`, `model_id`, `task` 는 같아야합니다.
+

 **jsonl 필수 필드**
-*
+* 예약된 필드 (필수)
 * `model_id`: 평가받는 모델의 이름입니다. (짧게 쓰는 걸 추천)
+  * `instruction`: 모델이 받은 지시문입니다. 테스트셋 프롬프트에 해당합니다 (평가 프롬프트 아님)
 * `generated`: 모델이 testset instruction 에 생성한 응답을 넣으세요.
-
-  * 번역평가 프롬프트 사용시 (`translation_pair`. `streamlit_app_local/user_submit/mt/llama5.jsonl` 에서 예시 볼 수 있음)
-    * `source_lang`: input language (e.g. Korean, KR, kor, ...)
-    * `target_lang`: output language (e.g. English, EN, ...)
-
-  * 공통 부분 (**모든 파일에 대해 같아야 함**)
-    * `instruction`: 모델에 집어넣는 `testset instruction` 혹은 `input`에 해당하는 무언가입니다.
 * `task`: 전체 결과를 subset으로 그룹지어서 보여줄 때 사용됩니다. `evaluation prompt`를 행별로 다르게 사용하고 싶을 때 활용될 수 있습니다.
+* 추가
+  * 당신이 사용하는 평가 프롬프트에 따라서 추가로 다른 필드들을 더 활용할 수 있습니다. 위의 키워드들을 피해서 자유롭게 jsonl에 추가하여 사용할 수 있습니다.
+  * 예시: translation_pair.yaml, translation_fortunecookie.yaml 프롬프트의 경우는 `source_lang`, `target_lang` 필드를 jsonl 에서 읽어서 활용합니다.

-
-각 jsonl 파일은 아래처럼 생겼습니다.
+예를들어 translation_pair 프롬프트로 평가하는 경우 각 jsonl 파일은 아래처럼 생겼습니다.
 ```python
 # model1.jsonl
-{"model_id": "모델1", "task": "
-{"model_id": "모델1", "task": "
+{"model_id": "모델1", "task": "한영", "instruction": "어디로 가야하오", "generated": "Where should I go", "source_lang": "Korean", "target_lang": "English"}
+{"model_id": "모델1", "task": "영한", "instruction": "1+1?", "generated": "1+1?", "source_lang": "English", "target_lang": "Korean"}

 # model2.jsonl -* model1.jsonl과 `instruction`은 같고 `generated`, `model_id` 는 다릅니다!
-{"model_id": "모델2", "task": "
-{"model_id": "모델2", "task": "
-
+{"model_id": "모델2", "task": "한영", "instruction": "어디로 가야하오", "generated": "글쎄다", "source_lang": "Korean", "target_lang": "English"}
+{"model_id": "모델2", "task": "영한", "instruction": "1+1?", "generated": "2", "source_lang": "English", "target_lang": "Korean"}
 ...
 ..
-```

-
+```
+반면 `llmbar` 프롬프트로 평가하는 경우, 번역평가처럼 `source_lang`, `target_lang` 필드가 사용되지 않으며 당연히 jsonl에도 추가하지 않으셔도 됩니다.
modules/nav.py
CHANGED
@@ -24,7 +24,7 @@ def Navbar(sidebar_placeholder, toggle_hashstr: str = ""):

     st.page_link(
         "app.py",
-        label="
+        label="Arena-Lite 구동" if st.session_state.korean else "Run Arena-Lite",
         icon="🔥",
     )
     st.page_link(
pages/brief_intro.py
CHANGED
@@ -23,7 +23,7 @@ else:
 st.image("va_concept_new.png")
 st.markdown(
     """
-| |Current Practice|
+| |Current Practice|Arena-Lite|
 |-|-|-|
 |Total no. matches|$$n_{\\text{model}}*\\|X\\|$$|$$(n_{\\text{model}}-1)*\\|X\\|$$|
 |No. matches per LLM|$$\\|X\\|$$|$$\\left[\\|X\\|,\\|X\\|\\text{log}n_{\\text{model}}\\right]$$|
@@ -32,9 +32,9 @@ st.markdown(
 )
 if st.session_state.korean:
     st.info(
-        "
+        "Arena-Lite는 신뢰성 있는 순위를 더 적은 횟수의 비교 내에 얻어내며, 이러한 특징은 LLM 직접 비교의 이점으로부터 기인합니다."
     )
 else:
     st.info(
-        "
+        "Arena-Lite takes advantage of direct comparison between LLM responses to guarantee better reliability in fewer number of total matches."
     )
pages/see_results.py
CHANGED
@@ -60,9 +60,9 @@ def main():

     if result_select is None:
         if st.session_state.korean:
-            st.markdown("결과를 확인하려면 먼저 **🔥
+            st.markdown("결과를 확인하려면 먼저 **🔥Arena-Lite를 구동**하셔야 합니다")
         else:
-            st.markdown("You should **🔥Run
+            st.markdown("You should **🔥Run Arena-Lite** first to see results")
         st.image("streamlit_app_local/page_result_1.png")
         st.image("streamlit_app_local/page_result_2.png")
         st.image("streamlit_app_local/page_result_3.png")
@@ -334,18 +334,18 @@ def main():
     with st.expander("펼쳐서 보기" if st.session_state.korean else "Expand to show"):
         st.info(
             """
-
+Arena-Lite에서는 position bias의 영향을 최소화하기 위해 모든 모델이 A나 B위치에 번갈아 위치하도록 했습니다. 그러나 LLM Judge 혹은 Prompt의 성능이 부족하다고 느껴진다면, 아래 알려진 LLM Judge bias가 참고가 될 겁니다.
 * position bias (왼쪽)
 * length bias (오른쪽)

-결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (
+결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (Arena-Lite를 활용하여 이를 수행해볼 수 있습니다!).""".strip()
             if st.session_state.korean
             else """
-In
+In Arena-Lite, to minimize the effect of position bias, all models are alternately positioned in either position A or B. However, if you feel the LLM Judge or Prompt performance is insufficient, the following known LLM Judge biases may be helpful to reference:
 * position bias (left)
 * length bias (right)

-To determine if result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and Prompt (You could use
+To determine if result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and Prompt (You could use Arena-Lite for this purpose!).
             """.strip()
         )
         st.markdown(f"#### {judgename} + prompt = {eval_prompt_name}")
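The expander text above suggests measuring the binary classification accuracy of your judge + prompt when the ranking looks distorted. As a rough illustration of what that measurement could look like, the sketch below assumes you have a handful of comparisons with human-verified winners; the column names are hypothetical and are not the schema of Arena-Lite's `result.json`.

```python
# Minimal sketch: judge accuracy and a crude position-bias check from labeled matches.
import pandas as pd

df = pd.DataFrame(
    {
        "gold_winner": ["A", "B", "A", "B"],   # human-verified better response
        "judge_winner": ["A", "B", "B", "B"],  # what the judge + prompt picked
    }
)

accuracy = (df["gold_winner"] == df["judge_winner"]).mean()
pick_a_rate = (df["judge_winner"] == "A").mean()  # ~0.5 expected if there is no position bias

print(f"judge accuracy: {accuracy:.2%}, picks position A: {pick_a_rate:.2%}")
```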
streamlit_app_local/README.md
CHANGED
@@ -1,4 +1,4 @@
-#
+# Arena-Lite web app
 ```bash
 cd ./streamlit_app_local/
 bash run.sh
streamlit_app_local/app.py
CHANGED
@@ -51,7 +51,7 @@ def upload_files(uploaded_files) -> Path:
     if not uploaded_files:
         st.warning("⚠️ No files to upload. Please drag/drop or browse files to upload.")
     elif len(uploaded_files) < 2:
-        st.error("❌ You need at least 2 jsonlines files to properly run
+        st.error("❌ You need at least 2 jsonlines files to properly run.")
     else:  # properly uploaded
         for file in uploaded_files:
             # Create a path for the file in the server directory
@@ -154,18 +154,18 @@ def main():
         False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_init"
     )

-    st.title("⚖️ VARCO ARENA ⚖️")
+    st.title("⚖️ Arena-Lite (former VARCO ARENA) ⚖️")
     if st.session_state.korean:
         st.write(
-            """
+            """**Arena-Lite는 테스트셋 명령어별로 비교할 모델(생성문)의 토너먼트를 수행하고 결과들을 종합하여 모델들의 순위를 매기는 벤치마킹 시스템입니다. 이것은 reference 아웃풋과 비교하여 승률을 매기는 방법보다 정확하며 더 저렴합니다.**

 모범답안을 필요로 하지 않으므로 커스텀 테스트셋(50+ 행)을 활용하는 경우 편리한 벤치마킹이 가능합니다."""
         )
     else:
         st.write(
-            """**
+            """**Arena-Lite is an LLM benchmarking system that compares model responses across customized test scenarios (recommend >50 prompts) without requiring reference answers.**

-
+Arena-Lite conducts tournaments between models to be compared for each test set command, ranking models accurately at an affordable price. This is more accurate and cost-effective than rating win rates by comparing against reference outputs."""
         )

     st.divider()
@@ -261,9 +261,9 @@ def main():
     # Form for actual run
     with st.form("run_arena_form"):
         if st.session_state.korean:
-            st.write("### 3.
+            st.write("### 3. Arena-Lite 구동하기")
         else:
-            st.write("### 3. Run
+            st.write("### 3. Run Arena-Lite")
         api_key = st.text_input("Enter your OpenAI API Key", type="password")
         exp_name = st.text_input("(Optional) Enter Exp. name")
         exp_name = exp_name.replace(
@@ -298,7 +298,7 @@ def main():
                 "❌ Requirements: You have to upload jsonlines files first to proceed"
             )
         elif not api_key:
-            st.error("❌ Requirements: OpenAI key required to run
+            st.error("❌ Requirements: OpenAI key required to run.")
         else:
             result_file_path, return_code = run_varco_arena(
                 # upload_dir=st.session_state.upfiles_dir,
@@ -309,9 +309,9 @@ def main():
                 evaluation_model=eval_model,
             )
             if return_code:
-                st.error("❌ RuntimeError: An error occurred during
+                st.error("❌ RuntimeError: An error occurred during Arena-Lite run")
             else:
-                st.success("✅
+                st.success("✅ Arena-Lite run completed successfully")
                 st.session_state.result_file_path = result_file_path
                 set_nav_bar(
                     False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_run_done"
streamlit_app_local/modules/nav.py
CHANGED
@@ -16,7 +16,7 @@ def Navbar(sidebar_placeholder, toggle_hashstr: str = ""):

     st.page_link(
         "app.py",
-        label="
+        label="Arena-Lite 구동" if st.session_state.korean else "Run Arena-Lite",
         icon="🔥",
     )
     st.page_link(
streamlit_app_local/pages/brief_intro.py
CHANGED
@@ -23,7 +23,7 @@ else:
 st.image("va_concept_new.png")
 st.markdown(
     """
-| |Current Practice|
+| |Current Practice|Arena-Lite|
 |-|-|-|
 |Total no. matches|$$n_{\\text{model}}*\\|X\\|$$|$$(n_{\\text{model}}-1)*\\|X\\|$$|
 |No. matches per LLM|$$\\|X\\|$$|$$\\left[\\|X\\|,\\|X\\|\\text{log}n_{\\text{model}}\\right]$$|
@@ -32,9 +32,9 @@ st.markdown(
 )
 if st.session_state.korean:
     st.info(
-        "
+        "Arena-Lite는 신뢰성 있는 순위를 더 적은 횟수의 비교 내에 얻어내며, 이러한 특징은 LLM 직접 비교의 이점으로부터 기인합니다."
     )
 else:
     st.info(
-        "
+        "Arena-Lite takes advantage of direct comparison between LLM responses to guarantee better reliability in fewer number of total matches."
     )
streamlit_app_local/pages/see_results.py
CHANGED
@@ -354,18 +354,18 @@ def main():
     with st.expander("펼쳐서 보기" if st.session_state.korean else "Expand to show"):
         st.info(
             """
-
+Arena-Lite에서는 position bias의 영향을 최소화하기 위해 모든 모델이 A나 B위치에 번갈아 위치하도록 했습니다. 그러나 LLM Judge 혹은 Prompt의 성능이 부족하다고 느껴진다면, 아래 알려진 LLM Judge bias가 참고가 될 겁니다.
 * position bias (왼쪽)
 * length bias (오른쪽)

-결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (
+결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (Arena-Lite를 활용하여 이를 수행해볼 수 있습니다!).""".strip()
             if st.session_state.korean
             else """
-In
+In Arena-Lite, to minimize the effect of position bias, all models are alternately positioned in either position A or B. However, if you feel the LLM Judge or Prompt performance is insufficient, the following known LLM Judge biases may be helpful to reference:
 * position bias (left)
 * length bias (right)

-To determine if result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and Prompt (You could use
+To determine if result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and Prompt (You could use Arena-Lite for this purpose!).
             """.strip()
         )
         st.markdown(f"#### {judgename} + prompt = {eval_prompt_name}")
streamlit_app_local/view_utils.py
CHANGED
@@ -16,7 +16,7 @@ from modules.nav import Navbar
 def default_page_setting(
     layout: Literal["wide", "centered"] = "centered",
 ):
-    st.set_page_config(page_title="
+    st.set_page_config(page_title="Arena-Lite", layout=layout)
     sidebar_placeholder = st.sidebar.empty()

     css = f"""
@@ -126,7 +126,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     Y = np.zeros(n)
     Y[df["winner"] == "A"] = 1.0

-    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the
+    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
     if (Y == 0).all():
         print(WARNING.format(L=32))
         Y[-1] = 1.0
view_utils.py
CHANGED
@@ -16,7 +16,7 @@ from modules.nav import Navbar
 def default_page_setting(
     layout: Literal["wide", "centered"] = "centered",
 ):
-    st.set_page_config(page_title="
+    st.set_page_config(page_title="Arena-Lite", layout=layout)
     sidebar_placeholder = st.sidebar.empty()

     css = f"""
@@ -126,7 +126,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     Y = np.zeros(n)
     Y[df["winner"] == "A"] = 1.0

-    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the
+    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
     if (Y == 0).all():
         print(WARNING.format(L=32))
         Y[-1] = 1.0
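The `compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000)` hunk above builds a target vector `Y` from the `winner` column and warns when every match went one way, which points to the usual Bradley-Terry fit via logistic regression (the Chatbot Arena recipe). Below is a sketch of that recipe under the assumption of `model_a`/`model_b`/`winner` columns; it is illustrative only and not the repo's exact implementation.

```python
# Sketch of an MLE Elo fit (Bradley-Terry via logistic regression).
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def mle_elo_sketch(df: pd.DataFrame, SCALE=400, BASE=10, INIT_RATING=1000) -> pd.Series:
    models = pd.unique(df[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}

    # one row per match: +1 for the model in position A, -1 for the model in position B
    X = np.zeros((len(df), len(models)))
    X[np.arange(len(df)), df["model_a"].map(idx).to_numpy()] = 1.0
    X[np.arange(len(df)), df["model_b"].map(idx).to_numpy()] = -1.0

    # target: 1 when A won; if every label is identical the regressor cannot fit,
    # which is what the "perfect one-sided wins" warning above guards against
    Y = (df["winner"] == "A").astype(float).to_numpy()

    lr = LogisticRegression(fit_intercept=False, penalty=None, max_iter=1000)
    lr.fit(X, Y)

    # convert coefficients to the Elo scale: P(A wins) = 1 / (1 + BASE ** (-(R_a - R_b) / SCALE))
    ratings = SCALE / np.log(BASE) * lr.coef_[0] + INIT_RATING
    return pd.Series(ratings, index=models).sort_values(ascending=False)
```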