sonsus committed
Commit 5bdfe24 · 1 Parent(s): e60d760

Simplify Judge logic and improve prompt structure

.gitignore CHANGED
@@ -2,10 +2,13 @@
 **/user_submit/*/
 **/__pycache__/
 **/*.pyc
+DBGOUT/
+
+es_ja_results.txt
 
 # But re-include these four
 !**/user_submit/llm/
 !**/user_submit/rag/
 !**/user_submit/mt/
 !**/user_submit/12-02-14:29:30/
-rslora_es_ja_dpo_results.txt
+rslora_es_ja_dpo_results.txt
.vscode/launch.json CHANGED
@@ -4,13 +4,6 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Python Debugger: Current File",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "${file}",
-            "console": "integratedTerminal"
-        },
         {
             "name": "VA",
             "type": "debugpy",
@@ -20,13 +13,14 @@
             "console": "integratedTerminal",
             "args": [
                 "-i",
-                "rsc/inputs_for_dbg/dbg_trans_inputs/",
+                "rsc/inputs_for_dbg/dbg_llmbar_inputs/", // "rsc/inputs_for_dbg/dbg_trans_inputs/",
                 "-o",
-                "SOME_FOLDER1",
+                "DBGOUT",
                 "-e",
-                "o4-mini",
+                "gpt-4.1-mini",
                 "-p",
-                "translation_new"
+                "llmbar", // "translation_fortunecookie",
+
             ]
         }
     ]
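
Note: the updated debug args amount to invoking the evaluator with flags along these lines. The entry-point script name below is an assumption (the "program" field is outside the visible hunk); treat this as an illustrative sketch, not the repo's documented CLI:

    python <entry_point>.py -i rsc/inputs_for_dbg/dbg_llmbar_inputs/ -o DBGOUT -e gpt-4.1-mini -p llmbar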
eval_prompt_list.txt CHANGED
@@ -1,4 +1,4 @@
 llmbar
 translation_pair
 rag_pair_kr
-translation_new
+translation_fortunecookie
requirements_2025jun24.yaml ADDED
@@ -0,0 +1,160 @@
+name: va
+channels:
+  - conda-forge
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - bzip2=1.0.8=hd590300_5
+  - ca-certificates=2024.2.2=hbcca054_0
+  - ld_impl_linux-64=2.40=hf3520f5_1
+  - libexpat=2.6.2=h59595ed_0
+  - libffi=3.4.2=h7f98852_5
+  - libgcc-ng=13.2.0=h77fa898_7
+  - libgomp=13.2.0=h77fa898_7
+  - libnsl=2.0.1=hd590300_0
+  - libsqlite=3.45.3=h2797004_0
+  - libuuid=2.38.1=h0b41bf4_0
+  - libxcrypt=4.4.36=hd590300_1
+  - libzlib=1.2.13=h4ab18f5_6
+  - ncurses=6.5=h59595ed_0
+  - openssl=3.3.0=h4ab18f5_3
+  - pip=24.0=pyhd8ed1ab_0
+  - python=3.11.9=hb806964_0_cpython
+  - readline=8.2=h8228510_1
+  - setuptools=70.0.0=pyhd8ed1ab_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - wheel=0.43.0=pyhd8ed1ab_1
+  - xz=5.2.6=h166bdaf_0
+  - pip:
+      - aiofiles==23.2.1
+      - altair==5.4.1
+      - annotated-types==0.7.0
+      - anyio==4.4.0
+      - asttokens==2.4.1
+      - attrs==23.2.0
+      - blinker==1.8.2
+      - cachetools==5.5.0
+      - certifi==2024.2.2
+      - cfgv==3.4.0
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - comm==0.2.2
+      - contourpy==1.2.1
+      - cycler==0.12.1
+      - debugpy==1.8.2
+      - decorator==5.1.1
+      - distlib==0.3.8
+      - distro==1.9.0
+      - et-xmlfile==1.1.0
+      - executing==2.0.1
+      - fastapi==0.115.3
+      - fastchat==0.1.0
+      - ffmpy==0.4.0
+      - filelock==3.16.1
+      - fire==0.6.0
+      - fonttools==4.53.1
+      - fsspec==2024.9.0
+      - gitdb==4.0.11
+      - gitpython==3.1.43
+      - gradio==5.3.0
+      - gradio-client==1.4.2
+      - h11==0.14.0
+      - httpcore==1.0.5
+      - httpx==0.27.0
+      - huggingface-hub==0.25.1
+      - identify==2.6.1
+      - idna==3.7
+      - ipdb==0.13.13
+      - ipykernel==6.29.5
+      - ipython==8.26.0
+      - jedi==0.19.1
+      - jinja2==3.1.4
+      - jiter==0.7.1
+      - joblib==1.4.2
+      - jsonlines==4.0.0
+      - jsonschema==4.23.0
+      - jsonschema-specifications==2023.12.1
+      - jupyter-client==8.6.2
+      - jupyter-core==5.7.2
+      - kaleido==0.2.1
+      - kiwisolver==1.4.5
+      - markdown-it-py==3.0.0
+      - markupsafe==2.1.5
+      - matplotlib==3.9.1
+      - matplotlib-inline==0.1.7
+      - mdurl==0.1.2
+      - munch==4.0.0
+      - narwhals==1.8.3
+      - nest-asyncio==1.6.0
+      - networkx==3.3
+      - nodeenv==1.9.1
+      - numpy==1.26.4
+      - openai==1.54.4
+      - openpyxl==3.1.5
+      - orjson==3.10.10
+      - packaging==24.0
+      - pandas==2.2.2
+      - parso==0.8.4
+      - pexpect==4.9.0
+      - pillow==10.4.0
+      - platformdirs==4.2.2
+      - plotly==5.22.0
+      - pre-commit==3.8.0
+      - prompt-toolkit==3.0.47
+      - protobuf==5.28.2
+      - psutil==6.0.0
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - pyarrow==17.0.0
+      - pydantic==2.7.2
+      - pydantic-core==2.18.3
+      - pydeck==0.9.1
+      - pydub==0.25.1
+      - pygments==2.18.0
+      - pyparsing==3.1.2
+      - python-dateutil==2.9.0.post0
+      - python-multipart==0.0.12
+      - pytz==2024.1
+      - pyyaml==6.0.2
+      - pyzmq==26.0.3
+      - redis==5.0.4
+      - referencing==0.35.1
+      - regex==2024.5.15
+      - requests==2.32.3
+      - rich==13.8.1
+      - rpds-py==0.20.0
+      - ruff==0.7.0
+      - safetensors==0.4.5
+      - scikit-learn==1.5.0
+      - scipy==1.13.1
+      - seaborn==0.13.2
+      - semantic-version==2.10.0
+      - shellingham==1.5.4
+      - six==1.16.0
+      - smmap==5.0.1
+      - sniffio==1.3.1
+      - stack-data==0.6.3
+      - starlette==0.41.0
+      - streamlit==1.40.1
+      - tabulate==0.9.0
+      - tenacity==8.3.0
+      - termcolor==2.4.0
+      - threadpoolctl==3.5.0
+      - tiktoken==0.9.0
+      - tokenizers==0.20.0
+      - toml==0.10.2
+      - tomlkit==0.12.0
+      - tornado==6.4.1
+      - tqdm==4.66.4
+      - traitlets==5.14.3
+      - transformers==4.45.1
+      - typer==0.12.5
+      - typing-extensions==4.12.0
+      - tzdata==2024.1
+      - urllib3==2.2.1
+      - uvicorn==0.32.0
+      - virtualenv==20.26.6
+      - watchdog==4.0.2
+      - wcwidth==0.2.13
+      - websockets==12.0
+prefix: /home/deftson/miniconda3/envs/va
varco_arena/varco_arena_core/eval_utils.py CHANGED
@@ -72,29 +72,31 @@ async def async_query_openai(
         messages=completed_prompts,
         **prompt_obj.sampling_parameters,
     )
-
-    # o-series of models (reasoning models)
+
+    # logprobs are no longer used, remove for all models
+    if "logprobs" in kwargs:
+        kwargs.pop("logprobs")
+    if "top_logprobs" in kwargs:
+        kwargs.pop("top_logprobs")
+
+    # o-series of models (reasoning models)
     if model in [
         "o4-mini",
     ]:
-        # does not provide logprobs
-        kwargs.pop("logprobs")
-        kwargs.pop("top_logprobs")
         # does not allow temperature
-        kwargs.pop("temperature")
+        if "temperature" in kwargs:
+            kwargs.pop("temperature")
         # does not allow stop
-        kwargs.pop("stop")
+        if "stop" in kwargs:
+            kwargs.pop("stop")
         # max_completion_tokens is different from what I expect... does it count reasoning path too?
-        kwargs.pop("max_tokens")
-
-        # max_tokens = kwargs.pop("max_tokens")
-        # kwargs["max_completion_tokens"] = max_tokens
-
+        if "max_tokens" in kwargs:
+            kwargs.pop("max_tokens")
+
         # prefer developer role than system
         if kwargs["messages"][0]["role"] == "system":
             kwargs["messages"][0]["role"] = "developer"
-        # do not support max_tokens --> max_completion_tokens
-
+
     isopenai: bool = os.getenv("OPENAI_BASE_URL") == "https://api.openai.com/v1"
 
     # defining client here?...
@@ -137,8 +139,25 @@ async def async_query_openai(
     async with limiter:
         try:
             resp = await client.chat.completions.create(**kwargs)
-            postprocess_f = prompt_obj.prob_calibrated_output
-            normalized_result = postprocess_f(resp)
+
+            decision_tok = prompt_obj.parsed_output(resp)
+
+            prefer_1st_tok = prompt_obj.decision_tokens.get("prefer_1st")
+            prefer_2nd_tok = prompt_obj.decision_tokens.get("prefer_2nd")
+
+            if decision_tok == prefer_1st_tok:
+                normalized_result = {"prefer_1st": 1.0, "prefer_2nd": 0.0}
+            elif decision_tok == prefer_2nd_tok:
+                normalized_result = {"prefer_1st": 0.0, "prefer_2nd": 1.0}
+            else:  # fallback: tie or unexpected result
+                normalized_result = {"prefer_1st": 0.5, "prefer_2nd": 0.5}
+
+            normalized_result["model"] = resp.model
+            normalized_result["decision_by"] = "parsed_output"
+            normalized_result["target_tokens_found"] = [decision_tok]
+            normalized_result[
+                "no_target_tokens_found"
+            ] = decision_tok not in [prefer_1st_tok, prefer_2nd_tok]
             normalized_result["error"] = False
             normalized_result["exception_str"] = ""
         except Exception as e:
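
The judge now maps the single parsed decision token straight to a hard preference distribution instead of calibrating over logprobs. A minimal sketch of that mapping in isolation, assuming llmbar-style decision tokens "a"/"b" (stub values for illustration, not imported from the repo):

    from typing import Dict

    def map_decision(decision_tok: str,
                     prefer_1st_tok: str = "a",
                     prefer_2nd_tok: str = "b") -> Dict[str, float]:
        # hard 1/0 preference when the token matches either side
        if decision_tok == prefer_1st_tok:
            return {"prefer_1st": 1.0, "prefer_2nd": 0.0}
        if decision_tok == prefer_2nd_tok:
            return {"prefer_1st": 0.0, "prefer_2nd": 1.0}
        # anything else is scored as a tie, matching the fallback branch above
        return {"prefer_1st": 0.5, "prefer_2nd": 0.5}

    assert map_decision("a") == {"prefer_1st": 1.0, "prefer_2nd": 0.0}
    assert map_decision("?") == {"prefer_1st": 0.5, "prefer_2nd": 0.5}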
varco_arena/varco_arena_core/prompts/__init__.py CHANGED
@@ -8,14 +8,14 @@ from .llmbar import LLMBarPrompt
 from .llmbar_brief import LLMBarBriefPrompt
 from .rag_pair_kr import RagPairKRPrompt
 from .translation_pair import TranslationPairPrompt
-from .translation_new import TranslationNewPrompt
+from .translation_fortunecookie import TranslationNewPrompt
 
 NAME2PROMPT_CLS = dict(
     llmbar_brief=LLMBarBriefPrompt(),
     llmbar=LLMBarPrompt(),
     translation_pair=TranslationPairPrompt(),
     rag_pair_kr=RagPairKRPrompt(),
-    translation_new=TranslationNewPrompt(),
+    translation_fortunecookie=TranslationNewPrompt(),
     # contextual_vqa = Contextual_VQA(),
     # contextual_ocr = Contextual_OCR(),
 )
@@ -26,7 +26,7 @@ def load_prompt(
         "llmbar_brief",
         "llmbar",
         "translation_pair",
-        "translation_new",
+        "translation_fortunecookie",
         "rag_pair_kr",
     ],
     task: str = "",  # used for further prompt variation (eval prompt might depend on task.)
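
After the rename, callers select the prompt by its new registry key. A minimal usage sketch, assuming load_prompt takes the prompt name as its first positional argument (per the signature above) and that the package is importable as varco_arena_core:

    from varco_arena_core.prompts import load_prompt

    # presumably resolves via NAME2PROMPT_CLS to the TranslationNewPrompt instance
    prompt_obj = load_prompt("translation_fortunecookie", task="")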
varco_arena/varco_arena_core/prompts/llmbar.py CHANGED
@@ -1,3 +1,4 @@
+import random
 from pathlib import Path
 from typing import Dict, List, Optional
 
@@ -25,12 +26,42 @@ class LLMBarPrompt(ComparisonPromptBase):
         1. detect (a|b)
         2. get 'a|b' only
         """
-        raw_output = response.choices[0].message.content
-        for target_find in self.decision_tokens.values():
-            idx = raw_output.find(f"({target_find})")
-            if idx != -1:
-                break
-        res_tok = raw_output[idx + 1]
+        raw_output = response.choices[0].message.content.strip()
+
+        decision1 = self.decision_tokens['prefer_1st']  # 'a'
+        decision2 = self.decision_tokens['prefer_2nd']  # 'b'
+
+        # the prompt asks for an answer of the form "Output (a)" or "Output (b)", so check those patterns directly
+        output1_str = f"Output ({decision1})"
+        output2_str = f"Output ({decision2})"
+
+        # also check the bare-parentheses form
+        paren1_str = f"({decision1})"
+        paren2_str = f"({decision2})"
+
+        output1_present = output1_str in raw_output
+        output2_present = output2_str in raw_output
+        paren1_present = paren1_str in raw_output
+        paren2_present = paren2_str in raw_output
+
+        res_tok = None
+        if output1_present and not output2_present:
+            res_tok = decision1
+        elif output2_present and not output1_present:
+            res_tok = decision2
+        elif paren1_present and not paren2_present:
+            res_tok = decision1
+        elif paren2_present and not paren1_present:
+            res_tok = decision2
+        else:
+            # ambiguous: choose randomly (as in translation_new.py) and print a warning
+            res_tok = random.choice([decision1, decision2])
+            print("=" * 100)
+            print(f"raw response: '{raw_output}'")
+            print(f"randomly chosen decision: {res_tok}")
+            print("the response format was unexpected or ambiguous, so a random choice was made")
+            print("=" * 100)
+
         return res_tok
 
     def prob_calibrated_output(self, response) -> Dict[str, float]:
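
The new matching order ("Output (a|b)" first, bare parentheses second, random tie-break last) can be exercised with stub response objects. These stubs are illustrative only, not the project's real response types:

    from types import SimpleNamespace

    def fake_response(text: str):
        # mimics the response.choices[0].message.content shape used above
        return SimpleNamespace(
            choices=[SimpleNamespace(message=SimpleNamespace(content=text))]
        )

    # With decision_tokens = {'prefer_1st': 'a', 'prefer_2nd': 'b'}:
    #   "I pick Output (a)."          -> 'a'  (explicit "Output (a)" wins)
    #   "(b) reads more fluently."    -> 'b'  (bare-parenthesis fallback)
    #   "Output (a) ... Output (b)"   -> random choice plus a printed warning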
varco_arena/varco_arena_core/prompts/prompt_utils.py CHANGED
@@ -59,6 +59,7 @@ def get_tokenizer_from_model_name(
     if model_name in [
         "gpt-4.1",
         "gpt-4.1-mini",
+        "gpt-4.1-nano",
         "o4-mini",
     ]:
         tokenizer = tiktoken.encoding_for_model("gpt-4o")
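
For reference, this branch pins every listed model (now including gpt-4.1-nano) to the gpt-4o encoding. A quick check of what that resolves to, as tiktoken maps gpt-4o to o200k_base:

    import tiktoken

    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    print(tokenizer.name)                       # o200k_base
    print(len(tokenizer.encode("judge test")))  # token count under that encoding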
varco_arena/varco_arena_core/prompts/{translation_new.py → translation_fortunecookie.py} RENAMED
@@ -7,7 +7,7 @@ from .prompt_utils import fill_template_over_messsages
 import random
 
 class TranslationNewPrompt(LLMBarPrompt):
-    def __init__(self, prompt_yaml: str = "translation_new.yaml"):
+    def __init__(self, prompt_yaml: str = "translation_fortunecookie.yaml"):
         super().__init__(prompt_yaml=prompt_yaml)
 
     def parsed_output(self, response: Any) -> str:
varco_arena/varco_arena_core/prompts/{translation_new.yaml → translation_fortunecookie.yaml} RENAMED
File without changes