sonsus committed
Commit 5bdfe24 · 1 Parent(s): e60d760

Simplify Judge logic and improve prompt structure

.gitignore CHANGED
@@ -2,10 +2,13 @@
 **/user_submit/*/
 **/__pycache__/
 **/*.pyc
+DBGOUT/
+
+es_ja_results.txt
 
 # But re-include these four
 !**/user_submit/llm/
 !**/user_submit/rag/
 !**/user_submit/mt/
 !**/user_submit/12-02-14:29:30/
-rslora_es_ja_dpo_results.txt
+rslora_es_ja_dpo_results.txt
.vscode/launch.json CHANGED
@@ -4,13 +4,6 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Python Debugger: Current File",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "${file}",
-            "console": "integratedTerminal"
-        },
         {
             "name": "VA",
             "type": "debugpy",
@@ -20,13 +13,14 @@
             "console": "integratedTerminal",
             "args": [
                 "-i",
-                "rsc/inputs_for_dbg/dbg_trans_inputs/",
+                "rsc/inputs_for_dbg/dbg_llmbar_inputs/", // "rsc/inputs_for_dbg/dbg_trans_inputs/",
                 "-o",
-                "SOME_FOLDER1",
+                "DBGOUT",
                 "-e",
-                "o4-mini",
+                "gpt-4.1-mini",
                 "-p",
-                "translation_new"
+                "llmbar", // "translation_fortunecookie",
+
             ]
         }
     ]
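
Note: the updated debug args amount to invoking the evaluator with flags along these lines. The entry-point script name below is an assumption (the "program" field is outside the visible hunk); treat this as an illustrative sketch, not the repo's documented CLI:

    python <entry_point>.py -i rsc/inputs_for_dbg/dbg_llmbar_inputs/ -o DBGOUT -e gpt-4.1-mini -p llmbar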
eval_prompt_list.txt CHANGED
@@ -1,4 +1,4 @@
 llmbar
 translation_pair
 rag_pair_kr
-translation_new
+translation_fortunecookie
requirements_2025jun24.yaml ADDED
@@ -0,0 +1,160 @@
+name: va
+channels:
+  - conda-forge
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - bzip2=1.0.8=hd590300_5
+  - ca-certificates=2024.2.2=hbcca054_0
+  - ld_impl_linux-64=2.40=hf3520f5_1
+  - libexpat=2.6.2=h59595ed_0
+  - libffi=3.4.2=h7f98852_5
+  - libgcc-ng=13.2.0=h77fa898_7
+  - libgomp=13.2.0=h77fa898_7
+  - libnsl=2.0.1=hd590300_0
+  - libsqlite=3.45.3=h2797004_0
+  - libuuid=2.38.1=h0b41bf4_0
+  - libxcrypt=4.4.36=hd590300_1
+  - libzlib=1.2.13=h4ab18f5_6
+  - ncurses=6.5=h59595ed_0
+  - openssl=3.3.0=h4ab18f5_3
+  - pip=24.0=pyhd8ed1ab_0
+  - python=3.11.9=hb806964_0_cpython
+  - readline=8.2=h8228510_1
+  - setuptools=70.0.0=pyhd8ed1ab_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - wheel=0.43.0=pyhd8ed1ab_1
+  - xz=5.2.6=h166bdaf_0
+  - pip:
+      - aiofiles==23.2.1
+      - altair==5.4.1
+      - annotated-types==0.7.0
+      - anyio==4.4.0
+      - asttokens==2.4.1
+      - attrs==23.2.0
+      - blinker==1.8.2
+      - cachetools==5.5.0
+      - certifi==2024.2.2
+      - cfgv==3.4.0
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - comm==0.2.2
+      - contourpy==1.2.1
+      - cycler==0.12.1
+      - debugpy==1.8.2
+      - decorator==5.1.1
+      - distlib==0.3.8
+      - distro==1.9.0
+      - et-xmlfile==1.1.0
+      - executing==2.0.1
+      - fastapi==0.115.3
+      - fastchat==0.1.0
+      - ffmpy==0.4.0
+      - filelock==3.16.1
+      - fire==0.6.0
+      - fonttools==4.53.1
+      - fsspec==2024.9.0
+      - gitdb==4.0.11
+      - gitpython==3.1.43
+      - gradio==5.3.0
+      - gradio-client==1.4.2
+      - h11==0.14.0
+      - httpcore==1.0.5
+      - httpx==0.27.0
+      - huggingface-hub==0.25.1
+      - identify==2.6.1
+      - idna==3.7
+      - ipdb==0.13.13
+      - ipykernel==6.29.5
+      - ipython==8.26.0
+      - jedi==0.19.1
+      - jinja2==3.1.4
+      - jiter==0.7.1
+      - joblib==1.4.2
+      - jsonlines==4.0.0
+      - jsonschema==4.23.0
+      - jsonschema-specifications==2023.12.1
+      - jupyter-client==8.6.2
+      - jupyter-core==5.7.2
+      - kaleido==0.2.1
+      - kiwisolver==1.4.5
+      - markdown-it-py==3.0.0
+      - markupsafe==2.1.5
+      - matplotlib==3.9.1
+      - matplotlib-inline==0.1.7
+      - mdurl==0.1.2
+      - munch==4.0.0
+      - narwhals==1.8.3
+      - nest-asyncio==1.6.0
+      - networkx==3.3
+      - nodeenv==1.9.1
+      - numpy==1.26.4
+      - openai==1.54.4
+      - openpyxl==3.1.5
+      - orjson==3.10.10
+      - packaging==24.0
+      - pandas==2.2.2
+      - parso==0.8.4
+      - pexpect==4.9.0
+      - pillow==10.4.0
+      - platformdirs==4.2.2
+      - plotly==5.22.0
+      - pre-commit==3.8.0
+      - prompt-toolkit==3.0.47
+      - protobuf==5.28.2
+      - psutil==6.0.0
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - pyarrow==17.0.0
+      - pydantic==2.7.2
+      - pydantic-core==2.18.3
+      - pydeck==0.9.1
+      - pydub==0.25.1
+      - pygments==2.18.0
+      - pyparsing==3.1.2
+      - python-dateutil==2.9.0.post0
+      - python-multipart==0.0.12
+      - pytz==2024.1
+      - pyyaml==6.0.2
+      - pyzmq==26.0.3
+      - redis==5.0.4
+      - referencing==0.35.1
+      - regex==2024.5.15
+      - requests==2.32.3
+      - rich==13.8.1
+      - rpds-py==0.20.0
+      - ruff==0.7.0
+      - safetensors==0.4.5
+      - scikit-learn==1.5.0
+      - scipy==1.13.1
+      - seaborn==0.13.2
+      - semantic-version==2.10.0
+      - shellingham==1.5.4
+      - six==1.16.0
+      - smmap==5.0.1
+      - sniffio==1.3.1
+      - stack-data==0.6.3
+      - starlette==0.41.0
+      - streamlit==1.40.1
+      - tabulate==0.9.0
+      - tenacity==8.3.0
+      - termcolor==2.4.0
+      - threadpoolctl==3.5.0
+      - tiktoken==0.9.0
+      - tokenizers==0.20.0
+      - toml==0.10.2
+      - tomlkit==0.12.0
+      - tornado==6.4.1
+      - tqdm==4.66.4
+      - traitlets==5.14.3
+      - transformers==4.45.1
+      - typer==0.12.5
+      - typing-extensions==4.12.0
+      - tzdata==2024.1
+      - urllib3==2.2.1
+      - uvicorn==0.32.0
+      - virtualenv==20.26.6
+      - watchdog==4.0.2
+      - wcwidth==0.2.13
+      - websockets==12.0
+prefix: /home/deftson/miniconda3/envs/va
varco_arena/varco_arena_core/eval_utils.py CHANGED
@@ -72,29 +72,31 @@ async def async_query_openai(
         messages=completed_prompts,
         **prompt_obj.sampling_parameters,
     )
-
-    # o-series of models (reasoning models)
+
+    # logprobs are no longer used, remove for all models
+    if "logprobs" in kwargs:
+        kwargs.pop("logprobs")
+    if "top_logprobs" in kwargs:
+        kwargs.pop("top_logprobs")
+
+    # o-series of models (reasoning models)
     if model in [
         "o4-mini",
     ]:
-        # does not provide logprobs
-        kwargs.pop("logprobs")
-        kwargs.pop("top_logprobs")
         # does not allow temperature
-        kwargs.pop("temperature")
+        if "temperature" in kwargs:
+            kwargs.pop("temperature")
         # does not allow stop
-        kwargs.pop("stop")
+        if "stop" in kwargs:
+            kwargs.pop("stop")
         # max_completion_tokens is different from what I expect... does it count reasoning path too?
-        kwargs.pop("max_tokens")
-
-        # max_tokens = kwargs.pop("max_tokens")
-        # kwargs["max_completion_tokens"] = max_tokens
-
+        if "max_tokens" in kwargs:
+            kwargs.pop("max_tokens")
+
         # prefer developer role than system
         if kwargs["messages"][0]["role"] == "system":
             kwargs["messages"][0]["role"] = "developer"
-        # do not support max_tokens --> max_completion_tokens
-
+
     isopenai: bool = os.getenv("OPENAI_BASE_URL") == "https://api.openai.com/v1"
 
     # defining client here?...
@@ -137,8 +139,25 @@ async def async_query_openai(
     async with limiter:
         try:
             resp = await client.chat.completions.create(**kwargs)
-            postprocess_f = prompt_obj.prob_calibrated_output
-            normalized_result = postprocess_f(resp)
+
+            decision_tok = prompt_obj.parsed_output(resp)
+
+            prefer_1st_tok = prompt_obj.decision_tokens.get("prefer_1st")
+            prefer_2nd_tok = prompt_obj.decision_tokens.get("prefer_2nd")
+
+            if decision_tok == prefer_1st_tok:
+                normalized_result = {"prefer_1st": 1.0, "prefer_2nd": 0.0}
+            elif decision_tok == prefer_2nd_tok:
+                normalized_result = {"prefer_1st": 0.0, "prefer_2nd": 1.0}
+            else:  # fallback: tie or unexpected result
+                normalized_result = {"prefer_1st": 0.5, "prefer_2nd": 0.5}
+
+            normalized_result["model"] = resp.model
+            normalized_result["decision_by"] = "parsed_output"
+            normalized_result["target_tokens_found"] = [decision_tok]
+            normalized_result[
+                "no_target_tokens_found"
+            ] = decision_tok not in [prefer_1st_tok, prefer_2nd_tok]
             normalized_result["error"] = False
             normalized_result["exception_str"] = ""
         except Exception as e:
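
The judge now maps the single parsed decision token straight to a hard preference distribution instead of calibrating over logprobs. A minimal sketch of that mapping in isolation, assuming llmbar-style decision tokens "a"/"b" (stub values for illustration, not imported from the repo):

    from typing import Dict

    def map_decision(decision_tok: str,
                     prefer_1st_tok: str = "a",
                     prefer_2nd_tok: str = "b") -> Dict[str, float]:
        # hard 1/0 preference when the token matches either side
        if decision_tok == prefer_1st_tok:
            return {"prefer_1st": 1.0, "prefer_2nd": 0.0}
        if decision_tok == prefer_2nd_tok:
            return {"prefer_1st": 0.0, "prefer_2nd": 1.0}
        # anything else is scored as a tie, matching the fallback branch above
        return {"prefer_1st": 0.5, "prefer_2nd": 0.5}

    assert map_decision("a") == {"prefer_1st": 1.0, "prefer_2nd": 0.0}
    assert map_decision("?") == {"prefer_1st": 0.5, "prefer_2nd": 0.5}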
varco_arena/varco_arena_core/prompts/__init__.py CHANGED
@@ -8,14 +8,14 @@ from .llmbar import LLMBarPrompt
 from .llmbar_brief import LLMBarBriefPrompt
 from .rag_pair_kr import RagPairKRPrompt
 from .translation_pair import TranslationPairPrompt
-from .translation_new import TranslationNewPrompt
+from .translation_fortunecookie import TranslationNewPrompt
 
 NAME2PROMPT_CLS = dict(
     llmbar_brief=LLMBarBriefPrompt(),
     llmbar=LLMBarPrompt(),
     translation_pair=TranslationPairPrompt(),
     rag_pair_kr=RagPairKRPrompt(),
-    translation_new=TranslationNewPrompt(),
+    translation_fortunecookie=TranslationNewPrompt(),
     # contextual_vqa = Contextual_VQA(),
     # contextual_ocr = Contextual_OCR(),
 )
@@ -26,7 +26,7 @@ def load_prompt(
         "llmbar_brief",
         "llmbar",
         "translation_pair",
-        "translation_new",
+        "translation_fortunecookie",
         "rag_pair_kr",
     ],
     task: str = "",  # used for further prompt variation (eval prompt might depend on task.)
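
After the rename, callers select the prompt by its new registry key. A minimal usage sketch, assuming load_prompt takes the prompt name as its first positional argument (per the signature above) and that the package is importable as varco_arena_core:

    from varco_arena_core.prompts import load_prompt

    # presumably resolves via NAME2PROMPT_CLS to the TranslationNewPrompt instance
    prompt_obj = load_prompt("translation_fortunecookie", task="")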
varco_arena/varco_arena_core/prompts/llmbar.py CHANGED
@@ -1,3 +1,4 @@
+import random
 from pathlib import Path
 from typing import Dict, List, Optional
 
@@ -25,12 +26,42 @@ class LLMBarPrompt(ComparisonPromptBase):
         1. detect (a|b)
         2. get 'a|b' only
         """
-        raw_output = response.choices[0].message.content
-        for target_find in self.decision_tokens.values():
-            idx = raw_output.find(f"({target_find})")
-            if idx != -1:
-                break
-        res_tok = raw_output[idx + 1]
+        raw_output = response.choices[0].message.content.strip()
+
+        decision1 = self.decision_tokens['prefer_1st']  # 'a'
+        decision2 = self.decision_tokens['prefer_2nd']  # 'b'
+
+        # the prompt asks for an answer of the form "Output (a)" or "Output (b)", so check those patterns directly
+        output1_str = f"Output ({decision1})"
+        output2_str = f"Output ({decision2})"
+
+        # also check the bare-parentheses form
+        paren1_str = f"({decision1})"
+        paren2_str = f"({decision2})"
+
+        output1_present = output1_str in raw_output
+        output2_present = output2_str in raw_output
+        paren1_present = paren1_str in raw_output
+        paren2_present = paren2_str in raw_output
+
+        res_tok = None
+        if output1_present and not output2_present:
+            res_tok = decision1
+        elif output2_present and not output1_present:
+            res_tok = decision2
+        elif paren1_present and not paren2_present:
+            res_tok = decision1
+        elif paren2_present and not paren1_present:
+            res_tok = decision2
+        else:
+            # ambiguous: choose randomly (as in translation_new.py) and print a warning
+            res_tok = random.choice([decision1, decision2])
+            print("=" * 100)
+            print(f"raw response: '{raw_output}'")
+            print(f"randomly chosen decision: {res_tok}")
+            print("the response format was unexpected or ambiguous, so a random choice was made")
+            print("=" * 100)
+
         return res_tok
 
     def prob_calibrated_output(self, response) -> Dict[str, float]:
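
The new matching order ("Output (a|b)" first, bare parentheses second, random tie-break last) can be exercised with stub response objects. These stubs are illustrative only, not the project's real response types:

    from types import SimpleNamespace

    def fake_response(text: str):
        # mimics the response.choices[0].message.content shape used above
        return SimpleNamespace(
            choices=[SimpleNamespace(message=SimpleNamespace(content=text))]
        )

    # With decision_tokens = {'prefer_1st': 'a', 'prefer_2nd': 'b'}:
    #   "I pick Output (a)."          -> 'a'  (explicit "Output (a)" wins)
    #   "(b) reads more fluently."    -> 'b'  (bare-parenthesis fallback)
    #   "Output (a) ... Output (b)"   -> random choice plus a printed warning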
varco_arena/varco_arena_core/prompts/prompt_utils.py CHANGED
@@ -59,6 +59,7 @@ def get_tokenizer_from_model_name(
     if model_name in [
         "gpt-4.1",
         "gpt-4.1-mini",
+        "gpt-4.1-nano",
         "o4-mini",
     ]:
         tokenizer = tiktoken.encoding_for_model("gpt-4o")
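
For reference, this branch pins every listed model (now including gpt-4.1-nano) to the gpt-4o encoding. A quick check of what that resolves to, as tiktoken maps gpt-4o to o200k_base:

    import tiktoken

    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    print(tokenizer.name)                       # o200k_base
    print(len(tokenizer.encode("judge test")))  # token count under that encoding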
varco_arena/varco_arena_core/prompts/{translation_new.py → translation_fortunecookie.py} RENAMED
@@ -7,7 +7,7 @@ from .prompt_utils import fill_template_over_messsages
 import random
 
 class TranslationNewPrompt(LLMBarPrompt):
-    def __init__(self, prompt_yaml: str = "translation_new.yaml"):
+    def __init__(self, prompt_yaml: str = "translation_fortunecookie.yaml"):
         super().__init__(prompt_yaml=prompt_yaml)
 
     def parsed_output(self, response: Any) -> str:
varco_arena/varco_arena_core/prompts/{translation_new.yaml → translation_fortunecookie.yaml} RENAMED
File without changes