removed prob_calibrated_output logic, modified complete_prompt, polished docs
- varco_arena/main.py +2 -2
- varco_arena/setup.py +1 -1
- varco_arena/varco_arena_core/data_utils.py +2 -2
- varco_arena/varco_arena_core/elo.py +1 -1
- varco_arena/varco_arena_core/eval_utils.py +26 -10
- varco_arena/varco_arena_core/prompts/base_prompt.py +29 -9
- varco_arena/varco_arena_core/prompts/llmbar.py +81 -81
varco_arena/main.py
CHANGED
@@ -75,9 +75,9 @@ def main(
        return

    # prompt user whether to continue
-    flag = input("[*] Run …
+    flag = input("[*] Run Arena-Lite? (y/n) : ")
    if not flag.lower() == "y" and not flag.lower() == "yes":
-        print("[-] …
+        print("[-] Arena-Lite Stopped")
        return

    manager = Manager(
varco_arena/setup.py
CHANGED
@@ -19,7 +19,7 @@ setup(
    version="1.0.0",  # package version
    author="Text AI Lab, Generation Model Team",  # author name
    author_email="ncsoft_generationmodelteamd@ncsoft.com",  # author email
-    description="…
+    description="Arena-Lite is a project that determines which model is the best, based on the outputs each model generated.",  # brief description of the package
    packages=find_packages(),  # automatically find and add the package folders
    install_requires=required_packages,  # other packages required to install this package
    dependency_links=["git+https://github.com/shobrook/openlimit.git#egg=openlimit"],
varco_arena/varco_arena_core/data_utils.py
CHANGED
@@ -108,9 +108,9 @@ def _fill_mcguffin_field(df: pd.DataFrame) -> pd.DataFrame:


def num_uniq_queries(dataset: List[Dict]) -> int:
-    """use this function to check if the input jsonl files are suitable to intended use of …
+    """use this function to check if the input jsonl files are suitable to intended use of Arena-Lite

-    … (1 removed line truncated)
+    Arena-Lite: Let {n_models} of LLMs compete each other on {len_file} number of instructions
    --> dataset.instruction + dataset.source shouldn't change by adding another model output file, if it does, it is would be a buggy input.
    """
    df = pd.DataFrame(dataset)
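The docstring above encodes an invariant: the set of (instruction, source) pairs must be identical across every per-model output file, so adding another model's jsonl must not change the unique-query count. A minimal standalone sketch of that check; the function and field names here are illustrative, not the repo's exact API:

# Hypothetical illustration of the invariant described in the docstring above.
from typing import Dict, List

import pandas as pd


def unique_query_keys(dataset: List[Dict]) -> set:
    """Return the set of (instruction, source) pairs found in one jsonl file."""
    df = pd.DataFrame(dataset)
    return set(zip(df["instruction"], df["source"]))


def check_queries_stable(per_model_datasets: List[List[Dict]]) -> bool:
    """True if every model output file covers exactly the same queries."""
    key_sets = [unique_query_keys(d) for d in per_model_datasets]
    return all(keys == key_sets[0] for keys in key_sets)


if __name__ == "__main__":
    model_a = [{"instruction": "Translate to French", "source": "hello", "generated": "bonjour"}]
    model_b = [{"instruction": "Translate to French", "source": "hello", "generated": "salut"}]
    print(check_queries_stable([model_a, model_b]))  # True: same queries, different outputs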
varco_arena/varco_arena_core/elo.py
CHANGED
@@ -26,7 +26,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0

-    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the …
+    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
    if (Y == 0).all():
        print(WARNING.format(L=32))
        Y[-1] = 1.0
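For context on the warning: compute_mle_elo fits Elo scores as a logistic regression over pairwise outcomes (the Bradley-Terry style recipe), and scikit-learn's LogisticRegression refuses to fit when Y contains only one class, which is exactly the perfectly one-sided case the message describes; flipping one label keeps the fit from crashing at the cost of a slightly distorted rating. A minimal sketch of the idea, not the repo's exact implementation:

# Minimal sketch of MLE Elo via logistic regression (Bradley-Terry style),
# similar in spirit to compute_mle_elo() above; the repo's exact code may differ.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def mle_elo(df: pd.DataFrame, scale: int = 400, base: int = 10, init_rating: int = 1000) -> pd.Series:
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    idx = {m: i for i, m in enumerate(models)}

    # one row per match: +log(base) for model_a, -log(base) for model_b
    X = np.zeros((len(df), len(models)))
    for row, (ma, mb) in enumerate(zip(df["model_a"], df["model_b"])):
        X[row, idx[ma]] = +np.log(base)
        X[row, idx[mb]] = -np.log(base)

    Y = np.zeros(len(df))
    Y[df["winner"] == "A"] = 1.0

    if (Y == 0).all() or (Y == 1).all():
        # LogisticRegression needs two classes in Y; a perfectly one-sided
        # result sheet is the situation the WARNING above is about.
        Y[-1] = 1.0 - Y[-1]

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)
    return pd.Series(scale * lr.coef_[0] + init_rating, index=models)


matches = pd.DataFrame(
    {"model_a": ["m1", "m1", "m2"], "model_b": ["m2", "m3", "m3"], "winner": ["A", "A", "B"]}
)
print(mle_elo(matches).sort_values(ascending=False))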
varco_arena/varco_arena_core/eval_utils.py
CHANGED
@@ -195,18 +195,34 @@ async def async_eval_w_prompt(
    # prob_calibration: bool = True, # default True.
):
    # complete the prompt
-    kwargs = …
-    … (9 more removed lines truncated)
+    kwargs = vars(position_1st).copy()
+    kwargs["out_a"] = position_1st.generated
+    kwargs["out_b"] = position_2nd.generated
+    # Remove 'generated' from kwargs if it was copied from position_1st, as it's now 'out_a'
+    if "generated" in kwargs:
+        del kwargs["generated"]
+    # Remove 'instruction', 'source', 'task' if they are not needed as separate keys
+    # and are already covered by the vars(position_1st).copy()
+    # For now, keep them as they might be explicitly used in some templates.
+    # If 'inst', 'src', 'task' are expected as specific keys in templates,
+    # ensure they are present.
+    # Assuming 'instruction' -> 'inst', 'source' -> 'src' etc. is handled by prompt templates.
+    # If not, explicit mapping might be needed.
+    # For now, let's assume the template uses the original field names from input.jsonl.
+    # If the template expects 'inst' but input.jsonl has 'instruction',
+    # then a mapping is required.
+    # Based on the original code, it seems 'inst', 'src', 'task' are the expected keys.
+    # Let's ensure these are explicitly set if they exist in position_1st.
+    if hasattr(position_1st, "instruction"):
+        kwargs["inst"] = position_1st.instruction
+    if hasattr(position_1st, "source"):
+        kwargs["src"] = position_1st.source
+    if hasattr(position_1st, "source_lang"):
        kwargs["source_lang"] = position_1st.source_lang
+    if hasattr(position_1st, "target_lang"):
        kwargs["target_lang"] = position_1st.target_lang
+    if hasattr(position_1st, "task"):
+        kwargs["task"] = position_1st.task

    completed_prompt = prompt_obj.complete_prompt(**kwargs)
    # print(completed_prompt[-1]["content"])
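The block above builds the keyword arguments that complete_prompt substitutes into the judge prompt: every field of the first match position is copied, the two candidate outputs are renamed to out_a / out_b, and template-facing aliases such as inst / src are added when present. A minimal sketch of that flow with string.Template; the position object and field names below are illustrative, not the repo's exact schema:

# Minimal sketch of the kwargs -> template substitution flow, assuming a simple
# namespace-like "position" object.
from string import Template
from types import SimpleNamespace

position_1st = SimpleNamespace(instruction="Summarize the text.", source="...", generated="Summary A")
position_2nd = SimpleNamespace(instruction="Summarize the text.", source="...", generated="Summary B")

kwargs = vars(position_1st).copy()        # start from every field of the first position
kwargs["out_a"] = position_1st.generated  # rename the two outputs for the judge prompt
kwargs["out_b"] = position_2nd.generated
kwargs.pop("generated", None)             # 'generated' is now carried by out_a / out_b
kwargs["inst"] = position_1st.instruction # a key the prompt template expects

user_turn = Template("Instruction:\n${inst}\n\nOutput (a):\n${out_a}\n\nOutput (b):\n${out_b}")
print(user_turn.substitute(**kwargs))     # unused extra keys are simply ignored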
varco_arena/varco_arena_core/prompts/base_prompt.py
CHANGED
@@ -45,15 +45,35 @@ class ComparisonPromptBase:
        sampling_parameters = raw_d["sampling_parameters"]

        decision_tokens = raw_d["decision_tokens"]
-        prompt_template = [
-        … (8 more removed lines truncated)
+        prompt_template = []
+        for message in raw_d["prompt_template"]:
+            content = message["content"]
+            if "${task}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{task}}'. "
+                    "The 'task' field is reserved for prompt branching logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${model_id}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{model_id}}'. "
+                    "The 'model_id' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${generated}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{generated}}'. "
+                    "The 'generated' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            prompt_template.append(
+                {
+                    "role": message["role"],
+                    "content": Template(content)
+                    if get_fields(content)
+                    else content,
+                }
+            )

        expected_generation_str = raw_d["expected_generation_str"]

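The new loop rejects reserved placeholders (task, model_id, generated) and wraps a message's content in string.Template only when it actually contains ${...} fields. A sketch of how such a loop and a get_fields helper might look; get_fields here is a stand-in for the repo's own helper and may not match its implementation:

# Sketch of the template handling above; helper names are stand-ins.
import re
from string import Template

RESERVED = ("task", "model_id", "generated")


def get_fields(content: str) -> list:
    """Return ${...} placeholder names appearing in a template string."""
    return re.findall(r"\$\{([_a-zA-Z][_a-zA-Z0-9]*)\}", content)


def build_prompt_template(messages: list, yamlf: str = "<prompt.yaml>") -> list:
    prompt_template = []
    for message in messages:
        content = message["content"]
        for field in RESERVED:
            if f"${{{field}}}" in content:
                raise ValueError(
                    f"The prompt template in {yamlf} contains '${{{field}}}', "
                    "which is reserved and cannot be used as a template variable."
                )
        prompt_template.append(
            {
                "role": message["role"],
                # only wrap in Template when there is something to substitute
                "content": Template(content) if get_fields(content) else content,
            }
        )
    return prompt_template


msgs = [
    {"role": "system", "content": "You are a strict judge."},
    {"role": "user", "content": "Instruction: ${inst}\n(a): ${out_a}\n(b): ${out_b}"},
]
tmpl = build_prompt_template(msgs)
print(type(tmpl[0]["content"]), type(tmpl[1]["content"]))  # plain str vs Template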
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED
@@ -64,88 +64,88 @@ class LLMBarPrompt(ComparisonPromptBase):

        return res_tok

-    def prob_calibrated_output(self, response) -> Dict[str, float]:
-    … (40 more removed lines truncated)
+    # def prob_calibrated_output(self, response) -> Dict[str, float]:
+    #     """
+    #     ChatCompletionTokenLogProb:
+    #         token: str
+    #         logprob: float
+    #         top_logprobs: List[
+    #             TopLogprob:
+    #                 token: str
+    #                 logprob: float
+    #         ]
+    #     or
+    #     vllm response object (logprob struct differs)
+
+    #     """
+
+    #     # focus to the token of interest
+    #     # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format
+    #     res_tok = self.parsed_output(response)
+    #     if res_tok == None:  # if not found, return None
+    #         return None
+
+    #     isopenai: bool = is_model_from_openai(response=response)
+    #     found_tokens: list = []
+    #     if isopenai:
+    #         try:
+    #             logprobs = response.choices[0].logprobs.content
+    #             top_logprob_list = find_logprob_of_a_token_openai(
+    #                 logprobs=logprobs, token=res_tok
+    #             ).top_logprobs
+    #         except Exception as e:
+    #             print(f"(Warning) logprob method not implemented for this prompt")
+    #             print(f"use token result from parsed_output: {res_tok}")
+    #             probs_normalized = {
+    #                 "prefer_1st": 1.0 if "a" in res_tok.lower() else 0.0,
+    #                 "prefer_2nd": 1.0 if "b" in res_tok.lower() else 0.0,
+    #                 "target_tokens_found": [res_tok],
+    #                 "no_target_tokens_found": False,
+    #                 "model": response.model,
+    #                 "decision_by": "parsed_output",
+    #             }
+    #             return probs_normalized

-    … (40 removed lines truncated)
+    #         # explore the `top_logprobs_list` list and then gather
+    #         probs_of_interest_pos = dict()
+    #         for k, tok in self.decision_tokens.items():
+    #             lp_obj = find_logprob_of_a_token_openai(
+    #                 logprobs=top_logprob_list, token=tok
+    #             )
+
+    #             if lp_obj is not None:
+    #                 logp = lp_obj.logprob
+    #                 found_tokens.append(tok)
+
+    #                 p = np.exp(logp)
+    #                 probs_of_interest_pos[k] = p  # prefer_1st = a_prob
+    #     else:
+    #         # vllm/openai=0.4.2, 0.5.5
+    #         probs_of_interest_pos = dict()
+    #         top_logprobs_d = top_logprob_of_a_token_vllm(
+    #             response=response, token=res_tok
+    #         )
+    #         for k, tok in self.decision_tokens.items():
+    #             if tok in top_logprobs_d.keys():
+    #                 probs_of_interest_pos[k] = np.exp(top_logprobs_d[tok])
+    #                 found_tokens.append(tok)
+
+    #     # normalize
+    #     norm = sum(
+    #         probs_of_interest_pos.values()
+    #     )  # even for empty dict, this wouldn't cause divbyzero error
+    #     probs_normalized = {k: v / norm for k, v in probs_of_interest_pos.items()}
+    #     probs_normalized["target_tokens_found"] = found_tokens
+    #     probs_normalized["no_target_tokens_found"] = not bool(found_tokens)
+    #     probs_normalized["model"] = response.model
+
+    #     if not found_tokens:  # empty dict, no toks in self.decision_tokens.values()
+    #         # for the ease of downstream processings: set values as exact .5 (tie case)
+    #         probs_normalized["prefer_1st"] = 0.5
+    #         probs_normalized["prefer_2nd"] = 0.5
+    #     probs_normalized["decision_by"] = "normalized likelihood"
+
+    #     return probs_normalized

    def complete_prompt(
        self,
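The method commented out above implemented a "normalized likelihood" decision: look up the logprobs of the decision tokens (e.g. "a" / "b") at the answer position, exponentiate, and normalize them into preference probabilities, falling back to a 0.5 / 0.5 tie when neither token appears. A small standalone sketch of that idea, assuming top logprobs were requested from an OpenAI-style chat completion; this is not the repo's exact code:

# Standalone sketch of the "normalized likelihood" decision removed above.
import numpy as np


def normalized_preference(top_logprobs: dict, decision_tokens: dict) -> dict:
    """top_logprobs maps token -> logprob at the decision position;
    decision_tokens maps e.g. {"prefer_1st": "a", "prefer_2nd": "b"}."""
    probs, found = {}, []
    for key, tok in decision_tokens.items():
        if tok in top_logprobs:
            probs[key] = float(np.exp(top_logprobs[tok]))
            found.append(tok)

    if not found:
        # no decision token among the top logprobs: fall back to a tie
        return {"prefer_1st": 0.5, "prefer_2nd": 0.5, "no_target_tokens_found": True}

    norm = sum(probs.values())
    out = {k: v / norm for k, v in probs.items()}
    out.setdefault("prefer_1st", 0.0)
    out.setdefault("prefer_2nd", 0.0)
    out["no_target_tokens_found"] = False
    return out


# e.g. logprobs gathered from response.choices[0].logprobs.content[i].top_logprobs
print(normalized_preference({"a": -0.22, "b": -1.61}, {"prefer_1st": "a", "prefer_2nd": "b"}))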