diff --git a/src/lighteval/tasks/tasks/multi_challenge.py b/src/lighteval/tasks/tasks/multi_challenge.py
new file mode 100644
index 000000000..684bf27b5
--- /dev/null
+++ b/src/lighteval/tasks/tasks/multi_challenge.py
@@ -0,0 +1,172 @@
+"""
+name:
+MultiChallenge
+
+dataset:
+nmayorga7/multichallenge
+
+abstract:
+MultiChallenge evaluates large language models (LLMs) on their ability to
+conduct multi-turn conversations with human users.
+The model is given a target question belonging to one or
+more axes (categories) and must provide a free-form answer.
+The evaluation uses a secondary judge model to determine whether the
+answer satisfies the pass criteria for that question.
+
+languages:
+english
+
+tags:
+conversational, generation, instruction-following
+
+paper:
+https://arxiv.org/abs/2501.17399
+
+starred:
+true
+"""
+
+from inspect_ai.dataset import Sample
+from inspect_ai.model import ChatMessageAssistant, ChatMessageUser
+from inspect_ai.scorer import Score, Target, accuracy, model_graded_fact, scorer, stderr
+from inspect_ai.solver import Generate, TaskState, generate, solver

+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+JUDGE_PROMPT = """You are tasked with evaluating a model response to see if it meets a specific criterion.
+The criterion will always be a YES/NO evaluation.
+
+The model response is as follows:
+
+{answer}
+
+
+The criterion that the model response must meet is as follows. Be VERY STRICT!:
+
+{criterion}
+
+
+Print your reasoning followed by your verdict, either "YES" or "NO"."""
+
+
+def multi_challenge_prompt(line, task_name: str = None):
+    """Stub prompt function: this task only runs on the inspect-ai backend, which does not use it."""
+    return Doc(
+        task_name=task_name,
+        query="",
+        choices=[],
+        gold_index=0,
+    )
+
+
+@scorer(metrics=[accuracy(), stderr()])
+def multi_challenge_scorer():
+    base_scorer = model_graded_fact(
+        template=JUDGE_PROMPT,
+        grade_pattern=r"\b(YES|NO)\b",
+        model="openai/gpt-4o-2024-08-06",
+    )
+
+    async def score(state: TaskState, target: Target):
+        base_score = await base_scorer(state, target)
+        judge_verdict = base_score.value.upper() if base_score.value else None
+
+        if not judge_verdict or judge_verdict not in ["YES", "NO"]:
+            return Score(
+                value="I",
+                answer=base_score.answer,
+                explanation=f"Could not extract a valid verdict from judge output: {base_score.explanation}",
+            )
+
+        pass_criteria = state.metadata.get("pass_criteria", "")
+        if pass_criteria not in ["YES", "NO"]:
+            return Score(
+                value="I",
+                answer=base_score.answer,
+                explanation=f"Invalid pass criteria: {pass_criteria}",
+            )
+
+        passed = judge_verdict == pass_criteria
+
+        return Score(
+            value="C" if passed else "I",
+            answer=base_score.answer,
+            explanation=base_score.explanation,
+        )
+
+    return score
+
+
+@solver
+def conversation_solver():
+    """Solver that rebuilds the conversation history from sample metadata."""
+
+    async def solve(state: TaskState, generate: Generate):
+        conversation = state.metadata.get("conversation", [])
+
+        # The conversation already ends with the final user turn (which is also the
+        # sample input), so rebuild the message history rather than appending to it.
+        if conversation:
+            messages = []
+            for msg in conversation:
+                role = msg["role"].lower()
+                content = msg["content"]
+
+                if role == "user":
+                    messages.append(ChatMessageUser(content=content))
+                elif role == "assistant":
+                    messages.append(ChatMessageAssistant(content=content))
+                else:
+                    raise ValueError(f"Unsupported role: {role} in conversation.")
+
+            state.messages = messages
+
+        return state
+
+    return solve
+
+
+def record_to_sample(record: dict) -> Sample:
+    """Convert dataset record to inspect-ai Sample object."""
+    conversation = record["CONVERSATION"]
+
+    last_msg = None
+    for msg in reversed(conversation):
+        if msg["role"] == "user":
+            last_msg = msg["content"]
+            break
+
+    return Sample(
+        input=last_msg or "",
+        target=record["TARGET_QUESTION"],
+        metadata={
+            "question_id": record["QUESTION_ID"],
+            "axis": record["AXIS"],
+            "pass_criteria": record["PASS_CRITERIA"],
+            "conversation": conversation,
+            "length": len(conversation),
+        },
+    )
+
+
+multi_challenge = LightevalTaskConfig(
+    name="multi_challenge",
+    prompt_function=multi_challenge_prompt,
+    hf_repo="nmayorga7/multichallenge",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=2048,
+    stop_sequence=[],
+    version=0,
+    sample_fields=record_to_sample,
+    metrics=[],  # Metrics are defined in the scorer decorator for inspect-ai tasks
+    solver=[conversation_solver(), generate(cache=True)],
+    scorer=multi_challenge_scorer(),
+)
+
+TASKS_TABLE = [multi_challenge]
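
For reviewers unfamiliar with the dataset layout, here is a minimal sketch of how record_to_sample maps one raw row into an inspect-ai Sample. The upper-case keys mirror the fields this file reads from nmayorga7/multichallenge, but the values below are invented for illustration only; the import path simply follows the location of the new module.

# Illustrative only: a hand-written record shaped like the rows this task expects.
from lighteval.tasks.tasks.multi_challenge import record_to_sample

record = {
    "QUESTION_ID": "example-001",        # made-up id
    "AXIS": "INSTRUCTION_RETENTION",     # made-up axis label
    "PASS_CRITERIA": "YES",              # verdict the judge must return for a pass
    "TARGET_QUESTION": "Does the reply keep the three-word constraint from turn 1?",
    "CONVERSATION": [
        {"role": "user", "content": "Answer in exactly three words from now on."},
        {"role": "assistant", "content": "Understood, three words."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
}

sample = record_to_sample(record)
print(sample.input)                      # last user turn: "What is the capital of France?"
print(sample.metadata["pass_criteria"])  # "YES"
print(sample.metadata["length"])         # 3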