Fix TruthfulQA generative evaluation bugs

Duc Hoang · Duc Hoang · commit ff898bbdf9ec · 2025-12-10T18:18:08.000-08:00
1. Fix KeyError: the mc1_targets field only exists in the multiple_choice
   subset, not in the generation subset used by the truthfulqa:gen task, so
   the specific={"len_mc1": ...} lookup on it is removed

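2. Fix an operator-precedence bug in answer processing: the conditional
   expression bound more loosely than `+`, so any answer not already ending
   in "." was replaced by a bare "." instead of keeping its text and having
   a period appended. This affected both the correct and the incorrect
   answer lists. Whitespace-only answers are now filtered out as well.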

These fixes make truthfulqa:gen functional for proper evaluation.
Task format: lighteval|truthfulqa:gen|0
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -2478,20 +2478,19 @@ def truthful_qa_multiple_choice(line, task_name: str = None):
 
 def truthful_qa_generative(line, task_name: str = None):  # BLEU and combination of BLEU
     correct_answers = [
-        answer.strip() + "" if answer[-1] == "." else "." for answer in line["correct_answers"] if answer != ""
+        answer.strip() + ("" if answer.strip().endswith(".") else ".") for answer in line["correct_answers"] if answer.strip() != ""
     ]
     if "I have no comment." not in correct_answers:
         correct_answers.append("I have no comment.")
     incorrect_answers = [
-        answer.strip() + "" if answer[-1] == "." else "." for answer in line["incorrect_answers"] if answer != ""
+        answer.strip() + ("" if answer.strip().endswith(".") else ".") for answer in line["incorrect_answers"] if answer.strip() != ""
     ]
 
     return Doc(
         task_name=task_name,
         query=line["question"].strip(),
         choices=correct_answers + incorrect_answers,
         gold_index=list(range(len(correct_answers))),
-        specific={"len_mc1": len(line["mc1_targets"]["choices"])},
     )
 
 

Original file line number	Diff line number	Diff line change
`@@ -2478,20 +2478,19 @@ def truthful_qa_multiple_choice(line, task_name: str = None):`
`2478`	`2478`
`2479`	`2479`	`def truthful_qa_generative(line, task_name: str = None): # BLEU and combination of BLEU`
`2480`	`2480`	`correct_answers = [`
`2481`		`- answer.strip() + "" if answer[-1] == "." else "." for answer in line["correct_answers"] if answer != ""`
	`2481`	`+ answer.strip() + ("" if answer.strip().endswith(".") else ".") for answer in line["correct_answers"] if answer.strip() != ""`
`2482`	`2482`	`]`
`2483`	`2483`	`if "I have no comment." not in correct_answers:`
`2484`	`2484`	`correct_answers.append("I have no comment.")`
`2485`	`2485`	`incorrect_answers = [`
`2486`		`- answer.strip() + "" if answer[-1] == "." else "." for answer in line["incorrect_answers"] if answer != ""`
	`2486`	`+ answer.strip() + ("" if answer.strip().endswith(".") else ".") for answer in line["incorrect_answers"] if answer.strip() != ""`
`2487`	`2487`	`]`
`2488`	`2488`
`2489`	`2489`	`return Doc(`
`2490`	`2490`	`task_name=task_name,`
`2491`	`2491`	`query=line["question"].strip(),`
`2492`	`2492`	`choices=correct_answers + incorrect_answers,`
`2493`	`2493`	`gold_index=list(range(len(correct_answers))),`
`2494`		`- specific={"len_mc1": len(line["mc1_targets"]["choices"])},`
`2495`	`2494`	`)`
`2496`	`2495`
`2497`	`2496`