diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx index 06d3dd069..66b862ffa 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -61,3 +61,4 @@ These metrics need the model to generate an output. They are therefore slower. - `llm_judge_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. - `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench. - `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. It is used for multiturn tasks like mt-bench. +- `tvd_mi`: Corpus-level LLM-as-a-judge metric that estimates a lower bound on total variation mutual information using paired responses. It assumes each example has two responses and a binary label indicating whether they are from the same underlying item (`1`) or from different items (`0`), and computes `TPR + TNR - 1` from the judge’s binary decisions. diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/README.md b/examples/custom_tasks_templates/tvd_mi_synthetic/README.md new file mode 100644 index 000000000..0c892f2dd --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/README.md @@ -0,0 +1,33 @@ +# TVD-MI synthetic example (paired-response) + +This folder contains a tiny, synthetic paired-response dataset intended to demonstrate how to run the `tvd_mi` metric. + +## Data format + +The dataset is a `.jsonl` file where each line is a JSON object with: + +- `response_a` (str): first response in the pair +- `response_b` (str): second response in the pair +- `pair_label` (int): `1` if the two responses come from the same underlying item/task/source, `0` otherwise + +Example line: + +```json +{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital of France.","pair_label":1} +```` + +## What this example is (and isn’t) + +* ✅ A minimal, copyable example showing the expected fields for `tvd_mi` +* ✅ Useful as a template for building larger paired-response benchmarks +* ❌ Not intended to be a scientifically meaningful benchmark by itself + +## Running + +`tvd_mi` is an LLM-as-judge metric. To run with the OpenAI backend, set: + +```bash +export OPENAI_API_KEY=... +``` + +You can then load this dataset as Docs and evaluate with `tvd_mi` (see the Python loader in `tvd_mi_synthetic.py`). 
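For orientation, a minimal end-to-end sketch of that flow (assuming the helpers in `tvd_mi_synthetic.py` and the `JudgeLLMTVDMI` / `CorpusLevelTVDMI` classes added by this patch are importable, and that `OPENAI_API_KEY` is set for the OpenAI judge backend):

```python
# Hedged sketch: exact wiring may differ once the metric is registered on a task.
from tvd_mi_synthetic import load_default_docs

from lighteval.metrics.metrics_corpus import CorpusLevelTVDMI
from lighteval.metrics.metrics_sample import JudgeLLMTVDMI

docs = load_default_docs()                      # 5 paired-response Docs from the bundled JSONL
judge = JudgeLLMTVDMI()                         # GPT-4o judge with the A/B "same source?" prompt
items = judge.compute(responses=[], docs=docs)  # one {"label", "pred", ...} dict per pair
print(CorpusLevelTVDMI()(items))                # {"tvd_mi": TPR + TNR - 1}
```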
diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl new file mode 100644 index 000000000..5f5ce0734 --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl @@ -0,0 +1,5 @@ +{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital city of France.","pair_label":1} +{"response_a":"2 + 2 = 4.","response_b":"Four is the sum of two plus two.","pair_label":1} +{"response_a":"Water freezes at 0°C at standard atmospheric pressure.","response_b":"At 1 atm, water’s freezing point is 0 degrees Celsius.","pair_label":1} +{"response_a":"The capital of France is Paris.","response_b":"The mitochondria is the powerhouse of the cell.","pair_label":0} +{"response_a":"2 + 2 = 4.","response_b":"Photosynthesis converts light into chemical energy in plants.","pair_label":0} diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py new file mode 100644 index 000000000..3ade21ca7 --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py @@ -0,0 +1,121 @@ +# MIT License +# Copyright (c) 2024 The HuggingFace Team + +""" +Minimal loader for the TVD-MI paired-response synthetic example. + +This module intentionally avoids tight coupling to task registries so it can be +used as a simple reference/template. It provides `read_jsonl()` and `build_docs()` +helpers to construct lighteval `Doc` objects with the fields expected by TVD-MI. + +Expected JSONL schema per line: + - response_a: str + - response_b: str + - pair_label: int (1=same, 0=different) +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + + +try: + # lighteval Doc type (preferred if available) + from lighteval.tasks.requests import Doc # type: ignore +except Exception: + # Fallback: minimal doc type for local testing / documentation purposes + @dataclass + class Doc: # type: ignore + query: str = "" + choices: list[str] | None = None + gold_index: int | list[int] | None = None + task_name: str | None = None + specific: dict[str, Any] | None = None + + +HERE = Path(__file__).resolve().parent +DEFAULT_DATA_PATH = HERE / "tvd_mi_synthetic.jsonl" + + +def read_jsonl(path: str | Path) -> list[dict[str, Any]]: + path = Path(path) + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as f: + for line_num, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON on line {line_num} of {path}: {e}") from e + return rows + + +def _set_attr_if_possible(obj: Any, name: str, value: Any) -> None: + """ + Try to set `obj.name = value`. Some Doc implementations may forbid new attributes. + """ + try: + setattr(obj, name, value) + except Exception: + # It's fine if Doc is strict; we always store in `specific` too. + pass + + +def build_docs(rows: Iterable[dict[str, Any]], task_name: str = "tvd_mi_synthetic") -> list[Doc]: + docs: list[Doc] = [] + for i, r in enumerate(rows): + if "response_a" not in r or "response_b" not in r or "pair_label" not in r: + raise ValueError( + f"Row {i} missing required keys. Expected response_a/response_b/pair_label. 
Got keys={list(r.keys())}" + ) + + response_a = str(r["response_a"]) + response_b = str(r["response_b"]) + pair_label = int(r["pair_label"]) + + # Create a minimal Doc. Many metrics/tests assume `query`/`choices` exist. + doc = Doc( + query="", + choices=[], + gold_index=0, + task_name=task_name, + specific={ + "response_a": response_a, + "response_b": response_b, + "pair_label": pair_label, + }, + ) + + # Also set direct attributes for compatibility with JudgeLLMTVDMI.compute as currently implemented. + _set_attr_if_possible(doc, "response_a", response_a) + _set_attr_if_possible(doc, "response_b", response_b) + _set_attr_if_possible(doc, "pair_label", pair_label) + + docs.append(doc) + + return docs + + +def load_default_docs() -> list[Doc]: + """ + Convenience helper to load the default JSONL shipped with this example folder. + """ + rows = read_jsonl(DEFAULT_DATA_PATH) + return build_docs(rows) + + +if __name__ == "__main__": + docs = load_default_docs() + print(f"Loaded {len(docs)} docs from {DEFAULT_DATA_PATH}") + print( + "First doc has attrs:", + hasattr(docs[0], "response_a"), + hasattr(docs[0], "response_b"), + hasattr(docs[0], "pair_label"), + ) + print("First doc specific keys:", list((docs[0].specific or {}).keys())) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 82cfbb706..ae3470465 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -35,6 +35,7 @@ CorpusLevelF1Score, CorpusLevelPerplexityMetric, CorpusLevelTranslationMetric, + CorpusLevelTVDMI, MatthewsCorrCoef, ) from lighteval.metrics.metrics_sample import ( @@ -51,13 +52,19 @@ Faithfulness, GPassAtK, JudgeLLMSimpleQA, + JudgeLLMTVDMI, LoglikelihoodAcc, MajAtN, PassAtK, Recall, StringDistance, + process_judge_response_tvdmi, +) +from lighteval.metrics.normalizations import ( + bigbench_normalizer, + remove_braces, + remove_braces_and_strip, ) -from lighteval.metrics.normalizations import bigbench_normalizer, remove_braces, remove_braces_and_strip from lighteval.metrics.sample_preparator import ( GenerativePreparator, LoglikelihoodPreparator, @@ -84,7 +91,10 @@ @scorer(metrics=[accuracy()]) def math_scorer(): gold_extraction_target = (ExprExtractionConfig(),) - pred_extraction_target = (ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)) + pred_extraction_target = ( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ) language = Language.ENGLISH fallback_mode = "first_match" extraction_mode = "first_match" @@ -95,10 +105,18 @@ def math_scorer(): async def score(state: TaskState, target: Target): extracted_predictions = extract_target_from_pred( - state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + state.output.completion, + pred_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) extracted_gold = extract_target_from_pred( - target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + target.text, + gold_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) return Score( # Correct or Incorrect, used by inspect-ai backend @@ -128,10 +146,18 @@ def multichoice_scorer(): async def score(state: TaskState, target: Target): extracted_predictions = extract_target_from_pred( - state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + state.output.completion, + pred_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) 
extracted_gold = extract_target_from_pred( - target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + target.text, + gold_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) return Score( # Correct or Incorrect, used by inspect-ai backend @@ -143,6 +169,45 @@ async def score(state: TaskState, target: Target): return score +@scorer(metrics=[accuracy(), stderr()]) +def tvd_mi_scorer(): + """ + Inspect-compatible scorer for TVD-MI pair classification. + """ + + def _normalize_gold(label: str) -> int | None: + if not isinstance(label, str): + return None + s = label.strip().upper() + if s in {"A", "SAME", "POSITIVE", "1"}: + return 1 + if s in {"B", "DIFFERENT", "NEGATIVE", "0"}: + return 0 + return None + + async def score(state: TaskState, target: Target) -> Score: + raw_pred = state.output.completion + # Interpretation mapping logic + pred_val = process_judge_response_tvdmi(raw_pred) + + try: + pred_label = int(pred_val) + except Exception: + pred_label = None + + gold_label = _normalize_gold(str(target.text)) + + correct = pred_label is not None and gold_label is not None and pred_label == gold_label + # Correct or Incorrect, used by inspect-ai backend + return Score( + value="C" if correct else "I", + explanation=raw_pred, + answer=str(pred_label), + ) + + return score + + class Metrics(Enum): acc_golds_likelihood = SampleLevelMetric( # todo: we need a better name for this! metric_name="acc", @@ -163,8 +228,14 @@ class Metrics(Enum): sample_level_fn=AvgAtN( sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + gold_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], + pred_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], precision=6, ), ), @@ -176,8 +247,16 @@ class Metrics(Enum): metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"BERTScore-P": np.mean, "BERTScore-R": np.mean, "BERTScore-F": np.mean}, - higher_is_better={"BERTScore-P": True, "BERTScore-R": True, "BERTScore-F": True}, + corpus_level_fn={ + "BERTScore-P": np.mean, + "BERTScore-R": np.mean, + "BERTScore-F": np.mean, + }, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, ) bits_per_byte = CorpusLevelMetric( metric_name="bits_per_byte", @@ -237,13 +316,30 @@ class Metrics(Enum): higher_is_better=True, ) copyright = SampleLevelMetricGrouping( - metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"], + metric_name=[ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ], sample_level_fn=StringDistance( - metric_types=["longest_common_prefix_length", "edit_distance", "edit_similarity"], strip_prediction=True + metric_types=[ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ], + strip_prediction=True, ), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max}, - higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True}, + corpus_level_fn={ + "longest_common_prefix_length": max, + "edit_distance": min, + "edit_similarity": max, + }, + 
higher_is_better={ + "longest_common_prefix_length": True, + "edit_distance": False, + "edit_similarity": True, + }, ) drop = SampleLevelMetricGrouping( metric_name=["em", "f1"], @@ -267,7 +363,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(ExprExtractionConfig(),), # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), category=SamplingMethod.GENERATIVE, @@ -275,9 +374,15 @@ class Metrics(Enum): higher_is_better=True, ) extractiveness = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -292,9 +397,16 @@ class Metrics(Enum): }, ) extractiveness_de = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="de" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="de", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -309,9 +421,16 @@ class Metrics(Enum): }, ) extractiveness_fr = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="fr" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="fr", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -326,9 +445,16 @@ class Metrics(Enum): }, ) extractiveness_it = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="it" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="it", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -366,7 +492,9 @@ class Metrics(Enum): faithfulness = SampleLevelMetric( metric_name="summac", sample_level_fn=Faithfulness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -390,7 +518,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(ExprExtractionConfig(),), # Match boxed first before trying other 
regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), ), @@ -409,7 +540,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(LatexExtractionConfig(),), # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), ), @@ -473,8 +607,14 @@ class Metrics(Enum): # Extracting mathematical expressions and latex expressions sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + gold_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], + pred_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], precision=6, ), ), @@ -519,8 +659,18 @@ class Metrics(Enum): normalize_pred=bigbench_normalizer, ), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"rouge1": np.mean, "rouge2": np.mean, "rougeL": np.mean, "rougeLsum": np.mean}, - higher_is_better={"rouge1": True, "rouge2": True, "rougeL": True, "rougeLsum": True}, + corpus_level_fn={ + "rouge1": np.mean, + "rouge2": np.mean, + "rougeL": np.mean, + "rougeLsum": np.mean, + }, + higher_is_better={ + "rouge1": True, + "rouge2": True, + "rougeL": True, + "rougeLsum": True, + }, ) rouge1 = SampleLevelMetric( metric_name="rouge1", @@ -593,10 +743,16 @@ class Metrics(Enum): sample_level_fn=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], pred_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], precision=6, ), @@ -610,10 +766,16 @@ class Metrics(Enum): sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], pred_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], precision=6, ), @@ -622,6 +784,13 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) + tvd_mi = CorpusLevelMetric( + metric_name="tvd_mi", + sample_level_fn=JudgeLLMTVDMI(), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=CorpusLevelTVDMI(), + higher_is_better=True, + ) def __str__(self): return self.name.replace("_at_", "@") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 92c2c574a..2cd630b54 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -107,7 +107,9 @@ def 
compute_corpus(self, items: list[LogprobCorpusMetricInput]): for i in range(self.num_classes): f1s.append( sklearn.metrics.f1_score( - y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average + y_true=[g == i for g in golds], + y_pred=[p == i for p in preds], + average=self.average, ) ) return float(np.mean(f1s)) @@ -190,3 +192,22 @@ def compute_corpus(self, items: list[PerplexityCorpusMetricInput]): return math.exp(-sum(logprobs) / sum(weights)) if self.metric_type == "bits_per_byte": return -sum(logprobs) / sum(weights) * 1 / math.log(2) + + +class CorpusLevelTVDMI: + def __call__(self, items): + # items: list of dicts returned by JudgeLLMTVDMI.compute + labels = np.array([it["label"] for it in items]) + preds = np.array([it["pred"] for it in items]) + + pos = labels == 1 + neg = ~pos + + if pos.sum() == 0 or neg.sum() == 0: + return {"tvd_mi": float("nan")} + + tpr = (preds[pos] == 1).mean() + tnr = (preds[neg] == 0).mean() + tvd_mi = tpr + tnr - 1.0 + + return {"tvd_mi": float(tvd_mi)} diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d83e64e22..ad0c85be1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -51,7 +51,12 @@ remove_braces, remove_braces_and_strip, ) -from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa +from lighteval.metrics.utils.judge_utils import ( + get_judge_prompt_simpleqa, + get_judge_prompt_tvdmi, + process_judge_response_simpleqa, + process_judge_response_tvdmi, +) from lighteval.metrics.utils.llm_as_judge import JudgeLM from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc @@ -643,7 +648,10 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str logger.warning("The first metric computation step might be a bit longer as we need to download the model.") # We only initialize on first compute self.bert_scorer = BERTScorer( - model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, num_layers=9 + model_type="microsoft/deberta-large-mnli", + lang="en", + rescale_with_baseline=True, + num_layers=9, ) golds = as_list(golds) predictions = as_list(predictions) @@ -655,7 +663,11 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str predictions = [self.normalize_pred(p) for p in predictions] p, r, f = self.bert_scorer.score(predictions, golds) - return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()} + return { + "BERTScore-P": p[0].item(), + "BERTScore-R": r[0].item(), + "BERTScore-F": f[0].item(), + } class Extractiveness(SampleLevelComputation): @@ -856,7 +868,11 @@ def __init__( metric_types (list[str] | str): Can be one or any of `longest_common_prefix_length`, `edit_distance` or `edit_similarity`. strip_prediction (bool, optional): Whether to strip the prediction. Defaults to True. 
""" - allowed_values = ["longest_common_prefix_length", "edit_distance", "edit_similarity"] + allowed_values = [ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ] metric_types = as_list(metric_types) if any(metric_type not in allowed_values for metric_type in metric_types): raise ValueError( @@ -864,7 +880,11 @@ def __init__( ) self.metric_types = metric_types self.strip_prediction = strip_prediction - self.sample_aggregations = {"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max} + self.sample_aggregations = { + "longest_common_prefix_length": max, + "edit_distance": min, + "edit_similarity": max, + } def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): """Computes all the requested metrics on the golds and prediction. @@ -940,7 +960,13 @@ def edit_similarity(self, s1, s2): class JudgeLLM(SampleLevelComputation): - available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"] + available_models_openai = [ + "gpt-3.5-turbo", + "gpt-4o", + "gpt-4-turbo", + "gpt-4", + "gpt-4o-2024-08-06", + ] def __init__( self, @@ -1065,10 +1091,14 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) query_context_2 = {"query": questions[1], "context": predictions[0]} score_turn_1, message_turn_1, judgement_turn_1 = self.judge.evaluate_answer( - question=json.dumps(query_context_1, indent=2), answer=predictions[0], gold=golds[0] if golds else None + question=json.dumps(query_context_1, indent=2), + answer=predictions[0], + gold=golds[0] if golds else None, ) score_turn_2, message_turn_2, judgement_turn_2 = self.judge.evaluate_answer( - question=json.dumps(query_context_2, indent=2), answer=predictions[1], gold=golds[1] if golds else None + question=json.dumps(query_context_2, indent=2), + answer=predictions[1], + gold=golds[1] if golds else None, ) return { @@ -1106,6 +1136,50 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): return metrics +class JudgeLLMTVDMI(JudgeLLM): + def __init__(self): + super().__init__( + judge_model_name="gpt-4o-2024-08-06", + template=get_judge_prompt_tvdmi, + process_judge_response=process_judge_response_tvdmi, + judge_backend="openai", + short_judge_name="gpt4o", + ) + + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + # For TVD-MI, the evaluated model is the judge; the “responses” from + # base models are already baked into docs as response_a / response_b. + def _get(d, k): + return ( + getattr(d, k, None) + if getattr(d, k, None) is not None + else (d.specific.get(k) if getattr(d, "specific", None) else None) + ) + + questions = [_get(d, "response_a") for d in docs] + answers = [_get(d, "response_b") for d in docs] + labels = [int(_get(d, "pair_label")) for d in docs] + + options = [None] * len(docs) + golds = [None] * len(docs) + + scores, prompts, judge_responses = self.judge.evaluate_answer_batch(questions, answers, options, golds) + + metrics = [] + for i in range(len(docs)): + pred = scores[i] # already 0/1 from process_judge_response_tvdmi + metrics.append( + { + "label": labels[i], + "pred": pred, + f"user_prompt_{self.short_judge_name}": prompts[i], + f"judgement_{self.short_judge_name}": judge_responses[i], + } + ) + + return metrics + + class SamplingMetric: """All sampling metrics we have defined below use the same set of normalization parameters and same behavior for the default sample_scoring_function. 
This class just holds the normalization and applies it to all samples passed to preprocess, then uses the default sample function if not provided. @@ -1115,7 +1189,7 @@ def __init__( self, normalize: Callable | str | None = None, strip_strings: bool = False, - sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, + sample_scoring_function: (Callable[[Doc, ModelResponse], float] | str | None) = None, ): if isinstance(normalize, str): import lighteval.metrics.normalizations diff --git a/src/lighteval/metrics/utils/judge_utils.py b/src/lighteval/metrics/utils/judge_utils.py index cde25fd26..b78d00dc0 100644 --- a/src/lighteval/metrics/utils/judge_utils.py +++ b/src/lighteval/metrics/utils/judge_utils.py @@ -125,3 +125,49 @@ def process_judge_response_simpleqa(response: str) -> float: else: logger.warning(f"Unknown response from judge: {response}") return 0.0 + + +def get_judge_prompt_tvdmi( + question: str, + answer: str, + options: list[str] | None = None, + gold: str | None = None, + **kwargs, +): + # `question` = response_a, `answer` = response_b + TEMPLATE = """ +You are judging whether two responses are likely derived from the same underlying task or item. + +Consider ONLY whether the two responses plausibly come from the same task/source, not whether they are factually correct. + +Response A: +{response_a} + +Response B: +{response_b} + +Grade as: +A: SAME TASK/SOURCE +B: DIFFERENT TASK/SOURCE + +Just return the letter "A" or "B", with no other text. +""".strip() + + content = TEMPLATE.format(response_a=question, response_b=answer) + return [{"role": "user", "content": content}] + + +def process_judge_response_tvdmi(response: str) -> int: + # Normalize + if response is None: + return 0 + + cleaned = response.strip().lower() + + if cleaned == "a": + return 1 + elif cleaned == "b": + return 0 + else: + logger.warning(f"Unknown response from TVD-MI judge: {response!r}") + return 0 diff --git a/tests/unit/metrics/test_tvd_mi.py b/tests/unit/metrics/test_tvd_mi.py new file mode 100644 index 000000000..f2bb5e088 --- /dev/null +++ b/tests/unit/metrics/test_tvd_mi.py @@ -0,0 +1,185 @@ +import asyncio +import math +from dataclasses import dataclass + +import pytest + +from lighteval.metrics.metrics import tvd_mi_scorer +from lighteval.metrics.metrics_corpus import CorpusLevelTVDMI +from lighteval.metrics.metrics_sample import JudgeLLMTVDMI +from lighteval.metrics.utils.judge_utils import ( + get_judge_prompt_tvdmi, + process_judge_response_tvdmi, +) + + +def test_get_judge_prompt_tvdmi_injects_responses(): + question = "Resp A" + answer = "Resp B" + + messages = get_judge_prompt_tvdmi(question=question, answer=answer, options=None, gold=None) + + # Should be a single chat message + assert isinstance(messages, list) + assert len(messages) == 1 + msg = messages[0] + assert msg["role"] == "user" + + content = msg["content"] + # Basic structure checks + assert "Response A:" in content + assert "Response B:" in content + assert "Resp A" in content + assert "Resp B" in content + # Should mention A/B grading + assert "A:" in content + assert "B:" in content + + +def test_process_judge_response_tvdmi_maps_A_B(): + assert process_judge_response_tvdmi("A") == 1 + assert process_judge_response_tvdmi("B") == 0 + # Robust to case/whitespace + assert process_judge_response_tvdmi(" a \n") == 1 + assert process_judge_response_tvdmi(" b\t") == 0 + + +def test_process_judge_response_tvdmi_unknown_falls_back_to_0(caplog): + with caplog.at_level("WARNING"): + out = 
process_judge_response_tvdmi("weird") + assert out == 0 + # Optional: check that we actually logged something + assert any("TVD-MI judge" in rec.message for rec in caplog.records) + + +def test_corpus_level_tvdmi_perfect_critic(): + # Always correct on both positive and negative + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 1}, + {"label": 0, "pred": 0}, + {"label": 0, "pred": 0}, + ] + + result = CorpusLevelTVDMI()(items) + assert "tvd_mi" in result + assert result["tvd_mi"] == pytest.approx(1.0) + + +def test_corpus_level_tvdmi_random_critic(): + # 50% TPR, 50% TNR → TVD-MI = 0 + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 0}, + {"label": 0, "pred": 0}, + {"label": 0, "pred": 1}, + ] + + result = CorpusLevelTVDMI()(items) + assert result["tvd_mi"] == pytest.approx(0.0) + + +def test_corpus_level_tvdmi_missing_class_returns_nan(): + # No negatives → TVD-MI undefined + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 0}, + ] + + result = CorpusLevelTVDMI()(items) + assert math.isnan(result["tvd_mi"]) + + +@dataclass +class FakeDoc: + response_a: str + response_b: str + pair_label: int + + +def test_judge_tvdmi_compute(monkeypatch): + judge = JudgeLLMTVDMI() + + # Two examples: one positive, one negative + docs = [ + FakeDoc("A1", "A2", 1), + FakeDoc("B1", "B2", 0), + ] + + # Fake judge backend: we want to check what arguments it receives, + # and return deterministic scores/prompts/responses. + def fake_evaluate_answer_batch(questions, answers, options, golds, **kwargs): + # Input wiring checks + assert questions == ["A1", "B1"] + assert answers == ["A2", "B2"] + assert options == [None, None] + assert golds == [None, None] + + scores = [1, 0] # predict SAME for first, DIFFERENT for second + prompts = ["prompt-0", "prompt-1"] + responses = ["A", "B"] # raw judge outputs + return scores, prompts, responses + + # Attach a fake .judge with our method + class FakeInnerJudge: + def evaluate_answer_batch(self, *args, **kwargs): + return fake_evaluate_answer_batch(*args, **kwargs) + + monkeypatch.setattr(judge, "judge", FakeInnerJudge()) + + metrics = judge.compute(responses=[], docs=docs) + + assert len(metrics) == 2 + + # Check labels and preds propagated correctly + assert metrics[0]["label"] == 1 + assert metrics[0]["pred"] == 1 + assert metrics[1]["label"] == 0 + assert metrics[1]["pred"] == 0 + + # Check extra fields exist (names match your short_judge_name) + assert any(k.startswith("user_prompt_") for k in metrics[0].keys()) + assert any(k.startswith("judgement_") for k in metrics[0].keys()) + + +# ---- Inspect-compatible scorer tests ---- + + +class _DummyOutput: + def __init__(self, completion: str): + self.completion = completion + + +class _DummyState: + def __init__(self, completion: str): + self.output = _DummyOutput(completion) + + +class _DummyTarget: + def __init__(self, text: str): + self.text = text + + +def test_tvd_mi_scorer_matches_label_A(): + """The inspect-ai tvd_mi_scorer should mark matching 'A' labels as correct.""" + scorer_fn = tvd_mi_scorer() + + state = _DummyState("A") # model/judge output + target = _DummyTarget("A") # gold label + + score = asyncio.run(scorer_fn(state, target)) + + assert score.value == "C" + assert score.answer == "1" # normalized positive class + + +def test_tvd_mi_scorer_mismatch_is_incorrect(): + """Mismatched A/B labels should be scored as incorrect.""" + scorer_fn = tvd_mi_scorer() + + state = _DummyState("B") # model says DIFFERENT + target = _DummyTarget("A") # gold SAME + + score = 
asyncio.run(scorer_fn(state, target)) + + assert score.value == "I"
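A further corpus-level case could be sketched along the same lines to make the `TPR + TNR - 1` arithmetic concrete for a partially correct critic; it only assumes the `CorpusLevelTVDMI` semantics already exercised above:

```python
def test_corpus_level_tvdmi_partial_critic():
    """Mixed critic: TPR = 2/3, TNR = 1/2, so TVD-MI = 2/3 + 1/2 - 1 = 1/6."""
    items = [
        {"label": 1, "pred": 1},
        {"label": 1, "pred": 1},
        {"label": 1, "pred": 0},  # one missed positive
        {"label": 0, "pred": 0},
        {"label": 0, "pred": 1},  # one false positive
    ]

    result = CorpusLevelTVDMI()(items)
    assert result["tvd_mi"] == pytest.approx(1 / 6)
```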