diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx index 06d3dd069..66b862ffa 100644 --- a/docs/source/metric-list.mdx +++ b/docs/source/metric-list.mdx @@ -61,3 +61,4 @@ These metrics need the model to generate an output. They are therefore slower. - `llm_judge_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. - `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench. - `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. It is used for multiturn tasks like mt-bench. +- `tvd_mi`: Corpus-level LLM-as-a-judge metric that estimates a lower bound on total variation mutual information using paired responses. It assumes each example has two responses and a binary label indicating whether they are from the same underlying item (`1`) or from different items (`0`), and computes `TPR + TNR - 1` from the judge’s binary decisions. diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/README.md b/examples/custom_tasks_templates/tvd_mi_synthetic/README.md new file mode 100644 index 000000000..0c892f2dd --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/README.md @@ -0,0 +1,33 @@ +# TVD-MI synthetic example (paired-response) + +This folder contains a tiny, synthetic paired-response dataset intended to demonstrate how to run the `tvd_mi` metric. + +## Data format + +The dataset is a `.jsonl` file where each line is a JSON object with: + +- `response_a` (str): first response in the pair +- `response_b` (str): second response in the pair +- `pair_label` (int): `1` if the two responses come from the same underlying item/task/source, `0` otherwise + +Example line: + +```json +{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital of France.","pair_label":1} +```` + +## What this example is (and isn’t) + +* ✅ A minimal, copyable example showing the expected fields for `tvd_mi` +* ✅ Useful as a template for building larger paired-response benchmarks +* ❌ Not intended to be a scientifically meaningful benchmark by itself + +## Running + +`tvd_mi` is an LLM-as-judge metric. To run with the OpenAI backend, set: + +```bash +export OPENAI_API_KEY=... +``` + +You can then load this dataset as Docs and evaluate with `tvd_mi` (see the Python loader in `tvd_mi_synthetic.py`). 
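For orientation, a minimal end-to-end sketch of that flow (assuming the helpers in `tvd_mi_synthetic.py` and the `JudgeLLMTVDMI` / `CorpusLevelTVDMI` classes added by this patch are importable, and that `OPENAI_API_KEY` is set for the OpenAI judge backend):

```python
# Hedged sketch: exact wiring may differ once the metric is registered on a task.
from tvd_mi_synthetic import load_default_docs

from lighteval.metrics.metrics_corpus import CorpusLevelTVDMI
from lighteval.metrics.metrics_sample import JudgeLLMTVDMI

docs = load_default_docs()                      # 5 paired-response Docs from the bundled JSONL
judge = JudgeLLMTVDMI()                         # GPT-4o judge with the A/B "same source?" prompt
items = judge.compute(responses=[], docs=docs)  # one {"label", "pred", ...} dict per pair
print(CorpusLevelTVDMI()(items))                # {"tvd_mi": TPR + TNR - 1}
```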
diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl new file mode 100644 index 000000000..5f5ce0734 --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.jsonl @@ -0,0 +1,5 @@ +{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital city of France.","pair_label":1} +{"response_a":"2 + 2 = 4.","response_b":"Four is the sum of two plus two.","pair_label":1} +{"response_a":"Water freezes at 0°C at standard atmospheric pressure.","response_b":"At 1 atm, water’s freezing point is 0 degrees Celsius.","pair_label":1} +{"response_a":"The capital of France is Paris.","response_b":"The mitochondria is the powerhouse of the cell.","pair_label":0} +{"response_a":"2 + 2 = 4.","response_b":"Photosynthesis converts light into chemical energy in plants.","pair_label":0} diff --git a/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py new file mode 100644 index 000000000..3ade21ca7 --- /dev/null +++ b/examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py @@ -0,0 +1,121 @@ +# MIT License +# Copyright (c) 2024 The HuggingFace Team + +""" +Minimal loader for the TVD-MI paired-response synthetic example. + +This module intentionally avoids tight coupling to task registries so it can be +used as a simple reference/template. It provides `read_jsonl()` and `build_docs()` +helpers to construct lighteval `Doc` objects with the fields expected by TVD-MI. + +Expected JSONL schema per line: + - response_a: str + - response_b: str + - pair_label: int (1=same, 0=different) +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + + +try: + # lighteval Doc type (preferred if available) + from lighteval.tasks.requests import Doc # type: ignore +except Exception: + # Fallback: minimal doc type for local testing / documentation purposes + @dataclass + class Doc: # type: ignore + query: str = "" + choices: list[str] | None = None + gold_index: int | list[int] | None = None + task_name: str | None = None + specific: dict[str, Any] | None = None + + +HERE = Path(__file__).resolve().parent +DEFAULT_DATA_PATH = HERE / "tvd_mi_synthetic.jsonl" + + +def read_jsonl(path: str | Path) -> list[dict[str, Any]]: + path = Path(path) + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as f: + for line_num, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON on line {line_num} of {path}: {e}") from e + return rows + + +def _set_attr_if_possible(obj: Any, name: str, value: Any) -> None: + """ + Try to set `obj.name = value`. Some Doc implementations may forbid new attributes. + """ + try: + setattr(obj, name, value) + except Exception: + # It's fine if Doc is strict; we always store in `specific` too. + pass + + +def build_docs(rows: Iterable[dict[str, Any]], task_name: str = "tvd_mi_synthetic") -> list[Doc]: + docs: list[Doc] = [] + for i, r in enumerate(rows): + if "response_a" not in r or "response_b" not in r or "pair_label" not in r: + raise ValueError( + f"Row {i} missing required keys. Expected response_a/response_b/pair_label. 
Got keys={list(r.keys())}" + ) + + response_a = str(r["response_a"]) + response_b = str(r["response_b"]) + pair_label = int(r["pair_label"]) + + # Create a minimal Doc. Many metrics/tests assume `query`/`choices` exist. + doc = Doc( + query="", + choices=[], + gold_index=0, + task_name=task_name, + specific={ + "response_a": response_a, + "response_b": response_b, + "pair_label": pair_label, + }, + ) + + # Also set direct attributes for compatibility with JudgeLLMTVDMI.compute as currently implemented. + _set_attr_if_possible(doc, "response_a", response_a) + _set_attr_if_possible(doc, "response_b", response_b) + _set_attr_if_possible(doc, "pair_label", pair_label) + + docs.append(doc) + + return docs + + +def load_default_docs() -> list[Doc]: + """ + Convenience helper to load the default JSONL shipped with this example folder. + """ + rows = read_jsonl(DEFAULT_DATA_PATH) + return build_docs(rows) + + +if __name__ == "__main__": + docs = load_default_docs() + print(f"Loaded {len(docs)} docs from {DEFAULT_DATA_PATH}") + print( + "First doc has attrs:", + hasattr(docs[0], "response_a"), + hasattr(docs[0], "response_b"), + hasattr(docs[0], "pair_label"), + ) + print("First doc specific keys:", list((docs[0].specific or {}).keys())) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 82cfbb706..ae3470465 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -35,6 +35,7 @@ CorpusLevelF1Score, CorpusLevelPerplexityMetric, CorpusLevelTranslationMetric, + CorpusLevelTVDMI, MatthewsCorrCoef, ) from lighteval.metrics.metrics_sample import ( @@ -51,13 +52,19 @@ Faithfulness, GPassAtK, JudgeLLMSimpleQA, + JudgeLLMTVDMI, LoglikelihoodAcc, MajAtN, PassAtK, Recall, StringDistance, + process_judge_response_tvdmi, +) +from lighteval.metrics.normalizations import ( + bigbench_normalizer, + remove_braces, + remove_braces_and_strip, ) -from lighteval.metrics.normalizations import bigbench_normalizer, remove_braces, remove_braces_and_strip from lighteval.metrics.sample_preparator import ( GenerativePreparator, LoglikelihoodPreparator, @@ -84,7 +91,10 @@ @scorer(metrics=[accuracy()]) def math_scorer(): gold_extraction_target = (ExprExtractionConfig(),) - pred_extraction_target = (ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)) + pred_extraction_target = ( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ) language = Language.ENGLISH fallback_mode = "first_match" extraction_mode = "first_match" @@ -95,10 +105,18 @@ def math_scorer(): async def score(state: TaskState, target: Target): extracted_predictions = extract_target_from_pred( - state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + state.output.completion, + pred_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) extracted_gold = extract_target_from_pred( - target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + target.text, + gold_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) return Score( # Correct or Incorrect, used by inspect-ai backend @@ -128,10 +146,18 @@ def multichoice_scorer(): async def score(state: TaskState, target: Target): extracted_predictions = extract_target_from_pred( - state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + state.output.completion, + pred_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) 
extracted_gold = extract_target_from_pred( - target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds + target.text, + gold_extraction_regexes, + fallback_mode, + extraction_mode, + timeout_seconds, ) return Score( # Correct or Incorrect, used by inspect-ai backend @@ -143,6 +169,45 @@ async def score(state: TaskState, target: Target): return score +@scorer(metrics=[accuracy(), stderr()]) +def tvd_mi_scorer(): + """ + Inspect-compatible scorer for TVD-MI pair classification. + """ + + def _normalize_gold(label: str) -> int | None: + if not isinstance(label, str): + return None + s = label.strip().upper() + if s in {"A", "SAME", "POSITIVE", "1"}: + return 1 + if s in {"B", "DIFFERENT", "NEGATIVE", "0"}: + return 0 + return None + + async def score(state: TaskState, target: Target) -> Score: + raw_pred = state.output.completion + # Interpretation mapping logic + pred_val = process_judge_response_tvdmi(raw_pred) + + try: + pred_label = int(pred_val) + except Exception: + pred_label = None + + gold_label = _normalize_gold(str(target.text)) + + correct = pred_label is not None and gold_label is not None and pred_label == gold_label + # Correct or Incorrect, used by inspect-ai backend + return Score( + value="C" if correct else "I", + explanation=raw_pred, + answer=str(pred_label), + ) + + return score + + class Metrics(Enum): acc_golds_likelihood = SampleLevelMetric( # todo: we need a better name for this! metric_name="acc", @@ -163,8 +228,14 @@ class Metrics(Enum): sample_level_fn=AvgAtN( sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + gold_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], + pred_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], precision=6, ), ), @@ -176,8 +247,16 @@ class Metrics(Enum): metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"BERTScore-P": np.mean, "BERTScore-R": np.mean, "BERTScore-F": np.mean}, - higher_is_better={"BERTScore-P": True, "BERTScore-R": True, "BERTScore-F": True}, + corpus_level_fn={ + "BERTScore-P": np.mean, + "BERTScore-R": np.mean, + "BERTScore-F": np.mean, + }, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, ) bits_per_byte = CorpusLevelMetric( metric_name="bits_per_byte", @@ -237,13 +316,30 @@ class Metrics(Enum): higher_is_better=True, ) copyright = SampleLevelMetricGrouping( - metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"], + metric_name=[ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ], sample_level_fn=StringDistance( - metric_types=["longest_common_prefix_length", "edit_distance", "edit_similarity"], strip_prediction=True + metric_types=[ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ], + strip_prediction=True, ), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max}, - higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True}, + corpus_level_fn={ + "longest_common_prefix_length": max, + "edit_distance": min, + "edit_similarity": max, + }, + 
higher_is_better={ + "longest_common_prefix_length": True, + "edit_distance": False, + "edit_similarity": True, + }, ) drop = SampleLevelMetricGrouping( metric_name=["em", "f1"], @@ -267,7 +363,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(ExprExtractionConfig(),), # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), category=SamplingMethod.GENERATIVE, @@ -275,9 +374,15 @@ class Metrics(Enum): higher_is_better=True, ) extractiveness = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -292,9 +397,16 @@ class Metrics(Enum): }, ) extractiveness_de = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="de" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="de", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -309,9 +421,16 @@ class Metrics(Enum): }, ) extractiveness_fr = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="fr" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="fr", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -326,9 +445,16 @@ class Metrics(Enum): }, ) extractiveness_it = SampleLevelMetricGrouping( - metric_name=["summarization_coverage", "summarization_density", "summarization_compression"], + metric_name=[ + "summarization_coverage", + "summarization_density", + "summarization_compression", + ], sample_level_fn=Extractiveness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="it" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", + language="it", ), category=SamplingMethod.GENERATIVE, corpus_level_fn={ @@ -366,7 +492,9 @@ class Metrics(Enum): faithfulness = SampleLevelMetric( metric_name="summac", sample_level_fn=Faithfulness( - normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text" + normalize_input=remove_braces, + normalize_pred=remove_braces_and_strip, + input_column="text", ), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, @@ -390,7 +518,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(ExprExtractionConfig(),), # Match boxed first before trying other 
regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), ), @@ -409,7 +540,10 @@ class Metrics(Enum): precision=5, gold_extraction_target=(LatexExtractionConfig(),), # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(boxed_match_priority=0), + ), aggregation_function=max, ), ), @@ -473,8 +607,14 @@ class Metrics(Enum): # Extracting mathematical expressions and latex expressions sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, - gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], - pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()], + gold_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], + pred_extraction_target=[ + ExprExtractionConfig(), + LatexExtractionConfig(), + ], precision=6, ), ), @@ -519,8 +659,18 @@ class Metrics(Enum): normalize_pred=bigbench_normalizer, ), category=SamplingMethod.GENERATIVE, - corpus_level_fn={"rouge1": np.mean, "rouge2": np.mean, "rougeL": np.mean, "rougeLsum": np.mean}, - higher_is_better={"rouge1": True, "rouge2": True, "rougeL": True, "rougeLsum": True}, + corpus_level_fn={ + "rouge1": np.mean, + "rouge2": np.mean, + "rougeL": np.mean, + "rougeLsum": np.mean, + }, + higher_is_better={ + "rouge1": True, + "rouge2": True, + "rougeL": True, + "rougeLsum": True, + }, ) rouge1 = SampleLevelMetric( metric_name="rouge1", @@ -593,10 +743,16 @@ class Metrics(Enum): sample_level_fn=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], pred_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], precision=6, ), @@ -610,10 +766,16 @@ class Metrics(Enum): sample_scoring_function=MultilingualExtractiveMatchMetric( language=Language.ENGLISH, gold_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], pred_extraction_target=[ - IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True) + IndicesExtractionConfig( + prefix_for_extraction="NativeLetters", + try_extract_without_anchor=True, + ) ], precision=6, ), @@ -622,6 +784,13 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) + tvd_mi = CorpusLevelMetric( + metric_name="tvd_mi", + sample_level_fn=JudgeLLMTVDMI(), + category=SamplingMethod.GENERATIVE, + corpus_level_fn=CorpusLevelTVDMI(), + higher_is_better=True, + ) def __str__(self): return self.name.replace("_at_", "@") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 92c2c574a..2cd630b54 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -107,7 +107,9 @@ def 
compute_corpus(self, items: list[LogprobCorpusMetricInput]): for i in range(self.num_classes): f1s.append( sklearn.metrics.f1_score( - y_true=[g == i for g in golds], y_pred=[p == i for p in preds], average=self.average + y_true=[g == i for g in golds], + y_pred=[p == i for p in preds], + average=self.average, ) ) return float(np.mean(f1s)) @@ -190,3 +192,22 @@ def compute_corpus(self, items: list[PerplexityCorpusMetricInput]): return math.exp(-sum(logprobs) / sum(weights)) if self.metric_type == "bits_per_byte": return -sum(logprobs) / sum(weights) * 1 / math.log(2) + + +class CorpusLevelTVDMI: + def __call__(self, items): + # items: list of dicts returned by JudgeLLMTVDMI.compute + labels = np.array([it["label"] for it in items]) + preds = np.array([it["pred"] for it in items]) + + pos = labels == 1 + neg = ~pos + + if pos.sum() == 0 or neg.sum() == 0: + return {"tvd_mi": float("nan")} + + tpr = (preds[pos] == 1).mean() + tnr = (preds[neg] == 0).mean() + tvd_mi = tpr + tnr - 1.0 + + return {"tvd_mi": float(tvd_mi)} diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d83e64e22..ad0c85be1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -51,7 +51,12 @@ remove_braces, remove_braces_and_strip, ) -from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa +from lighteval.metrics.utils.judge_utils import ( + get_judge_prompt_simpleqa, + get_judge_prompt_tvdmi, + process_judge_response_simpleqa, + process_judge_response_tvdmi, +) from lighteval.metrics.utils.llm_as_judge import JudgeLM from lighteval.models.model_output import ModelResponse from lighteval.tasks.requests import Doc @@ -643,7 +648,10 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str logger.warning("The first metric computation step might be a bit longer as we need to download the model.") # We only initialize on first compute self.bert_scorer = BERTScorer( - model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, num_layers=9 + model_type="microsoft/deberta-large-mnli", + lang="en", + rescale_with_baseline=True, + num_layers=9, ) golds = as_list(golds) predictions = as_list(predictions) @@ -655,7 +663,11 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str predictions = [self.normalize_pred(p) for p in predictions] p, r, f = self.bert_scorer.score(predictions, golds) - return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()} + return { + "BERTScore-P": p[0].item(), + "BERTScore-R": r[0].item(), + "BERTScore-F": f[0].item(), + } class Extractiveness(SampleLevelComputation): @@ -856,7 +868,11 @@ def __init__( metric_types (list[str] | str): Can be one or any of `longest_common_prefix_length`, `edit_distance` or `edit_similarity`. strip_prediction (bool, optional): Whether to strip the prediction. Defaults to True. 
""" - allowed_values = ["longest_common_prefix_length", "edit_distance", "edit_similarity"] + allowed_values = [ + "longest_common_prefix_length", + "edit_distance", + "edit_similarity", + ] metric_types = as_list(metric_types) if any(metric_type not in allowed_values for metric_type in metric_types): raise ValueError( @@ -864,7 +880,11 @@ def __init__( ) self.metric_types = metric_types self.strip_prediction = strip_prediction - self.sample_aggregations = {"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max} + self.sample_aggregations = { + "longest_common_prefix_length": max, + "edit_distance": min, + "edit_similarity": max, + } def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): """Computes all the requested metrics on the golds and prediction. @@ -940,7 +960,13 @@ def edit_similarity(self, s1, s2): class JudgeLLM(SampleLevelComputation): - available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"] + available_models_openai = [ + "gpt-3.5-turbo", + "gpt-4o", + "gpt-4-turbo", + "gpt-4", + "gpt-4o-2024-08-06", + ] def __init__( self, @@ -1065,10 +1091,14 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs) query_context_2 = {"query": questions[1], "context": predictions[0]} score_turn_1, message_turn_1, judgement_turn_1 = self.judge.evaluate_answer( - question=json.dumps(query_context_1, indent=2), answer=predictions[0], gold=golds[0] if golds else None + question=json.dumps(query_context_1, indent=2), + answer=predictions[0], + gold=golds[0] if golds else None, ) score_turn_2, message_turn_2, judgement_turn_2 = self.judge.evaluate_answer( - question=json.dumps(query_context_2, indent=2), answer=predictions[1], gold=golds[1] if golds else None + question=json.dumps(query_context_2, indent=2), + answer=predictions[1], + gold=golds[1] if golds else None, ) return { @@ -1106,6 +1136,50 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs): return metrics +class JudgeLLMTVDMI(JudgeLLM): + def __init__(self): + super().__init__( + judge_model_name="gpt-4o-2024-08-06", + template=get_judge_prompt_tvdmi, + process_judge_response=process_judge_response_tvdmi, + judge_backend="openai", + short_judge_name="gpt4o", + ) + + def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: + # For TVD-MI, the evaluated model is the judge; the “responses” from + # base models are already baked into docs as response_a / response_b. + def _get(d, k): + return ( + getattr(d, k, None) + if getattr(d, k, None) is not None + else (d.specific.get(k) if getattr(d, "specific", None) else None) + ) + + questions = [_get(d, "response_a") for d in docs] + answers = [_get(d, "response_b") for d in docs] + labels = [int(_get(d, "pair_label")) for d in docs] + + options = [None] * len(docs) + golds = [None] * len(docs) + + scores, prompts, judge_responses = self.judge.evaluate_answer_batch(questions, answers, options, golds) + + metrics = [] + for i in range(len(docs)): + pred = scores[i] # already 0/1 from process_judge_response_tvdmi + metrics.append( + { + "label": labels[i], + "pred": pred, + f"user_prompt_{self.short_judge_name}": prompts[i], + f"judgement_{self.short_judge_name}": judge_responses[i], + } + ) + + return metrics + + class SamplingMetric: """All sampling metrics we have defined below use the same set of normalization parameters and same behavior for the default sample_scoring_function. 
This class just holds the normalization and applies it to all samples passed to preprocess, then uses the default sample function if not provided. @@ -1115,7 +1189,7 @@ def __init__( self, normalize: Callable | str | None = None, strip_strings: bool = False, - sample_scoring_function: Callable[[Doc, ModelResponse], float] | str | None = None, + sample_scoring_function: (Callable[[Doc, ModelResponse], float] | str | None) = None, ): if isinstance(normalize, str): import lighteval.metrics.normalizations diff --git a/src/lighteval/metrics/utils/judge_utils.py b/src/lighteval/metrics/utils/judge_utils.py index cde25fd26..b78d00dc0 100644 --- a/src/lighteval/metrics/utils/judge_utils.py +++ b/src/lighteval/metrics/utils/judge_utils.py @@ -125,3 +125,49 @@ def process_judge_response_simpleqa(response: str) -> float: else: logger.warning(f"Unknown response from judge: {response}") return 0.0 + + +def get_judge_prompt_tvdmi( + question: str, + answer: str, + options: list[str] | None = None, + gold: str | None = None, + **kwargs, +): + # `question` = response_a, `answer` = response_b + TEMPLATE = """ +You are judging whether two responses are likely derived from the same underlying task or item. + +Consider ONLY whether the two responses plausibly come from the same task/source, not whether they are factually correct. + +Response A: +{response_a} + +Response B: +{response_b} + +Grade as: +A: SAME TASK/SOURCE +B: DIFFERENT TASK/SOURCE + +Just return the letter "A" or "B", with no other text. +""".strip() + + content = TEMPLATE.format(response_a=question, response_b=answer) + return [{"role": "user", "content": content}] + + +def process_judge_response_tvdmi(response: str) -> int: + # Normalize + if response is None: + return 0 + + cleaned = response.strip().lower() + + if cleaned == "a": + return 1 + elif cleaned == "b": + return 0 + else: + logger.warning(f"Unknown response from TVD-MI judge: {response!r}") + return 0 diff --git a/tests/unit/metrics/test_tvd_mi.py b/tests/unit/metrics/test_tvd_mi.py new file mode 100644 index 000000000..f2bb5e088 --- /dev/null +++ b/tests/unit/metrics/test_tvd_mi.py @@ -0,0 +1,185 @@ +import asyncio +import math +from dataclasses import dataclass + +import pytest + +from lighteval.metrics.metrics import tvd_mi_scorer +from lighteval.metrics.metrics_corpus import CorpusLevelTVDMI +from lighteval.metrics.metrics_sample import JudgeLLMTVDMI +from lighteval.metrics.utils.judge_utils import ( + get_judge_prompt_tvdmi, + process_judge_response_tvdmi, +) + + +def test_get_judge_prompt_tvdmi_injects_responses(): + question = "Resp A" + answer = "Resp B" + + messages = get_judge_prompt_tvdmi(question=question, answer=answer, options=None, gold=None) + + # Should be a single chat message + assert isinstance(messages, list) + assert len(messages) == 1 + msg = messages[0] + assert msg["role"] == "user" + + content = msg["content"] + # Basic structure checks + assert "Response A:" in content + assert "Response B:" in content + assert "Resp A" in content + assert "Resp B" in content + # Should mention A/B grading + assert "A:" in content + assert "B:" in content + + +def test_process_judge_response_tvdmi_maps_A_B(): + assert process_judge_response_tvdmi("A") == 1 + assert process_judge_response_tvdmi("B") == 0 + # Robust to case/whitespace + assert process_judge_response_tvdmi(" a \n") == 1 + assert process_judge_response_tvdmi(" b\t") == 0 + + +def test_process_judge_response_tvdmi_unknown_falls_back_to_0(caplog): + with caplog.at_level("WARNING"): + out = 
process_judge_response_tvdmi("weird") + assert out == 0 + # Optional: check that we actually logged something + assert any("TVD-MI judge" in rec.message for rec in caplog.records) + + +def test_corpus_level_tvdmi_perfect_critic(): + # Always correct on both positive and negative + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 1}, + {"label": 0, "pred": 0}, + {"label": 0, "pred": 0}, + ] + + result = CorpusLevelTVDMI()(items) + assert "tvd_mi" in result + assert result["tvd_mi"] == pytest.approx(1.0) + + +def test_corpus_level_tvdmi_random_critic(): + # 50% TPR, 50% TNR → TVD-MI = 0 + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 0}, + {"label": 0, "pred": 0}, + {"label": 0, "pred": 1}, + ] + + result = CorpusLevelTVDMI()(items) + assert result["tvd_mi"] == pytest.approx(0.0) + + +def test_corpus_level_tvdmi_missing_class_returns_nan(): + # No negatives → TVD-MI undefined + items = [ + {"label": 1, "pred": 1}, + {"label": 1, "pred": 0}, + ] + + result = CorpusLevelTVDMI()(items) + assert math.isnan(result["tvd_mi"]) + + +@dataclass +class FakeDoc: + response_a: str + response_b: str + pair_label: int + + +def test_judge_tvdmi_compute(monkeypatch): + judge = JudgeLLMTVDMI() + + # Two examples: one positive, one negative + docs = [ + FakeDoc("A1", "A2", 1), + FakeDoc("B1", "B2", 0), + ] + + # Fake judge backend: we want to check what arguments it receives, + # and return deterministic scores/prompts/responses. + def fake_evaluate_answer_batch(questions, answers, options, golds, **kwargs): + # Input wiring checks + assert questions == ["A1", "B1"] + assert answers == ["A2", "B2"] + assert options == [None, None] + assert golds == [None, None] + + scores = [1, 0] # predict SAME for first, DIFFERENT for second + prompts = ["prompt-0", "prompt-1"] + responses = ["A", "B"] # raw judge outputs + return scores, prompts, responses + + # Attach a fake .judge with our method + class FakeInnerJudge: + def evaluate_answer_batch(self, *args, **kwargs): + return fake_evaluate_answer_batch(*args, **kwargs) + + monkeypatch.setattr(judge, "judge", FakeInnerJudge()) + + metrics = judge.compute(responses=[], docs=docs) + + assert len(metrics) == 2 + + # Check labels and preds propagated correctly + assert metrics[0]["label"] == 1 + assert metrics[0]["pred"] == 1 + assert metrics[1]["label"] == 0 + assert metrics[1]["pred"] == 0 + + # Check extra fields exist (names match your short_judge_name) + assert any(k.startswith("user_prompt_") for k in metrics[0].keys()) + assert any(k.startswith("judgement_") for k in metrics[0].keys()) + + +# ---- Inspect-compatible scorer tests ---- + + +class _DummyOutput: + def __init__(self, completion: str): + self.completion = completion + + +class _DummyState: + def __init__(self, completion: str): + self.output = _DummyOutput(completion) + + +class _DummyTarget: + def __init__(self, text: str): + self.text = text + + +def test_tvd_mi_scorer_matches_label_A(): + """The inspect-ai tvd_mi_scorer should mark matching 'A' labels as correct.""" + scorer_fn = tvd_mi_scorer() + + state = _DummyState("A") # model/judge output + target = _DummyTarget("A") # gold label + + score = asyncio.run(scorer_fn(state, target)) + + assert score.value == "C" + assert score.answer == "1" # normalized positive class + + +def test_tvd_mi_scorer_mismatch_is_incorrect(): + """Mismatched A/B labels should be scored as incorrect.""" + scorer_fn = tvd_mi_scorer() + + state = _DummyState("B") # model says DIFFERENT + target = _DummyTarget("A") # gold SAME + + score = 
asyncio.run(scorer_fn(state, target)) + + assert score.value == "I"
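A further corpus-level case could be sketched along the same lines to make the `TPR + TNR - 1` arithmetic concrete for a partially correct critic; it only assumes the `CorpusLevelTVDMI` semantics already exercised above:

```python
def test_corpus_level_tvdmi_partial_critic():
    """Mixed critic: TPR = 2/3, TNR = 1/2, so TVD-MI = 2/3 + 1/2 - 1 = 1/6."""
    items = [
        {"label": 1, "pred": 1},
        {"label": 1, "pred": 1},
        {"label": 1, "pred": 0},  # one missed positive
        {"label": 0, "pred": 0},
        {"label": 0, "pred": 1},  # one false positive
    ]

    result = CorpusLevelTVDMI()(items)
    assert result["tvd_mi"] == pytest.approx(1 / 6)
```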