docs/source/metric-list.mdx (1 addition, 0 deletions)
@@ -61,3 +61,4 @@ These metrics need the model to generate an output. They are therefore slower.
- `llm_judge_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API.
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like mt-bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API. It is used for multi-turn tasks like mt-bench.
- `tvd_mi`: Corpus-level LLM-as-a-judge metric that estimates a lower bound on total variation mutual information (TVD-MI) from paired responses. It assumes each example has two responses and a binary label indicating whether they come from the same underlying item (`1`) or from different items (`0`), and computes `TPR + TNR - 1` from the judge's binary decisions (see the sketch below).
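To make the corpus-level aggregation concrete, here is a minimal sketch of the `TPR + TNR - 1` computation, assuming the judge's binary decisions and the gold pairing labels have already been collected into flat lists. The function and argument names are illustrative, not the actual lighteval implementation (which is the `CorpusLevelTVDMI` class imported in the diff below).

```python
# Minimal sketch of the TVD-MI lower bound described above; names are
# illustrative, not the actual lighteval implementation.
# `decisions` holds the judge's binary outputs (1 = "same item"),
# `labels` the gold pairing (1 = same underlying item, 0 = different).

def tvd_mi_lower_bound(decisions: list[int], labels: list[int]) -> float:
    same = [d for d, y in zip(decisions, labels) if y == 1]
    diff = [d for d, y in zip(decisions, labels) if y == 0]
    tpr = sum(same) / len(same)                 # P(judge says "same" | same item)
    tnr = sum(1 - d for d in diff) / len(diff)  # P(judge says "diff" | different items)
    return tpr + tnr - 1                        # Youden's J statistic


# Judge is right on 4 of 6 pairs -> 2/3 + 2/3 - 1 = 1/3
print(tvd_mi_lower_bound([1, 1, 0, 0, 1, 0], [1, 1, 1, 0, 0, 0]))  # 0.333...
```

A perfectly informative judge scores 1, while a random or constant judge scores 0, which is why the statistic serves as a lower bound rather than a point estimate.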
src/lighteval/metrics/metrics.py (163 additions, 34 deletions)
@@ -35,6 +35,7 @@
CorpusLevelF1Score,
CorpusLevelPerplexityMetric,
CorpusLevelTranslationMetric,
CorpusLevelTVDMI,
MatthewsCorrCoef,
)
from lighteval.metrics.metrics_sample import (
@@ -51,13 +52,18 @@
Faithfulness,
GPassAtK,
JudgeLLMSimpleQA,
JudgeLLMTVDMI,
LoglikelihoodAcc,
MajAtN,
PassAtK,
Recall,
StringDistance,
)
from lighteval.metrics.normalizations import bigbench_normalizer, remove_braces, remove_braces_and_strip
from lighteval.metrics.normalizations import (
bigbench_normalizer,
remove_braces,
remove_braces_and_strip,
)
from lighteval.metrics.sample_preparator import (
GenerativePreparator,
LoglikelihoodPreparator,
@@ -84,7 +90,10 @@
@scorer(metrics=[accuracy()])
def math_scorer():
gold_extraction_target = (ExprExtractionConfig(),)
pred_extraction_target = (ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0))
pred_extraction_target = (
ExprExtractionConfig(),
LatexExtractionConfig(boxed_match_priority=0),
)
language = Language.ENGLISH
fallback_mode = "first_match"
extraction_mode = "first_match"
@@ -95,10 +104,18 @@ def math_scorer():

async def score(state: TaskState, target: Target):
extracted_predictions = extract_target_from_pred(
state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds
state.output.completion,
pred_extraction_regexes,
fallback_mode,
extraction_mode,
timeout_seconds,
)
extracted_gold = extract_target_from_pred(
target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds
target.text,
gold_extraction_regexes,
fallback_mode,
extraction_mode,
timeout_seconds,
)
return Score(
# Correct or Incorrect, used by inspect-ai backend
@@ -128,10 +145,18 @@ def multichoice_scorer():

async def score(state: TaskState, target: Target):
extracted_predictions = extract_target_from_pred(
state.output.completion, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds
state.output.completion,
pred_extraction_regexes,
fallback_mode,
extraction_mode,
timeout_seconds,
)
extracted_gold = extract_target_from_pred(
target.text, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds
target.text,
gold_extraction_regexes,
fallback_mode,
extraction_mode,
timeout_seconds,
)
return Score(
# Correct or Incorrect, used by inspect-ai backend
@@ -163,8 +188,14 @@ class Metrics(Enum):
sample_level_fn=AvgAtN(
sample_scoring_function=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()],
pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()],
gold_extraction_target=[
ExprExtractionConfig(),
LatexExtractionConfig(),
],
pred_extraction_target=[
ExprExtractionConfig(),
LatexExtractionConfig(),
],
precision=6,
),
),
@@ -176,8 +207,16 @@ class Metrics(Enum):
metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"],
sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={"BERTScore-P": np.mean, "BERTScore-R": np.mean, "BERTScore-F": np.mean},
higher_is_better={"BERTScore-P": True, "BERTScore-R": True, "BERTScore-F": True},
corpus_level_fn={
"BERTScore-P": np.mean,
"BERTScore-R": np.mean,
"BERTScore-F": np.mean,
},
higher_is_better={
"BERTScore-P": True,
"BERTScore-R": True,
"BERTScore-F": True,
},
)
bits_per_byte = CorpusLevelMetric(
metric_name="bits_per_byte",
@@ -237,13 +276,30 @@ class Metrics(Enum):
higher_is_better=True,
)
copyright = SampleLevelMetricGrouping(
metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"],
metric_name=[
"longest_common_prefix_length",
"edit_distance",
"edit_similarity",
],
sample_level_fn=StringDistance(
metric_types=["longest_common_prefix_length", "edit_distance", "edit_similarity"], strip_prediction=True
metric_types=[
"longest_common_prefix_length",
"edit_distance",
"edit_similarity",
],
strip_prediction=True,
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max},
higher_is_better={"longest_common_prefix_length": True, "edit_distance": False, "edit_similarity": True},
corpus_level_fn={
"longest_common_prefix_length": max,
"edit_distance": min,
"edit_similarity": max,
},
higher_is_better={
"longest_common_prefix_length": True,
"edit_distance": False,
"edit_similarity": True,
},
)
drop = SampleLevelMetricGrouping(
metric_name=["em", "f1"],
@@ -267,17 +323,26 @@ class Metrics(Enum):
precision=5,
gold_extraction_target=(ExprExtractionConfig(),),
# Match boxed first before trying other regexes
pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
pred_extraction_target=(
ExprExtractionConfig(),
LatexExtractionConfig(boxed_match_priority=0),
),
aggregation_function=max,
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
higher_is_better=True,
)
extractiveness = SampleLevelMetricGrouping(
metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
metric_name=[
"summarization_coverage",
"summarization_density",
"summarization_compression",
],
sample_level_fn=Extractiveness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
normalize_input=remove_braces,
normalize_pred=remove_braces_and_strip,
input_column="text",
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
@@ -292,9 +357,16 @@ class Metrics(Enum):
},
)
extractiveness_de = SampleLevelMetricGrouping(
metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
metric_name=[
"summarization_coverage",
"summarization_density",
"summarization_compression",
],
sample_level_fn=Extractiveness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="de"
normalize_input=remove_braces,
normalize_pred=remove_braces_and_strip,
input_column="text",
language="de",
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
@@ -309,9 +381,16 @@ class Metrics(Enum):
},
)
extractiveness_fr = SampleLevelMetricGrouping(
metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
metric_name=[
"summarization_coverage",
"summarization_density",
"summarization_compression",
],
sample_level_fn=Extractiveness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="fr"
normalize_input=remove_braces,
normalize_pred=remove_braces_and_strip,
input_column="text",
language="fr",
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
@@ -326,9 +405,16 @@ class Metrics(Enum):
},
)
extractiveness_it = SampleLevelMetricGrouping(
metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
metric_name=[
"summarization_coverage",
"summarization_density",
"summarization_compression",
],
sample_level_fn=Extractiveness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text", language="it"
normalize_input=remove_braces,
normalize_pred=remove_braces_and_strip,
input_column="text",
language="it",
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
@@ -366,7 +452,9 @@ class Metrics(Enum):
faithfulness = SampleLevelMetric(
metric_name="summac",
sample_level_fn=Faithfulness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
normalize_input=remove_braces,
normalize_pred=remove_braces_and_strip,
input_column="text",
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
@@ -390,7 +478,10 @@ class Metrics(Enum):
precision=5,
gold_extraction_target=(ExprExtractionConfig(),),
# Match boxed first before trying other regexes
pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
pred_extraction_target=(
ExprExtractionConfig(),
LatexExtractionConfig(boxed_match_priority=0),
),
aggregation_function=max,
),
),
@@ -409,7 +500,10 @@ class Metrics(Enum):
precision=5,
gold_extraction_target=(LatexExtractionConfig(),),
# Match boxed first before trying other regexes
pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
pred_extraction_target=(
ExprExtractionConfig(),
LatexExtractionConfig(boxed_match_priority=0),
),
aggregation_function=max,
),
),
@@ -473,8 +567,14 @@ class Metrics(Enum):
# Extracting mathematical expressions and latex expressions
sample_scoring_function=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()],
pred_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()],
gold_extraction_target=[
ExprExtractionConfig(),
LatexExtractionConfig(),
],
pred_extraction_target=[
ExprExtractionConfig(),
LatexExtractionConfig(),
],
precision=6,
),
),
@@ -519,8 +619,18 @@ class Metrics(Enum):
normalize_pred=bigbench_normalizer,
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={"rouge1": np.mean, "rouge2": np.mean, "rougeL": np.mean, "rougeLsum": np.mean},
higher_is_better={"rouge1": True, "rouge2": True, "rougeL": True, "rougeLsum": True},
corpus_level_fn={
"rouge1": np.mean,
"rouge2": np.mean,
"rougeL": np.mean,
"rougeLsum": np.mean,
},
higher_is_better={
"rouge1": True,
"rouge2": True,
"rougeL": True,
"rougeLsum": True,
},
)
rouge1 = SampleLevelMetric(
metric_name="rouge1",
@@ -593,10 +703,16 @@ class Metrics(Enum):
sample_level_fn=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
gold_extraction_target=[
IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
IndicesExtractionConfig(
prefix_for_extraction="NativeLetters",
try_extract_without_anchor=True,
)
],
pred_extraction_target=[
IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
IndicesExtractionConfig(
prefix_for_extraction="NativeLetters",
try_extract_without_anchor=True,
)
],
precision=6,
),
@@ -610,10 +726,16 @@ class Metrics(Enum):
sample_scoring_function=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
gold_extraction_target=[
IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
IndicesExtractionConfig(
prefix_for_extraction="NativeLetters",
try_extract_without_anchor=True,
)
],
pred_extraction_target=[
IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
IndicesExtractionConfig(
prefix_for_extraction="NativeLetters",
try_extract_without_anchor=True,
)
],
precision=6,
),
Expand All @@ -622,6 +744,13 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
tvd_mi = CorpusLevelMetric(
metric_name="tvd_mi",
sample_level_fn=JudgeLLMTVDMI(),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=CorpusLevelTVDMI(),
higher_is_better=True,
)

def __str__(self):
return self.name.replace("_at_", "@")
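As a small check of the `__str__` override above (a hypothetical snippet, assuming the package is importable; the `pass_at_k` member is suggested by the `PassAtK` import in this diff but its definition is not shown):

```python
from lighteval.metrics.metrics import Metrics

# The new member contains no "_at_", so it renders under its plain name.
print(str(Metrics.tvd_mi))  # -> "tvd_mi"

# Members whose names contain "_at_" render with "@", e.g. a hypothetical
# `pass_at_k` member would print as "pass@k".
```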