
Commit 6136dc3

Merge branch 'main' into main
2 parents: e07acec + 35babcb

10 files changed (+52 lines added, -40 lines removed)

docs/source/contributing-to-multilingual-evaluations.mdx

Lines changed: 0 additions & 2 deletions
@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
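With `suite=["community"]` removed from the example, a community-contributed task config is identified by its name alone. A hedged sketch of the full pattern this leaves behind; the import paths marked as assumed, plus the dataset coordinates and column names, are illustrative rather than quotes from the docs page:

```python
# Hedged sketch of a community task config after the suite removal.
# Import paths flagged "assumed path" and all dataset details are illustrative.
from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric  # assumed path
from lighteval.metrics.normalizations import LogProbTokenNorm  # assumed path
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation  # assumed path
from lighteval.tasks.templates.multichoice import get_mcq_prompt_function  # assumed path
from lighteval.tasks.templates.utils.formulation import MCFFormulation  # assumed path
from lighteval.utils.language import Language

language = Language.ENGLISH
formulation = MCFFormulation()

your_tasks = [
    LightevalTaskConfig(
        # Name of your evaluation -- no suite prefix any more
        name=f"evalname_{language.value}_{formulation.name.lower()}",
        # This will automatically get the correct metrics for your chosen formulation
        metric=get_metrics_for_formulation(
            formulation,
            [loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        ),
        # Hypothetical dataset and column mapping -- replace with your own
        prompt_function=get_mcq_prompt_function(
            language,
            lambda line: {
                "question": line["question"],
                "choices": line["choices"],
                "gold_idx": int(line["answer"]),
            },
        ),
        hf_repo="your-org/your-eval-dataset",
        hf_subset="default",
        evaluation_splits=["test"],
    )
]
```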

docs/source/quicktour.mdx

Lines changed: 1 addition & 7 deletions
@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI.
 For example
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
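Net effect of these two hunks: the `{suite}|` prefix is gone from the documented syntax, leaving `{task}|{num_few_shot}`, optionally with `@parameter=value` modifiers, as the task-specification format (comma-separated lists remain supported per the CLI help text updated later in this commit). A hedged illustration in the docs' own txt style, assuming `gsm8k` and `hellaswag` are still registered under those names and using a placeholder for the parametrized form:

```txt
gsm8k|0
gsm8k|0,hellaswag|5
yourtask@your_parameter=some_value|0
```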

docs/source/saving-and-reading-results.mdx

Lines changed: 0 additions & 3 deletions
@@ -247,9 +247,6 @@ The main results file contains several sections:
         "Question="
       ],
       "num_samples": null,
-      "suite": [
-        "lighteval"
-      ],
       "original_num_docs": 1319,
       "effective_num_docs": 1,
       "must_remove_duplicate_docs": null,

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 0 additions & 8 deletions
@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
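Every task wrapper in this example file follows the same pattern: the `suite` parameter is dropped from both the `__init__` signature and the `super().__init__()` call. A hedged sketch of what one such wrapper looks like after the change; the class name and the dataset-related argument names are illustrative (they follow the docs example earlier in this commit), not copied from the file:

```python
from lighteval.tasks.lighteval_task import LightevalTaskConfig


class CustomGenerativeTask(LightevalTaskConfig):  # hypothetical wrapper name
    """Illustrative custom task wrapper: note there is no `suite` argument any more."""

    def __init__(
        self,
        name,
        prompt_function,
        hf_repo,
        hf_subset,
        metric,
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=40,
        stop_sequence=None,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            generation_size=generation_size,
            stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
        )
```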

examples/nanotron/custom_task.py

Lines changed: 0 additions & 2 deletions
@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",

src/lighteval/cli_args.py

Lines changed: 1 addition & 1 deletion
@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None,  # Required argument, no default

src/lighteval/main_inspect.py

Lines changed: 35 additions & 6 deletions
@@ -22,6 +22,7 @@
 
 import logging
 from collections import defaultdict
+from datetime import datetime
 from typing import Literal
 
 import requests
@@ -211,6 +212,20 @@ def eval(  # noqa C901
     models: Annotated[list[str], Argument(help="Models to evaluate")],
     tasks: Annotated[str, Argument(help="Tasks to evaluate")],
     # model arguments
+    model_base_url: Annotated[
+        str | None,
+        Option(
+            help="Base URL for communicating with the model API.",
+            rich_help_panel=HELP_PANEL_NAME_1,
+        ),
+    ] = None,
+    model_roles: Annotated[
+        str | None,
+        Option(
+            help="Model creation args (as a dictionary or as a path to a JSON or YAML config file)",
+            rich_help_panel=HELP_PANEL_NAME_1,
+        ),
+    ] = None,
     max_tokens: Annotated[
         int | None,
         Option(
@@ -382,9 +397,9 @@ def eval(  # noqa C901
     ] = None,
     # Logging parameters
     log_dir: Annotated[
-        str,
+        str | None,
         Option(help="Log directory to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
-    ] = "lighteval-logs",
+    ] = None,
     log_dir_allow_dirty: Annotated[
         bool, Option(help="Allow dirty log directory", rich_help_panel=HELP_PANEL_NAME_4)
     ] = True,
@@ -396,6 +411,10 @@ def eval(  # noqa C901
         str | None,
         Option(help="Bundle directory to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
     ] = None,
+    bundle_overwrite: Annotated[
+        bool,
+        Option(help="Overwrite bundle directory if it exists", rich_help_panel=HELP_PANEL_NAME_4),
+    ] = True,
     repo_id: Annotated[
         str | None,
         Option(help="Repository ID to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
@@ -428,6 +447,9 @@ def eval(  # noqa C901
             providers = _get_huggingface_providers(model)
             models = [f"{model.replace(':all', '')}:{provider}" for provider in providers]
 
+    if log_dir is None:
+        log_dir = f"lighteval-logs-{datetime.now().strftime('%Y%m%d%H%M%S')}"
+
     success, logs = inspect_ai_eval_set(
         inspect_ai_tasks,
         model=models,
@@ -440,7 +462,6 @@ def eval(  # noqa C901
         log_dir=log_dir,
         log_dir_allow_dirty=log_dir_allow_dirty,
         display=display,
-        bundle_dir=bundle_dir,
         model_args=model_args,
         max_tokens=max_tokens,
         system_message=system_message,
@@ -463,10 +484,13 @@ def eval(  # noqa C901
         parallel_tool_calls=parallel_tool_calls,
         max_tool_output=max_tool_output,
         internal_tools=internal_tools,
-        overwrite=True,
+        bundle_dir=bundle_dir,
+        bundle_overwrite=bundle_overwrite,
     )
 
     if not success:
+        print("Error evaluating models")
+        print(f"run the same command with --log-dir {log_dir} to retry !")
         return
 
     results_per_model_per_task = {}
@@ -482,12 +506,17 @@ def eval(  # noqa C901
     table_md = results_to_markdown_table(results_per_model_per_task_agg)
 
     if repo_id is not None:
-        push_to_hub(bundle_dir, repo_id, public=public)
+        if bundle_dir is not None:
+            push_to_hub(bundle_dir, repo_id, public=public)
 
     print()
     print(table_md)
     print(f"results saved to {log_dir}")
-    print(f'run "inspect view --log-dir {log_dir}" to view the results')
+
+    if log_dir is not None:
+        print(f'run "inspect view --log-dir {log_dir}" to view the results')
+    else:
+        print("run 'inspect view' to view the results")
 
 
 if __name__ == "__main__":
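Two behavioural changes fall out of these hunks: `--log-dir` now defaults to a unique timestamped directory (printed at the end so a failed run can be retried with the same value), and bundling is driven by the explicit `bundle_dir`/`bundle_overwrite` options instead of a hard-coded `overwrite=True`. A small self-contained sketch of the default-log-dir fallback; the prefix string matches the diff, while the helper function wrapping it is illustrative:

```python
from datetime import datetime


def default_log_dir(log_dir: str | None = None) -> str:
    """Mirror the fallback added above: when no --log-dir is passed, generate a
    unique timestamped directory name so repeated runs do not collide and a
    failed run can be retried by passing the printed value back."""
    if log_dir is None:
        log_dir = f"lighteval-logs-{datetime.now().strftime('%Y%m%d%H%M%S')}"
    return log_dir


print(default_log_dir())               # e.g. lighteval-logs-20240101120000
print(default_log_dir("my-run-logs"))  # my-run-logs
```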

src/lighteval/metrics/metrics_sample.py

Lines changed: 2 additions & 2 deletions
@@ -1199,7 +1199,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
             float: Aggregated score over the current sample's items.
         """
         all_scores = []
-        for i in range(self.k):
+        for i in range(self.n):
            all_scores.append(self.compute_score(doc, model_response[i]))
 
         avg_score = np.mean(all_scores)
@@ -1235,7 +1235,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Aggregated score over the current sample's items.
         """
-        if self.k is None:
+        if self.n is None:
             raise Exception("You did not set the value of n")
 
         golds = doc.get_golds()
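The fix swaps `self.k` for `self.n` in two sampling metrics, so the per-sample loop and the guard now both refer to `n`, the number of sampled responses, matching the existing error message. A standalone, hedged illustration of the corrected aggregation (not the library class itself):

```python
import numpy as np


def average_over_generations(scores: list[float], n: int | None) -> float:
    """Average a per-generation score over the first n sampled responses,
    mirroring the corrected loop: iterate over n, the number of generations,
    rather than k."""
    if n is None:
        raise ValueError("You did not set the value of n")
    if len(scores) < n:
        raise ValueError(f"expected at least {n} generations, got {len(scores)}")
    return float(np.mean(scores[:n]))


print(average_over_generations([1.0, 0.0, 1.0, 1.0], n=4))  # 0.75
```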

src/lighteval/tasks/lighteval_task.py

Lines changed: 13 additions & 6 deletions
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}"  # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False):  # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,17 +171,23 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
                             repr_v = repr(metric_v)
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
@@ -388,7 +395,7 @@ def get_docs(self, max_samples: int | None = None) -> list[Doc]:
             )
             doc.sampling_methods.extend(self.sampling_methods)
             doc.generation_size = self.generation_size
-            doc.use_logits = True
+            doc.use_logits = doc.use_logits if doc.use_logits is not None else True
             doc.stop_sequences = self.stop_sequence
             doc.num_samples = max(self.num_samples)
             docs.append(doc)
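Besides deferring to a pre-set `doc.use_logits`, the main change to `__str__` is that `functools.partial` objects, which have no `__name__`, now get a readable representation. A self-contained sketch of that rendering logic, factored out of the method for illustration (the `Metric`-specific branch from the real code is omitted here):

```python
import functools


def render_value(value) -> str:
    """Render a config value roughly the way the updated __str__ does:
    partials show the wrapped function's name, other callables fall back to
    repr() when they lack __name__, and everything else uses repr()."""
    if isinstance(value, functools.partial):
        func_name = getattr(value.func, "__name__", str(value.func))
        return f"partial({func_name}, ...)"
    if callable(value):
        return getattr(value, "__name__", repr(value))
    return repr(value)


print(render_value(functools.partial(max, 0)))  # partial(max, ...)
print(render_value(len))                        # len
print(render_value([1, 2, 3]))                  # [1, 2, 3]
```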

src/lighteval/tasks/multilingual/tasks/serbian_eval.py

Lines changed: 0 additions & 3 deletions
@@ -234,7 +234,6 @@ def create_task_config(
     hf_subset: str,
     metrics: List,
     evaluation_splits: List[str] = ["test"],
-    suite: List[str] = ["community"],
     hf_avail_splits: List[str] = ["test", "validation"],
     few_shots_split: str = "validation",
     generation_size=5,
@@ -249,7 +248,6 @@
         hf_subset: Subset of the dataset.
         metrics: The metrics to use for the task.
         evaluation_splits: The evaluation splits to use (default is "test").
-        suite: The suite of tasks.
         hf_avail_splits: Available splits (default is "test", "validation").
         few_shots_split: Split used for few-shot examples.
         generation_size: Number of generations to produce (default is 5).
@@ -260,7 +258,6 @@
     return LightevalTaskConfig(
         name=task_name,
         prompt_function=prompt_function,
-        suite=suite,
         hf_repo=hf_repo,
         hf_subset=hf_subset,
         hf_avail_splits=hf_avail_splits,
