
Commit 6136dc3

Merge branch 'main' into main
2 parents: e07acec + 35babcb

10 files changed (+52 lines added, -40 lines removed)

docs/source/contributing-to-multilingual-evaluations.mdx

Lines changed: 0 additions & 2 deletions
@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
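With `suite=["community"]` removed from the example, a community-contributed task config is identified by its name alone. A hedged sketch of the full pattern this leaves behind; the import paths marked as assumed, plus the dataset coordinates and column names, are illustrative rather than quotes from the docs page:

```python
# Hedged sketch of a community task config after the suite removal.
# Import paths flagged "assumed path" and all dataset details are illustrative.
from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric  # assumed path
from lighteval.metrics.normalizations import LogProbTokenNorm  # assumed path
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation  # assumed path
from lighteval.tasks.templates.multichoice import get_mcq_prompt_function  # assumed path
from lighteval.tasks.templates.utils.formulation import MCFFormulation  # assumed path
from lighteval.utils.language import Language

language = Language.ENGLISH
formulation = MCFFormulation()

your_tasks = [
    LightevalTaskConfig(
        # Name of your evaluation -- no suite prefix any more
        name=f"evalname_{language.value}_{formulation.name.lower()}",
        # This will automatically get the correct metrics for your chosen formulation
        metric=get_metrics_for_formulation(
            formulation,
            [loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        ),
        # Hypothetical dataset and column mapping -- replace with your own
        prompt_function=get_mcq_prompt_function(
            language,
            lambda line: {
                "question": line["question"],
                "choices": line["choices"],
                "gold_idx": int(line["answer"]),
            },
        ),
        hf_repo="your-org/your-eval-dataset",
        hf_subset="default",
        evaluation_splits=["test"],
    )
]
```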

docs/source/quicktour.mdx

Lines changed: 1 addition & 7 deletions
@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI.
 For example
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
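Net effect of these two hunks: the `{suite}|` prefix is gone from the documented syntax, leaving `{task}|{num_few_shot}`, optionally with `@parameter=value` modifiers, as the task-specification format (comma-separated lists remain supported per the CLI help text updated later in this commit). A hedged illustration in the docs' own txt style, assuming `gsm8k` and `hellaswag` are still registered under those names and using a placeholder for the parametrized form:

```txt
gsm8k|0
gsm8k|0,hellaswag|5
yourtask@your_parameter=some_value|0
```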

docs/source/saving-and-reading-results.mdx

Lines changed: 0 additions & 3 deletions
@@ -247,9 +247,6 @@ The main results file contains several sections:
         "Question="
       ],
       "num_samples": null,
-      "suite": [
-        "lighteval"
-      ],
       "original_num_docs": 1319,
       "effective_num_docs": 1,
       "must_remove_duplicate_docs": null,

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 0 additions & 8 deletions
@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
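Every task wrapper in this example file follows the same pattern: the `suite` parameter is dropped from both the `__init__` signature and the `super().__init__()` call. A hedged sketch of what one such wrapper looks like after the change; the class name and the dataset-related argument names are illustrative (they follow the docs example earlier in this commit), not copied from the file:

```python
from lighteval.tasks.lighteval_task import LightevalTaskConfig


class CustomGenerativeTask(LightevalTaskConfig):  # hypothetical wrapper name
    """Illustrative custom task wrapper: note there is no `suite` argument any more."""

    def __init__(
        self,
        name,
        prompt_function,
        hf_repo,
        hf_subset,
        metric,
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=40,
        stop_sequence=None,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            generation_size=generation_size,
            stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
        )
```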

examples/nanotron/custom_task.py

Lines changed: 0 additions & 2 deletions
@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",

src/lighteval/cli_args.py

Lines changed: 1 addition & 1 deletion
@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None,  # Required argument, no default

src/lighteval/main_inspect.py

Lines changed: 35 additions & 6 deletions
@@ -22,6 +22,7 @@
 
 import logging
 from collections import defaultdict
+from datetime import datetime
 from typing import Literal
 
 import requests
@@ -211,6 +212,20 @@ def eval(  # noqa C901
     models: Annotated[list[str], Argument(help="Models to evaluate")],
     tasks: Annotated[str, Argument(help="Tasks to evaluate")],
     # model arguments
+    model_base_url: Annotated[
+        str | None,
+        Option(
+            help="Base URL for communicating with the model API.",
+            rich_help_panel=HELP_PANEL_NAME_1,
+        ),
+    ] = None,
+    model_roles: Annotated[
+        str | None,
+        Option(
+            help="Model creation args (as a dictionary or as a path to a JSON or YAML config file)",
+            rich_help_panel=HELP_PANEL_NAME_1,
+        ),
+    ] = None,
     max_tokens: Annotated[
         int | None,
         Option(
@@ -382,9 +397,9 @@ def eval(  # noqa C901
     ] = None,
     # Logging parameters
     log_dir: Annotated[
-        str,
+        str | None,
         Option(help="Log directory to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
-    ] = "lighteval-logs",
+    ] = None,
     log_dir_allow_dirty: Annotated[
         bool, Option(help="Allow dirty log directory", rich_help_panel=HELP_PANEL_NAME_4)
     ] = True,
@@ -396,6 +411,10 @@ def eval(  # noqa C901
         str | None,
         Option(help="Bundle directory to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
     ] = None,
+    bundle_overwrite: Annotated[
+        bool,
+        Option(help="Overwrite bundle directory if it exists", rich_help_panel=HELP_PANEL_NAME_4),
+    ] = True,
     repo_id: Annotated[
         str | None,
         Option(help="Repository ID to use, will be created if it doesn't exist", rich_help_panel=HELP_PANEL_NAME_4),
@@ -428,6 +447,9 @@ def eval(  # noqa C901
             providers = _get_huggingface_providers(model)
             models = [f"{model.replace(':all', '')}:{provider}" for provider in providers]
 
+    if log_dir is None:
+        log_dir = f"lighteval-logs-{datetime.now().strftime('%Y%m%d%H%M%S')}"
+
     success, logs = inspect_ai_eval_set(
         inspect_ai_tasks,
         model=models,
@@ -440,7 +462,6 @@ def eval(  # noqa C901
         log_dir=log_dir,
         log_dir_allow_dirty=log_dir_allow_dirty,
         display=display,
-        bundle_dir=bundle_dir,
         model_args=model_args,
         max_tokens=max_tokens,
         system_message=system_message,
@@ -463,10 +484,13 @@ def eval(  # noqa C901
         parallel_tool_calls=parallel_tool_calls,
         max_tool_output=max_tool_output,
         internal_tools=internal_tools,
-        overwrite=True,
+        bundle_dir=bundle_dir,
+        bundle_overwrite=bundle_overwrite,
     )
 
     if not success:
+        print("Error evaluating models")
+        print(f"run the same command with --log-dir {log_dir} to retry !")
         return
 
     results_per_model_per_task = {}
@@ -482,12 +506,17 @@ def eval(  # noqa C901
     table_md = results_to_markdown_table(results_per_model_per_task_agg)
 
     if repo_id is not None:
-        push_to_hub(bundle_dir, repo_id, public=public)
+        if bundle_dir is not None:
+            push_to_hub(bundle_dir, repo_id, public=public)
 
     print()
     print(table_md)
     print(f"results saved to {log_dir}")
-    print(f'run "inspect view --log-dir {log_dir}" to view the results')
+
+    if log_dir is not None:
+        print(f'run "inspect view --log-dir {log_dir}" to view the results')
+    else:
+        print("run 'inspect view' to view the results")
 
 
 if __name__ == "__main__":
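Two behavioural changes fall out of these hunks: `--log-dir` now defaults to a unique timestamped directory (printed at the end so a failed run can be retried with the same value), and bundling is driven by the explicit `bundle_dir`/`bundle_overwrite` options instead of a hard-coded `overwrite=True`. A small self-contained sketch of the default-log-dir fallback; the prefix string matches the diff, while the helper function wrapping it is illustrative:

```python
from datetime import datetime


def default_log_dir(log_dir: str | None = None) -> str:
    """Mirror the fallback added above: when no --log-dir is passed, generate a
    unique timestamped directory name so repeated runs do not collide and a
    failed run can be retried by passing the printed value back."""
    if log_dir is None:
        log_dir = f"lighteval-logs-{datetime.now().strftime('%Y%m%d%H%M%S')}"
    return log_dir


print(default_log_dir())               # e.g. lighteval-logs-20240101120000
print(default_log_dir("my-run-logs"))  # my-run-logs
```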

src/lighteval/metrics/metrics_sample.py

Lines changed: 2 additions & 2 deletions
@@ -1199,7 +1199,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
             float: Aggregated score over the current sample's items.
         """
         all_scores = []
-        for i in range(self.k):
+        for i in range(self.n):
            all_scores.append(self.compute_score(doc, model_response[i]))
 
         avg_score = np.mean(all_scores)
@@ -1235,7 +1235,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Aggregated score over the current sample's items.
         """
-        if self.k is None:
+        if self.n is None:
             raise Exception("You did not set the value of n")
 
         golds = doc.get_golds()
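The fix swaps `self.k` for `self.n` in two sampling metrics, so the per-sample loop and the guard now both refer to `n`, the number of sampled responses, matching the existing error message. A standalone, hedged illustration of the corrected aggregation (not the library class itself):

```python
import numpy as np


def average_over_generations(scores: list[float], n: int | None) -> float:
    """Average a per-generation score over the first n sampled responses,
    mirroring the corrected loop: iterate over n, the number of generations,
    rather than k."""
    if n is None:
        raise ValueError("You did not set the value of n")
    if len(scores) < n:
        raise ValueError(f"expected at least {n} generations, got {len(scores)}")
    return float(np.mean(scores[:n]))


print(average_over_generations([1.0, 0.0, 1.0, 1.0], n=4))  # 0.75
```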

src/lighteval/tasks/lighteval_task.py

Lines changed: 13 additions & 6 deletions
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}"  # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False):  # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,17 +171,23 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
                             repr_v = repr(metric_v)
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
@@ -388,7 +395,7 @@ def get_docs(self, max_samples: int | None = None) -> list[Doc]:
             )
             doc.sampling_methods.extend(self.sampling_methods)
             doc.generation_size = self.generation_size
-            doc.use_logits = True
+            doc.use_logits = doc.use_logits if doc.use_logits is not None else True
             doc.stop_sequences = self.stop_sequence
             doc.num_samples = max(self.num_samples)
             docs.append(doc)
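Besides deferring to a pre-set `doc.use_logits`, the main change to `__str__` is that `functools.partial` objects, which have no `__name__`, now get a readable representation. A self-contained sketch of that rendering logic, factored out of the method for illustration (the `Metric`-specific branch from the real code is omitted here):

```python
import functools


def render_value(value) -> str:
    """Render a config value roughly the way the updated __str__ does:
    partials show the wrapped function's name, other callables fall back to
    repr() when they lack __name__, and everything else uses repr()."""
    if isinstance(value, functools.partial):
        func_name = getattr(value.func, "__name__", str(value.func))
        return f"partial({func_name}, ...)"
    if callable(value):
        return getattr(value, "__name__", repr(value))
    return repr(value)


print(render_value(functools.partial(max, 0)))  # partial(max, ...)
print(render_value(len))                        # len
print(render_value([1, 2, 3]))                  # [1, 2, 3]
```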

src/lighteval/tasks/multilingual/tasks/serbian_eval.py

Lines changed: 0 additions & 3 deletions
@@ -234,7 +234,6 @@ def create_task_config(
     hf_subset: str,
     metrics: List,
     evaluation_splits: List[str] = ["test"],
-    suite: List[str] = ["community"],
     hf_avail_splits: List[str] = ["test", "validation"],
     few_shots_split: str = "validation",
     generation_size=5,
@@ -249,7 +248,6 @@
         hf_subset: Subset of the dataset.
         metrics: The metrics to use for the task.
         evaluation_splits: The evaluation splits to use (default is "test").
-        suite: The suite of tasks.
         hf_avail_splits: Available splits (default is "test", "validation").
         few_shots_split: Split used for few-shot examples.
         generation_size: Number of generations to produce (default is 5).
@@ -260,7 +258,6 @@
     return LightevalTaskConfig(
         name=task_name,
         prompt_function=prompt_function,
-        suite=suite,
         hf_repo=hf_repo,
         hf_subset=hf_subset,
         hf_avail_splits=hf_avail_splits,
