Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 18 additions & 81 deletions src/lighteval/tasks/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,34 +80,12 @@ def load_community_tasks():

logger = logging.getLogger(__name__)

# Helm, Bigbench, Harness are implementations following an evaluation suite setup
# Original follows the original implementation as closely as possible
# Leaderboard is for the evaluations we fixed on the Open LLM Leaderboard - you should get similar results
# Community is for community-added evaluations
# Extended is for evaluations with custom logic
# Custom is for all the experiments you might want to do!

# Core suites - always available without extra dependencies
CORE_SUITES = [
"helm",
"bigbench",
"harness",
"leaderboard",
"lighteval",
"original",
"extended",
"custom",
"test",
]

# Optional suites - may require extra dependencies
OPTIONAL_SUITES = [
"community",
"multilingual",
]

DEFAULT_SUITES = CORE_SUITES + OPTIONAL_SUITES


class Registry:
"""The Registry class is used to manage the task registry and get task classes."""
Expand Down Expand Up @@ -138,7 +116,6 @@ def __init__(
TASKS_TABLE = [
LightevalTaskConfig(
name="custom_task",
suite="custom",
...
)
]
Expand Down Expand Up @@ -187,7 +164,7 @@ def _update_task_configs(self) -> dict[str, LightevalTaskConfig]: # noqa: C901
Now expects task specs in the form:
- task|few_shot
- task (defaults to few_shot=0)
Backwards-compat for suite|task|few_shot is preserved but the suite is ignored.
Backwards-compat for task|few_shot is preserved.
"""
task_to_configs = collections.defaultdict(list)

Expand Down Expand Up @@ -259,7 +236,7 @@ def load_tasks(self) -> dict[str, LightevalTask]:
@lru_cache
def _task_superset_dict(self):
"""Returns:
dict[str, list[str]]: A dictionary where keys are task superset names (suite|task) and values are lists of task subset names (suite|task).
dict[str, list[str]]: A dictionary where keys are task superset names (task) and values are lists of task subset names (task).

Example:
{
Expand All @@ -276,11 +253,11 @@ def _expand_task_definition(self, task_definition: str):
"""
Args:
task_definition (str): Task definition to expand. In format:
- suite|task
- suite|task_superset (e.g. lighteval|mmlu, which runs all the mmlu subtasks)
- task
- task_superset (e.g. mmlu, which runs all the mmlu subtasks)

Returns:
list[str]: List of task names (suite|task)
list[str]: List of task names (task)
"""
# Try if it's a task superset
tasks = self._task_superset_dict.get(task_definition, None)
Expand Down Expand Up @@ -379,64 +356,24 @@ def load_all_task_configs(
logger.info(f"Loaded {len(loaded_configs)} task configs in {time_end - time_start:.1f} seconds")
return loaded_configs

def print_all_tasks(self, suites: str | None = None):
"""Print all the tasks in the task registry.

Args:
suites: Comma-separated list of suites to display. If None, shows core suites only.
Use 'all' to show all available suites (core + optional).
Special handling for 'multilingual' suite with dependency checking.
"""
# Parse requested suites
if suites is None:
requested_suites = CORE_SUITES.copy()
else:
requested_suites = [s.strip() for s in suites.split(",")]

# Handle 'all' special case
if "all" in requested_suites:
requested_suites = DEFAULT_SUITES.copy()
def print_all_tasks(self):
"""Print all the tasks in the task registry."""

# Check for multilingual dependencies if requested
if "multilingual" in requested_suites:
import importlib.util
# Get all tasks
all_tasks = sorted(list(self._task_registry.keys()))

if importlib.util.find_spec("langcodes") is None:
logger.warning(
"Multilingual tasks require additional dependencies (langcodes). "
"Install them with: pip install langcodes"
)
requested_suites.remove("multilingual")

# Get all tasks and filter by requested suites
all_tasks = list(self._task_registry.keys())
tasks_names = [task for task in all_tasks if task.split("|")[0] in requested_suites]

# Ensure all requested suites are present (even if empty)
suites_in_registry = {name.split("|")[0] for name in tasks_names}
for suite in requested_suites:
if suite not in suites_in_registry:
# We add a dummy task to make sure the suite is printed
tasks_names.append(f"{suite}|")

tasks_names.sort()

print(f"Displaying tasks for suites: {', '.join(requested_suites)}")
print(f"Displaying tasks:")
print("=" * 60)

for suite, g in groupby(tasks_names, lambda x: x.split("|")[0]):
tasks_in_suite = [name for name in g if name.split("|")[1]] # Filter out dummy tasks
tasks_in_suite.sort()

print(f"\n- {suite}:")
if not tasks_in_suite:
print(" (no tasks in this suite)")
else:
for task_name in tasks_in_suite:
print(f" - {task_name}")

last_task = None
for task_name in all_tasks:
task_parts = task_name.split(":")
if last_task != task_parts[0]:
print("")
last_task = task_parts[0]
print(f" - {task_name}")
# Print summary
total_tasks = len([t for t in tasks_names if t.split("|")[1]])
total_tasks = len(all_tasks)
print(f"\nTotal tasks displayed: {total_tasks}")

def get_tasks_dump(self) -> list[dict]: # noqa: C901
Expand Down
Loading