2222
2323import logging
2424from collections import defaultdict
25+ from datetime import datetime
2526from typing import Literal
2627
2728import requests
@@ -211,6 +212,20 @@ def eval( # noqa C901
211212 models : Annotated [list [str ], Argument (help = "Models to evaluate" )],
212213 tasks : Annotated [str , Argument (help = "Tasks to evaluate" )],
213214 # model arguments
215+ model_base_url : Annotated [
216+ str | None ,
217+ Option (
218+ help = "Base URL for communicating with the model API." ,
219+ rich_help_panel = HELP_PANEL_NAME_1 ,
220+ ),
221+ ] = None ,
222+ model_roles : Annotated [
223+ str | None ,
224+ Option (
225+ help = "Model creation args (as a dictionary or as a path to a JSON or YAML config file)" ,
226+ rich_help_panel = HELP_PANEL_NAME_1 ,
227+ ),
228+ ] = None ,
214229 max_tokens : Annotated [
215230 int | None ,
216231 Option (
@@ -382,9 +397,9 @@ def eval( # noqa C901
382397 ] = None ,
383398 # Logging parameters
384399 log_dir : Annotated [
385- str ,
400+ str | None ,
386401 Option (help = "Log directory to use, will be created if it doesn't exist" , rich_help_panel = HELP_PANEL_NAME_4 ),
387- ] = "lighteval-logs" ,
402+ ] = None ,
388403 log_dir_allow_dirty : Annotated [
389404 bool , Option (help = "Allow dirty log directory" , rich_help_panel = HELP_PANEL_NAME_4 )
390405 ] = True ,
@@ -396,6 +411,10 @@ def eval( # noqa C901
396411 str | None ,
397412 Option (help = "Bundle directory to use, will be created if it doesn't exist" , rich_help_panel = HELP_PANEL_NAME_4 ),
398413 ] = None ,
414+ bundle_overwrite : Annotated [
415+ bool ,
416+ Option (help = "Overwrite bundle directory if it exists" , rich_help_panel = HELP_PANEL_NAME_4 ),
417+ ] = True ,
399418 repo_id : Annotated [
400419 str | None ,
401420 Option (help = "Repository ID to use, will be created if it doesn't exist" , rich_help_panel = HELP_PANEL_NAME_4 ),
@@ -428,6 +447,9 @@ def eval( # noqa C901
428447 providers = _get_huggingface_providers (model )
429448 models = [f"{ model .replace (':all' , '' )} :{ provider } " for provider in providers ]
430449
450+ if log_dir is None :
451+ log_dir = f"lighteval-logs-{ datetime .now ().strftime ('%Y%m%d%H%M%S' )} "
452+
431453 success , logs = inspect_ai_eval_set (
432454 inspect_ai_tasks ,
433455 model = models ,
@@ -440,7 +462,6 @@ def eval( # noqa C901
440462 log_dir = log_dir ,
441463 log_dir_allow_dirty = log_dir_allow_dirty ,
442464 display = display ,
443- bundle_dir = bundle_dir ,
444465 model_args = model_args ,
445466 max_tokens = max_tokens ,
446467 system_message = system_message ,
@@ -463,10 +484,13 @@ def eval( # noqa C901
463484 parallel_tool_calls = parallel_tool_calls ,
464485 max_tool_output = max_tool_output ,
465486 internal_tools = internal_tools ,
466- overwrite = True ,
487+ bundle_dir = bundle_dir ,
488+ bundle_overwrite = bundle_overwrite ,
467489 )
468490
469491 if not success :
492+ print ("Error evaluating models" )
493+ print (f"run the same command with --log-dir { log_dir } to retry !" )
470494 return
471495
472496 results_per_model_per_task = {}
@@ -482,12 +506,17 @@ def eval( # noqa C901
482506 table_md = results_to_markdown_table (results_per_model_per_task_agg )
483507
484508 if repo_id is not None :
485- push_to_hub (bundle_dir , repo_id , public = public )
509+ if bundle_dir is not None :
510+ push_to_hub (bundle_dir , repo_id , public = public )
486511
487512 print ()
488513 print (table_md )
489514 print (f"results saved to { log_dir } " )
490- print (f'run "inspect view --log-dir { log_dir } " to view the results' )
515+
516+ if log_dir is not None :
517+ print (f'run "inspect view --log-dir { log_dir } " to view the results' )
518+ else :
519+ print ("run 'inspect view' to view the results" )
491520
492521
493522if __name__ == "__main__" :
0 commit comments