Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions src/agentlab/utils/hinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import requests
from agentlab.llm.chat_api import ChatModel
import re
import json
from agentlab.llm.response_api import APIPayload

logger = logging.getLogger(__name__)
Expand All @@ -25,6 +26,7 @@ def __init__(
hint_db_path: str,
hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct",
skip_hints_for_current_task: bool = False,
skip_hints_for_current_goal: bool = False,
top_n: int = 4,
embedder_model: str = "Qwen/Qwen3-Embedding-0.6B",
embedder_server: str = "http://localhost:5000",
Expand All @@ -36,6 +38,7 @@ def __init__(
self.hint_db_path = hint_db_path
self.hint_retrieval_mode = hint_retrieval_mode
self.skip_hints_for_current_task = skip_hints_for_current_task
self.skip_hints_for_current_goal = skip_hints_for_current_goal
self.top_n = top_n
self.embedder_model = embedder_model
self.embedder_server = embedder_server
Expand All @@ -45,7 +48,16 @@ def __init__(
self.hint_db_path = Path(hint_db_path).as_posix()
else:
self.hint_db_path = (Path(__file__).parent / self.hint_db_path).as_posix()
self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str)
self.hint_db = pd.read_csv(
self.hint_db_path,
header=0,
index_col=None,
dtype=str,
converters={
"trace_paths_json": lambda x: json.loads(x) if pd.notna(x) else [],
"source_trace_goals": lambda x: json.loads(x) if pd.notna(x) else [],
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing error handling for JSON parsing in CSV converters category Error Handling

Tell me more
What is the issue?

The JSON parsing converter does not handle potential JSON parsing errors that could occur with malformed JSON data in the CSV file.

Why this matters

If the CSV contains invalid JSON in the source_trace_goals column, json.loads() will raise a JSONDecodeError, causing the entire hint database loading to fail and breaking the application startup.

Suggested change ∙ Feature Preview

Add try-except blocks to handle JSON parsing errors gracefully:

"source_trace_goals": lambda x: json.loads(x) if pd.notna(x) else [] if pd.notna(x) else [],

Better solution:

def safe_json_parse(x):
    if pd.notna(x):
        try:
            return json.loads(x)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse JSON: {x}")
            return []
    return []

# Then use:
"source_trace_goals": safe_json_parse,
"trace_paths_json": safe_json_parse,
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

},
Comment on lines +56 to +59
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eager JSON parsing during CSV load category Performance

Tell me more
What is the issue?

JSON parsing is performed for every row during CSV loading, even for columns that may not be used frequently.

Why this matters

This eager parsing approach increases memory usage and loading time, especially for large CSV files. If these JSON fields are only accessed occasionally, the parsing overhead is wasted for unused data.

Suggested change ∙ Feature Preview

Consider lazy parsing by keeping JSON fields as strings initially and parsing them on-demand:

# Load without converters initially
self.hint_db = pd.read_csv(self.hint_db_path, header=0, index_col=None, dtype=str)

# Parse JSON only when needed
def _get_source_trace_goals(self, row_index):
    goals_str = self.hint_db.loc[row_index, 'source_trace_goals']
    return json.loads(goals_str) if pd.notna(goals_str) else []
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

)
logger.info(f"Loaded {len(self.hint_db)} hints from database {self.hint_db_path}")
if self.hint_retrieval_mode == "emb":
self.load_hint_vectors()
Expand Down Expand Up @@ -84,7 +96,9 @@ def choose_hints_llm(self, llm, goal: str, task_name: str) -> list[str]:
topic_to_hints = defaultdict(list)
skip_hints = []
if self.skip_hints_for_current_task:
skip_hints = self.get_current_task_hints(task_name)
skip_hints += self.get_current_task_hints(task_name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a minor issue but if skip_hints_for_current_task is True, there is no need to check for skip_hints_for_current_goal, right?

if self.skip_hints_for_current_goal:
skip_hints += self.get_current_goal_hints(goal)
for _, row in self.hint_db.iterrows():
hint = row["hint"]
if hint in skip_hints:
Expand Down Expand Up @@ -128,7 +142,9 @@ def choose_hints_emb(self, goal: str, task_name: str) -> list[str]:
all_hints = self.uniq_hints["hint"].tolist()
skip_hints = []
if self.skip_hints_for_current_task:
skip_hints = self.get_current_task_hints(task_name)
skip_hints += self.get_current_task_hints(task_name)
if self.skip_hints_for_current_goal:
skip_hints += self.get_current_goal_hints(goal)
hint_embeddings = []
id_to_hint = {}
for hint, emb in zip(all_hints, self.hint_embeddings):
Expand Down Expand Up @@ -199,3 +215,7 @@ def get_current_task_hints(self, task_name):
self.hint_db["task_name"].apply(lambda x: fnmatch.fnmatch(x, task_name))
]
return hints_df["hint"].tolist()

def get_current_goal_hints(self, goal_str: str):
mask = self.hint_db["source_trace_goals"].apply(lambda goals: goal_str in goals)
return self.hint_db.loc[mask, "hint"].tolist()
Loading