Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,27 @@
from gooddata_eval.core.models import ChatResult, DatasetItem


def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]:
# Arguments come from raw model-emitted JSON, so a malformed tool call may
# contain non-string entries. Drop them defensively so bad input scores as a
# mismatch instead of raising and aborting the whole evaluation.
if not isinstance(value, list):
return []
items = [item for item in value if isinstance(item, str)]
return sorted(item.lower() if lowercase else item for item in items)


def _args_match(actual_args: dict, expected_args: dict) -> bool:
if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
return False
if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
return False
if actual_args.get("limit") != expected_args.get("limit"):
# Only keywords and object_types determine semantic correctness.
# limit is optional with a server-side default; emit_widget was renamed to
# user_requested_search in the tool schema — neither affects search quality.
actual_kw = _normalize_str_list(actual_args.get("keywords"), lowercase=True)
expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True)
if actual_kw != expected_kw:
return False
return actual_args.get("emit_widget") == expected_args.get("emit_widget")
return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list(
expected_args.get("object_types")
)


class SearchToolEvaluator:
Expand Down
Loading