From 2ebc8b055b9a20beda30e9cb3594b36015b6c723 Mon Sep 17 00:00:00 2001 From: Peter Tomko Date: Sat, 27 Jun 2026 22:30:55 +0200 Subject: [PATCH 1/3] fix(eval): fix search_tool correctness always scoring 0% MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _args_match checked emit_widget (renamed to user_requested_search in the tool schema) and limit (optional, server-side default). Both mismatched on every real model call, so tool_correctness was always False regardless of whether the model used the right keywords and object types. Fix: evaluate only keywords (case-insensitive) and object_types — the two fields that actually determine whether the search was semantically correct. --- .../gooddata_eval/core/evaluators/search_tool.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py index 8d7fa1f62..32fe135a8 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py @@ -6,13 +6,14 @@ def _args_match(actual_args: dict, expected_args: dict) -> bool: - if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []): + # Only keywords and object_types determine semantic correctness. + # limit is optional with a server-side default; emit_widget was renamed to + # user_requested_search in the tool schema — neither affects search quality. + actual_kw = sorted(k.lower() for k in (actual_args.get("keywords") or [])) + expected_kw = sorted(k.lower() for k in (expected_args.get("keywords") or [])) + if actual_kw != expected_kw: return False - if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []): - return False - if actual_args.get("limit") != expected_args.get("limit"): - return False - return actual_args.get("emit_widget") == expected_args.get("emit_widget") + return sorted(actual_args.get("object_types") or []) == sorted(expected_args.get("object_types") or []) class SearchToolEvaluator: From f8476798910140017be4a5c44ab10f65c96360f1 Mon Sep 17 00:00:00 2001 From: Peter Tomko Date: Wed, 1 Jul 2026 11:09:44 +0200 Subject: [PATCH 2/3] fix(eval): harden _args_match against malformed tool-call JSON parsed_arguments() returns raw model-emitted JSON, so a bad tool call like {"keywords":[1]} or mixed-type object_types would raise on .lower()/sorted() and abort the whole eval run instead of scoring tool_correctness=False. Add _normalize_str_list to drop non-string entries defensively; valid comparisons (incl. case-insensitive keyword match) are unchanged. Addresses CodeRabbit review on PR #1675. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../core/evaluators/search_tool.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py index 32fe135a8..ec41b3ce4 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py @@ -5,15 +5,27 @@ from gooddata_eval.core.models import ChatResult, DatasetItem +def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]: + # Arguments come from raw model-emitted JSON, so a malformed tool call may + # contain non-string entries. Drop them defensively so bad input scores as a + # mismatch instead of raising and aborting the whole evaluation. + if not isinstance(value, list): + return [] + items = [item for item in value if isinstance(item, str)] + return sorted(item.lower() if lowercase else item for item in items) + + def _args_match(actual_args: dict, expected_args: dict) -> bool: # Only keywords and object_types determine semantic correctness. # limit is optional with a server-side default; emit_widget was renamed to # user_requested_search in the tool schema — neither affects search quality. - actual_kw = sorted(k.lower() for k in (actual_args.get("keywords") or [])) - expected_kw = sorted(k.lower() for k in (expected_args.get("keywords") or [])) + actual_kw = _normalize_str_list(actual_args.get("keywords"), lowercase=True) + expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True) if actual_kw != expected_kw: return False - return sorted(actual_args.get("object_types") or []) == sorted(expected_args.get("object_types") or []) + return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list( + expected_args.get("object_types") + ) class SearchToolEvaluator: From 53dc5d2ee45cd85ac20450b3134f3e0463fb7be1 Mon Sep 17 00:00:00 2001 From: Peter Tomko Date: Thu, 2 Jul 2026 11:04:07 +0200 Subject: [PATCH 3/3] docs(eval): clarify _args_match comments per review Address review nits on search_tool _args_match: - reword _normalize_str_list comment: dropping non-string entries prevents a crash, it does not force a mismatch (surviving strings still compare) - note that object_types is compared case-sensitively on purpose (controlled ObjectType StrEnum values emitted verbatim) No behavior change. keywords/object_types are declared list[str] in the search_objects schema, so string-collapse false-negatives cannot occur. JIRA: TRIVIAL risk: nonprod --- .../src/gooddata_eval/core/evaluators/search_tool.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py index ec41b3ce4..c28ee4ab5 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py @@ -6,9 +6,12 @@ def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]: - # Arguments come from raw model-emitted JSON, so a malformed tool call may - # contain non-string entries. Drop them defensively so bad input scores as a - # mismatch instead of raising and aborting the whole evaluation. + # Arguments come from raw model-emitted JSON. The search_objects schema + # declares keywords/object_types as list[str], but a malformed tool call may + # send a non-list or non-string entries. Drop the offending entries defensively + # so bad input can't raise (.lower()/sorted() on a non-str) and abort the whole + # evaluation run; a non-list collapses to [] and the surviving strings are + # still compared normally. if not isinstance(value, list): return [] items = [item for item in value if isinstance(item, str)] @@ -23,6 +26,9 @@ def _args_match(actual_args: dict, expected_args: dict) -> bool: expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True) if actual_kw != expected_kw: return False + # object_types is compared case-sensitively (no lowercase=True): they are + # controlled ObjectType StrEnum values the model emits verbatim ("metric", + # "dashboard"), so a case mismatch is a genuine error, not a formatting quirk. return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list( expected_args.get("object_types") )