From 2ebc8b055b9a20beda30e9cb3594b36015b6c723 Mon Sep 17 00:00:00 2001
From: Peter Tomko <tomko5peter@gmail.com>
Date: Sat, 27 Jun 2026 22:30:55 +0200
Subject: [PATCH 1/3] fix(eval): fix search_tool correctness always scoring 0%
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_args_match checked emit_widget (renamed to user_requested_search in the
tool schema) and limit (optional, server-side default). Both mismatched on
every real model call, so tool_correctness was always False regardless of
whether the model used the right keywords and object types.

Fix: evaluate only keywords (case-insensitive) and object_types — the two
fields that actually determine whether the search was semantically correct.
---
 .../gooddata_eval/core/evaluators/search_tool.py    | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
index 8d7fa1f62..32fe135a8 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
@@ -6,13 +6,14 @@
 
 
 def _args_match(actual_args: dict, expected_args: dict) -> bool:
-    if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
+    # Only keywords and object_types determine semantic correctness.
+    # limit is optional with a server-side default; emit_widget was renamed to
+    # user_requested_search in the tool schema — neither affects search quality.
+    actual_kw = sorted(k.lower() for k in (actual_args.get("keywords") or []))
+    expected_kw = sorted(k.lower() for k in (expected_args.get("keywords") or []))
+    if actual_kw != expected_kw:
         return False
-    if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
-        return False
-    if actual_args.get("limit") != expected_args.get("limit"):
-        return False
-    return actual_args.get("emit_widget") == expected_args.get("emit_widget")
+    return sorted(actual_args.get("object_types") or []) == sorted(expected_args.get("object_types") or [])
 
 
 class SearchToolEvaluator:

From f8476798910140017be4a5c44ab10f65c96360f1 Mon Sep 17 00:00:00 2001
From: Peter Tomko <tomko5peter@gmail.com>
Date: Wed, 1 Jul 2026 11:09:44 +0200
Subject: [PATCH 2/3] fix(eval): harden _args_match against malformed tool-call
 JSON

parsed_arguments() returns raw model-emitted JSON, so a bad tool call
like {"keywords":[1]} or mixed-type object_types would raise on
.lower()/sorted() and abort the whole eval run instead of scoring
tool_correctness=False. Add _normalize_str_list to drop non-string
entries defensively; valid comparisons (incl. case-insensitive keyword
match) are unchanged.

Addresses CodeRabbit review on PR #1675.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../core/evaluators/search_tool.py             | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
index 32fe135a8..ec41b3ce4 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
@@ -5,15 +5,27 @@
 from gooddata_eval.core.models import ChatResult, DatasetItem
 
 
+def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]:
+    # Arguments come from raw model-emitted JSON, so a malformed tool call may
+    # contain non-string entries. Drop them defensively so bad input scores as a
+    # mismatch instead of raising and aborting the whole evaluation.
+    if not isinstance(value, list):
+        return []
+    items = [item for item in value if isinstance(item, str)]
+    return sorted(item.lower() if lowercase else item for item in items)
+
+
 def _args_match(actual_args: dict, expected_args: dict) -> bool:
     # Only keywords and object_types determine semantic correctness.
     # limit is optional with a server-side default; emit_widget was renamed to
     # user_requested_search in the tool schema — neither affects search quality.
-    actual_kw = sorted(k.lower() for k in (actual_args.get("keywords") or []))
-    expected_kw = sorted(k.lower() for k in (expected_args.get("keywords") or []))
+    actual_kw = _normalize_str_list(actual_args.get("keywords"), lowercase=True)
+    expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True)
     if actual_kw != expected_kw:
         return False
-    return sorted(actual_args.get("object_types") or []) == sorted(expected_args.get("object_types") or [])
+    return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list(
+        expected_args.get("object_types")
+    )
 
 
 class SearchToolEvaluator:

From 53dc5d2ee45cd85ac20450b3134f3e0463fb7be1 Mon Sep 17 00:00:00 2001
From: Peter Tomko <tomko5peter@gmail.com>
Date: Thu, 2 Jul 2026 11:04:07 +0200
Subject: [PATCH 3/3] docs(eval): clarify _args_match comments per review

Address review nits on search_tool _args_match:
- reword _normalize_str_list comment: dropping non-string entries prevents
  a crash, it does not force a mismatch (surviving strings still compare)
- note that object_types is compared case-sensitively on purpose (controlled
  ObjectType StrEnum values emitted verbatim)

No behavior change. keywords/object_types are declared list[str] in the
search_objects schema, so string-collapse false-negatives cannot occur.

JIRA: TRIVIAL
risk: nonprod
---
 .../src/gooddata_eval/core/evaluators/search_tool.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
index ec41b3ce4..c28ee4ab5 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
@@ -6,9 +6,12 @@
 
 
 def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]:
-    # Arguments come from raw model-emitted JSON, so a malformed tool call may
-    # contain non-string entries. Drop them defensively so bad input scores as a
-    # mismatch instead of raising and aborting the whole evaluation.
+    # Arguments come from raw model-emitted JSON. The search_objects schema
+    # declares keywords/object_types as list[str], but a malformed tool call may
+    # send a non-list or a list with non-string entries. Drop those entries defensively
+    # so bad input can't raise (.lower()/sorted() on a non-str) and abort the whole
+    # evaluation run; a non-list collapses to [] and the surviving strings are
+    # still compared normally.
     if not isinstance(value, list):
         return []
     items = [item for item in value if isinstance(item, str)]
@@ -23,6 +26,9 @@ def _args_match(actual_args: dict, expected_args: dict) -> bool:
     expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True)
     if actual_kw != expected_kw:
         return False
+    # object_types is compared case-sensitively (no lowercase=True): they are
+    # controlled ObjectType StrEnum values the model emits verbatim ("metric",
+    # "dashboard"), so a case mismatch is a genuine error, not a formatting quirk.
     return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list(
         expected_args.get("object_types")
     )