diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
index 8d7fa1f62..c28ee4ab5 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
@@ -5,14 +5,33 @@
 from gooddata_eval.core.models import ChatResult, DatasetItem
 
 
+def _normalize_str_list(value: object, *, lowercase: bool = False) -> list[str]:
+    # Arguments come from raw model-emitted JSON. The search_objects schema
+    # declares keywords/object_types as list[str], but a malformed tool call may
+    # send a non-list or non-string entries. Drop the offending entries defensively
+    # so bad input can't raise (.lower()/sorted() on a non-str) and abort the whole
+    # evaluation run; a non-list collapses to [] and the surviving strings are
+    # still compared normally.
+    if not isinstance(value, list):
+        return []
+    items = [item for item in value if isinstance(item, str)]
+    return sorted(item.lower() if lowercase else item for item in items)
+
+
 def _args_match(actual_args: dict, expected_args: dict) -> bool:
-    if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
-        return False
-    if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
-        return False
-    if actual_args.get("limit") != expected_args.get("limit"):
+    # Only keywords and object_types determine semantic correctness.
+    # limit is optional with a server-side default; emit_widget was renamed to
+    # user_requested_search in the tool schema — neither affects search quality.
+    actual_kw = _normalize_str_list(actual_args.get("keywords"), lowercase=True)
+    expected_kw = _normalize_str_list(expected_args.get("keywords"), lowercase=True)
+    if actual_kw != expected_kw:
         return False
-    return actual_args.get("emit_widget") == expected_args.get("emit_widget")
+    # object_types is compared case-sensitively (no lowercase=True): they are
+    # controlled ObjectType StrEnum values the model emits verbatim ("metric",
+    # "dashboard"), so a case mismatch is a genuine error, not a formatting quirk.
+    return _normalize_str_list(actual_args.get("object_types")) == _normalize_str_list(
+        expected_args.get("object_types")
+    )
 
 
 class SearchToolEvaluator: