From 3b067820cc05ce7ffdb68778efa9b323694f81c6 Mon Sep 17 00:00:00 2001
From: Matthew Tang <tangmatthew@google.com>
Date: Mon, 15 Jun 2026 12:40:03 -0700
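Subject: [PATCH] chore: Deflake test_evaluation unit test

Compare the metric score columns in sorted order so the score assertions
no longer depend on the row order of metrics_table.
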
PiperOrigin-RevId: 932610634
---
 tests/unit/vertexai/test_evaluation.py | 30 ++++++++++++++------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py
index 70775dfdc9..37789dadb5 100644
--- a/tests/unit/vertexai/test_evaluation.py
+++ b/tests/unit/vertexai/test_evaluation.py
@@ -934,7 +934,7 @@ def test_compute_pointwise_metrics(self, api_transport):
         scores = list(
             test_result.metrics_table["test_pointwise_metric/score"].to_list()
         )
-        assert scores == [5, 4] or scores == [4, 5]
+        assert sorted(scores) == [4, 5]
         assert list(
             test_result.metrics_table["test_pointwise_metric/explanation"].to_list()
         ) == [
@@ -994,9 +994,9 @@ def _pointwise_side_effect(**kwargs):
         assert test_result.metrics_table["prompt"].equals(
             _TEST_EVAL_DATASET_ALL_INCLUDED["prompt"]
         )
-        assert list(
-            test_result.metrics_table["test_pointwise_metric_str/score"].to_list()
-        ) == [5, 4]
+        assert sorted(
+            list(test_result.metrics_table["test_pointwise_metric_str/score"].to_list())
+        ) == [4, 5]
         assert list(
             test_result.metrics_table["test_pointwise_metric_str/explanation"].to_list()
         ) == [
@@ -1049,9 +1049,9 @@ def test_compute_pointwise_metrics_metric_prompt_template_example(
                 "summarization_quality/explanation",
             ]
         )
-        assert list(
-            test_result.metrics_table["summarization_quality/score"].to_list()
-        ) == [5, 4]
+        assert sorted(
+            list(test_result.metrics_table["summarization_quality/score"].to_list())
+        ) == [4, 5]
         assert list(
             test_result.metrics_table["summarization_quality/explanation"].to_list()
         ) == [
@@ -1166,9 +1166,9 @@ def _summarization_side_effect(**kwargs):
                 "source",
             ]
         )
-        assert list(
-            test_result.metrics_table["summarization_quality/score"].to_list()
-        ) == [5, 4]
+        assert sorted(
+            list(test_result.metrics_table["summarization_quality/score"].to_list())
+        ) == [4, 5]
         assert list(
             test_result.metrics_table["summarization_quality/explanation"].to_list()
         ) == [
@@ -1595,9 +1595,9 @@ def test_compute_multiple_metrics(self, api_transport):
             == 0.5
         )
 
-        assert list(
-            test_result.metrics_table["summarization_quality/score"].to_list()
-        ) == [5, 4]
+        assert sorted(
+            list(test_result.metrics_table["summarization_quality/score"].to_list())
+        ) == [4, 5]
         assert list(
             test_result.metrics_table["summarization_quality/explanation"].to_list()
         ) == [
@@ -1884,7 +1884,9 @@ def test_runnable_response_eval_with_runnable_inference(self, api_transport):
                 "coherence/explanation",
             ]
         )
-        assert list(test_result.metrics_table["coherence/score"].to_list()) == [5, 4]
+        assert sorted(
+            list(test_result.metrics_table["coherence/score"].to_list())
+        ) == [4, 5]
         assert list(test_result.metrics_table["coherence/explanation"].to_list()) == [
             "explanation",
             "explanation",