From 3b067820cc05ce7ffdb68778efa9b323694f81c6 Mon Sep 17 00:00:00 2001 From: Matthew Tang Date: Mon, 15 Jun 2026 12:40:03 -0700 Subject: [PATCH] chore: Deflake test_evaluation unit test PiperOrigin-RevId: 932610634 --- tests/unit/vertexai/test_evaluation.py | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py index 70775dfdc9..37789dadb5 100644 --- a/tests/unit/vertexai/test_evaluation.py +++ b/tests/unit/vertexai/test_evaluation.py @@ -934,7 +934,7 @@ def test_compute_pointwise_metrics(self, api_transport): scores = list( test_result.metrics_table["test_pointwise_metric/score"].to_list() ) - assert scores == [5, 4] or scores == [4, 5] + assert sorted(scores) == [4, 5] assert list( test_result.metrics_table["test_pointwise_metric/explanation"].to_list() ) == [ @@ -994,9 +994,9 @@ def _pointwise_side_effect(**kwargs): assert test_result.metrics_table["prompt"].equals( _TEST_EVAL_DATASET_ALL_INCLUDED["prompt"] ) - assert list( - test_result.metrics_table["test_pointwise_metric_str/score"].to_list() - ) == [5, 4] + assert sorted( + list(test_result.metrics_table["test_pointwise_metric_str/score"].to_list()) + ) == [4, 5] assert list( test_result.metrics_table["test_pointwise_metric_str/explanation"].to_list() ) == [ @@ -1049,9 +1049,9 @@ def test_compute_pointwise_metrics_metric_prompt_template_example( "summarization_quality/explanation", ] ) - assert list( - test_result.metrics_table["summarization_quality/score"].to_list() - ) == [5, 4] + assert sorted( + list(test_result.metrics_table["summarization_quality/score"].to_list()) + ) == [4, 5] assert list( test_result.metrics_table["summarization_quality/explanation"].to_list() ) == [ @@ -1166,9 +1166,9 @@ def _summarization_side_effect(**kwargs): "source", ] ) - assert list( - test_result.metrics_table["summarization_quality/score"].to_list() - ) == [5, 4] + assert sorted( + list(test_result.metrics_table["summarization_quality/score"].to_list()) + ) == [4, 5] assert list( test_result.metrics_table["summarization_quality/explanation"].to_list() ) == [ @@ -1595,9 +1595,9 @@ def test_compute_multiple_metrics(self, api_transport): == 0.5 ) - assert list( - test_result.metrics_table["summarization_quality/score"].to_list() - ) == [5, 4] + assert sorted( + list(test_result.metrics_table["summarization_quality/score"].to_list()) + ) == [4, 5] assert list( test_result.metrics_table["summarization_quality/explanation"].to_list() ) == [ @@ -1884,7 +1884,9 @@ def test_runnable_response_eval_with_runnable_inference(self, api_transport): "coherence/explanation", ] ) - assert list(test_result.metrics_table["coherence/score"].to_list()) == [5, 4] + assert sorted( + list(test_result.metrics_table["coherence/score"].to_list()) + ) == [4, 5] assert list(test_result.metrics_table["coherence/explanation"].to_list()) == [ "explanation", "explanation",