From c37460049a3edad1fad7c4ad4d1dc4c409f99319 Mon Sep 17 00:00:00 2001 From: Anshul Mishra Date: Fri, 26 Jun 2026 18:32:29 +0530 Subject: [PATCH 1/3] Add test for UpdateStatistics chaining set and remove --- tests/table/test_init.py | 47 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 7e64e6e7c0..1fe2b24702 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -18,12 +18,14 @@ import json import uuid from copy import copy +from pathlib import Path from typing import Any import pytest from pydantic import BaseModel, ValidationError from pytest_lazy_fixtures import lf +from pyiceberg.catalog.memory import InMemoryCatalog from pyiceberg.catalog.noop import NoopCatalog from pyiceberg.exceptions import CommitFailedException from pyiceberg.expressions import ( @@ -1536,7 +1538,6 @@ def test_set_statistics_update(table_v2_with_statistics: Table) -> None: assert len(updated_statistics) == 1 assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected) - def test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_statistics: Table) -> None: snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id @@ -1587,7 +1588,6 @@ def test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_stat assert model_roundtrips(update_with_json) assert update_with_json.snapshot_id == snapshot_id - def test_remove_statistics_update(table_v2_with_statistics: Table) -> None: update = RemoveStatisticsUpdate( snapshot_id=3055729675574597004, @@ -1689,6 +1689,49 @@ def test_remove_partition_statistics_update_with_invalid_snapshot_id(table_v2_wi (RemovePartitionStatisticsUpdate(snapshot_id=123456789),), ) +def test_update_statistics_set_remove_chain(tmp_path: Path) -> None: + + catalog = InMemoryCatalog("test",warehouse = f"file://{tmp_path}") + catalog.create_namespace("default") + table = catalog.create_table("default.test",schema = Schema(NestedField(1,"x", LongType()))) + + snapshot_id_1 = 1111111111 + snapshot_id_2 = 2222222222 + + statistics_file_1 = StatisticsFile( + snapshot_id=snapshot_id_1, + statistics_path="s3://bucket/warehouse/stats1.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[BlobMetadata( + type="apache-datasketches-theta-v1", + snapshot_id=snapshot_id_1, + sequence_number=1, + fields=[1], + )], + ) + + statistics_file_2 = StatisticsFile( + snapshot_id=snapshot_id_2, + statistics_path="s3://bucket/warehouse/stats2.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[BlobMetadata( + type="apache-datasketches-theta-v1", + snapshot_id=snapshot_id_2, + sequence_number=2, + fields=[1], + )], + ) + + with table.update_statistics() as update: + update.set_statistics(statistics_file_1) + + with table.update_statistics() as update: + update.set_statistics(statistics_file_2).remove_statistics(snapshot_id_1) + + assert len(table.metadata.statistics) == 1 + def test_add_snapshot_update_fails_without_first_row_id(table_v3: Table) -> None: new_snapshot = Snapshot( From 35220e31fc33e8c2f87f90f19b721e4f948870d2 Mon Sep 17 00:00:00 2001 From: Anshul Mishra Date: Fri, 26 Jun 2026 18:49:29 +0530 Subject: [PATCH 2/3] Mark test as xfail to document expected failure --- tests/table/test_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 1fe2b24702..6d52e662e4 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -1688,7 +1688,7 @@ def test_remove_partition_statistics_update_with_invalid_snapshot_id(table_v2_wi table_v2_with_statistics.metadata, (RemovePartitionStatisticsUpdate(snapshot_id=123456789),), ) - +@pytest.mark.xfail(reason="remove_statistics uses = instead of +=, dropping preceeding updates") def test_update_statistics_set_remove_chain(tmp_path: Path) -> None: catalog = InMemoryCatalog("test",warehouse = f"file://{tmp_path}") From e55ab1f33a2006299e967e20c69bc8fe161d3b4e Mon Sep 17 00:00:00 2001 From: Anshul Mishra Date: Fri, 26 Jun 2026 19:05:57 +0530 Subject: [PATCH 3/3] fixing typos --- tests/table/test_init.py | 89 ++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 6d52e662e4..3a18e5299d 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -1538,6 +1538,7 @@ def test_set_statistics_update(table_v2_with_statistics: Table) -> None: assert len(updated_statistics) == 1 assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected) + def test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_statistics: Table) -> None: snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id @@ -1588,6 +1589,7 @@ def test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_stat assert model_roundtrips(update_with_json) assert update_with_json.snapshot_id == snapshot_id + def test_remove_statistics_update(table_v2_with_statistics: Table) -> None: update = RemoveStatisticsUpdate( snapshot_id=3055729675574597004, @@ -1688,49 +1690,54 @@ def test_remove_partition_statistics_update_with_invalid_snapshot_id(table_v2_wi table_v2_with_statistics.metadata, (RemovePartitionStatisticsUpdate(snapshot_id=123456789),), ) -@pytest.mark.xfail(reason="remove_statistics uses = instead of +=, dropping preceeding updates") + + +@pytest.mark.xfail(reason="remove_statistics uses = instead of +=, dropping preceding updates") def test_update_statistics_set_remove_chain(tmp_path: Path) -> None: + catalog = InMemoryCatalog("test", warehouse=f"file://{tmp_path}") + catalog.create_namespace("default") + table = catalog.create_table("default.test", schema=Schema(NestedField(1, "x", LongType()))) + + snapshot_id_1 = 1111111111 + snapshot_id_2 = 2222222222 + + statistics_file_1 = StatisticsFile( + snapshot_id=snapshot_id_1, + statistics_path="s3://bucket/warehouse/stats1.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[ + BlobMetadata( + type="apache-datasketches-theta-v1", + snapshot_id=snapshot_id_1, + sequence_number=1, + fields=[1], + ) + ], + ) + + statistics_file_2 = StatisticsFile( + snapshot_id=snapshot_id_2, + statistics_path="s3://bucket/warehouse/stats2.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[ + BlobMetadata( + type="apache-datasketches-theta-v1", + snapshot_id=snapshot_id_2, + sequence_number=2, + fields=[1], + ) + ], + ) + + with table.update_statistics() as update: + update.set_statistics(statistics_file_1) + + with table.update_statistics() as update: + update.set_statistics(statistics_file_2).remove_statistics(snapshot_id_1) - catalog = InMemoryCatalog("test",warehouse = f"file://{tmp_path}") - catalog.create_namespace("default") - table = catalog.create_table("default.test",schema = Schema(NestedField(1,"x", LongType()))) - - snapshot_id_1 = 1111111111 - snapshot_id_2 = 2222222222 - - statistics_file_1 = StatisticsFile( - snapshot_id=snapshot_id_1, - statistics_path="s3://bucket/warehouse/stats1.puffin", - file_size_in_bytes=124, - file_footer_size_in_bytes=27, - blob_metadata=[BlobMetadata( - type="apache-datasketches-theta-v1", - snapshot_id=snapshot_id_1, - sequence_number=1, - fields=[1], - )], - ) - - statistics_file_2 = StatisticsFile( - snapshot_id=snapshot_id_2, - statistics_path="s3://bucket/warehouse/stats2.puffin", - file_size_in_bytes=124, - file_footer_size_in_bytes=27, - blob_metadata=[BlobMetadata( - type="apache-datasketches-theta-v1", - snapshot_id=snapshot_id_2, - sequence_number=2, - fields=[1], - )], - ) - - with table.update_statistics() as update: - update.set_statistics(statistics_file_1) - - with table.update_statistics() as update: - update.set_statistics(statistics_file_2).remove_statistics(snapshot_id_1) - - assert len(table.metadata.statistics) == 1 + assert len(table.metadata.statistics) == 1 def test_add_snapshot_update_fails_without_first_row_id(table_v3: Table) -> None: