MergeOnReadSpec:
A merge-on-read table
- should create no new files for merge-on-read
+ Given SQL:
CREATE TABLE polaris.my_namespace.MergeOnReadSpec (
id int,
label String,
partitionKey long,
date Date,
timestamp Timestamp
) USING iceberg TBLPROPERTIES (
'format-version' = '2',
'write.delete.mode' = 'merge-on-read',
'write.update.mode' = 'merge-on-read',
'sort-order' = 'partitionKey ASC NULLS FIRST',
'write.merge.mode' = 'merge-on-read'
) PARTITIONED BY (partitionKey);
+ When we execute it
+ Then there is an Iceberg table, polaris.my_namespace.MergeOnReadSpec
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
- should insert creates new files for merge-on-read
+ Given SQL:
INSERT INTO
TABLE polaris.my_namespace.MergeOnReadSpec (id, label, partitionKey, date, timestamp)
VALUES
(
0,
'label_0',
0,
cast(date_format('2025-05-13', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:37:59.688',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
1,
'label_1',
1,
cast(date_format('2025-05-12', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:37:59.888',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
2,
'label_2',
2,
cast(date_format('2025-05-11', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:00.088',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
3,
'label_3',
3,
cast(date_format('2025-05-10', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:00.288',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
4,
'label_4',
4,
cast(date_format('2025-05-09', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:00.488',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
5,
'label_5',
0,
cast(date_format('2025-05-08', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:00.688',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
6,
'label_6',
1,
cast(date_format('2025-05-07', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:00.888',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
7,
'label_7',
2,
cast(date_format('2025-05-06', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:01.088',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
8,
'label_8',
3,
cast(date_format('2025-05-05', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:01.288',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
9,
'label_9',
4,
cast(date_format('2025-05-04', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:01.488',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
10,
'label_10',
0,
cast(date_format('2025-05-03', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:01.688',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
11,
'label_11',
1,
cast(date_format('2025-05-02', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:01.888',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
12,
'label_12',
2,
cast(date_format('2025-05-01', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:02.088',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
13,
'label_13',
3,
cast(date_format('2025-04-30', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:02.288',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
14,
'label_14',
4,
cast(date_format('2025-04-29', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:02.488',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
15,
'label_15',
0,
cast(date_format('2025-04-28', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:02.688',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
16,
'label_16',
1,
cast(date_format('2025-04-27', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:02.888',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
17,
'label_17',
2,
cast(date_format('2025-04-26', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:03.088',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
18,
'label_18',
3,
cast(date_format('2025-04-25', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:03.288',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
),
(
19,
'label_19',
4,
cast(date_format('2025-04-24', 'yyyy-MM-dd') as date),
cast(
date_format(
'2025-05-13 16:38:03.488',
'yyyy-MM-dd HH:mm:ss.SSS'
) as timestamp
)
)
+ When we execute it
+ Then there are now 5 data files:
/partitionKey=0/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00001.parquet
/partitionKey=1/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00002.parquet
/partitionKey=2/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00004.parquet
/partitionKey=3/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00003.parquet
/partitionKey=4/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00005.parquet
+ And the table contains:
Datum(0,label_0,0,2025-05-13,2025-05-13 16:37:59.688)
Datum(1,label_1,1,2025-05-12,2025-05-13 16:37:59.888)
Datum(2,label_2,2,2025-05-11,2025-05-13 16:38:00.088)
Datum(3,label_3,3,2025-05-10,2025-05-13 16:38:00.288)
Datum(4,label_4,4,2025-05-09,2025-05-13 16:38:00.488)
Datum(5,label_5,0,2025-05-08,2025-05-13 16:38:00.688)
Datum(6,label_6,1,2025-05-07,2025-05-13 16:38:00.888)
Datum(7,label_7,2,2025-05-06,2025-05-13 16:38:01.088)
Datum(8,label_8,3,2025-05-05,2025-05-13 16:38:01.288)
Datum(9,label_9,4,2025-05-04,2025-05-13 16:38:01.488)
Datum(10,label_10,0,2025-05-03,2025-05-13 16:38:01.688)
Datum(11,label_11,1,2025-05-02,2025-05-13 16:38:01.888)
Datum(12,label_12,2,2025-05-01,2025-05-13 16:38:02.088)
Datum(13,label_13,3,2025-04-30,2025-05-13 16:38:02.288)
Datum(14,label_14,4,2025-04-29,2025-05-13 16:38:02.488)
Datum(15,label_15,0,2025-04-28,2025-05-13 16:38:02.688)
Datum(16,label_16,1,2025-04-27,2025-05-13 16:38:02.888)
Datum(17,label_17,2,2025-04-26,2025-05-13 16:38:03.088)
Datum(18,label_18,3,2025-04-25,2025-05-13 16:38:03.288)
Datum(19,label_19,4,2025-04-24,2025-05-13 16:38:03.488)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
- should update creates no new files for merge-on-read
+ Given SQL:
UPDATE
polaris.my_namespace.MergeOnReadSpec
SET
label = 'label_1X'
WHERE
id = 1
+ When we execute it
+ Then there are now 7 data files:
/partitionKey=0/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00001.parquet
/partitionKey=1/00000-1000-d5c280a5-ab4d-4e9a-96ed-893c9dfd61d9-00001-deletes.parquet
/partitionKey=1/00000-1000-d5c280a5-ab4d-4e9a-96ed-893c9dfd61d9-00001.parquet
/partitionKey=1/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00002.parquet
/partitionKey=2/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00004.parquet
/partitionKey=3/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00003.parquet
/partitionKey=4/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00005.parquet
+ And the table contains:
Datum(0,label_0,0,2025-05-13,2025-05-13 16:37:59.688)
Datum(1,label_1X,1,2025-05-12,2025-05-13 16:37:59.888)
Datum(2,label_2,2,2025-05-11,2025-05-13 16:38:00.088)
Datum(3,label_3,3,2025-05-10,2025-05-13 16:38:00.288)
Datum(4,label_4,4,2025-05-09,2025-05-13 16:38:00.488)
Datum(5,label_5,0,2025-05-08,2025-05-13 16:38:00.688)
Datum(6,label_6,1,2025-05-07,2025-05-13 16:38:00.888)
Datum(7,label_7,2,2025-05-06,2025-05-13 16:38:01.088)
Datum(8,label_8,3,2025-05-05,2025-05-13 16:38:01.288)
Datum(9,label_9,4,2025-05-04,2025-05-13 16:38:01.488)
Datum(10,label_10,0,2025-05-03,2025-05-13 16:38:01.688)
Datum(11,label_11,1,2025-05-02,2025-05-13 16:38:01.888)
Datum(12,label_12,2,2025-05-01,2025-05-13 16:38:02.088)
Datum(13,label_13,3,2025-04-30,2025-05-13 16:38:02.288)
Datum(14,label_14,4,2025-04-29,2025-05-13 16:38:02.488)
Datum(15,label_15,0,2025-04-28,2025-05-13 16:38:02.688)
Datum(16,label_16,1,2025-04-27,2025-05-13 16:38:02.888)
Datum(17,label_17,2,2025-04-26,2025-05-13 16:38:03.088)
Datum(18,label_18,3,2025-04-25,2025-05-13 16:38:03.288)
Datum(19,label_19,4,2025-04-24,2025-05-13 16:38:03.488)
+ And the '*-deletes.parquet' file /partitionKey=1/00000-1000-d5c280a5-ab4d-4e9a-96ed-893c9dfd61d9-00001-deletes.parquet contains a reference to /data/partitionKey=1/00000-992-56cb8eec-e0d8-4468-8d9d-396c41c88620-0-00002.parquet which contains:
Datum(1,label_1,1,2025-05-12,2025-05-13 16:37:59.888)
Datum(6,label_6,1,2025-05-07,2025-05-13 16:38:00.888)
Datum(11,label_11,1,2025-05-02,2025-05-13 16:38:01.888)
Datum(16,label_16,1,2025-04-27,2025-05-13 16:38:02.888)
+ And the new parquet file (/partitionKey=1/00000-1000-d5c280a5-ab4d-4e9a-96ed-893c9dfd61d9-00001.parquet) contains:
Datum(1,label_1X,1,2025-05-12,2025-05-13 16:37:59.888)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
- should reading an updated table using merge-on-read
+ Given a table that has been updated
+ When we read from it
+ Then the table still contains 20 records
+ And there are no new data files
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +