MergeOnReadSpec:
A merge-on-read table
- should create no new files for merge-on-read
  + Given SQL:
CREATE TABLE polaris.my_namespace.MergeOnReadSpec (
  id int,
  label String,
  partitionKey long,
  date Date,
  timestamp Timestamp
) USING iceberg TBLPROPERTIES (
  'format-version' = '2',
  'write.delete.mode' = 'merge-on-read',
  'write.update.mode' = 'merge-on-read',
  'sort-order' = 'partitionKey ASC NULLS FIRST',
  'write.merge.mode' = 'merge-on-read'
) PARTITIONED BY (partitionKey); 
  + When we execute it 
  + Then there is an Iceberg table, polaris.my_namespace.MergeOnReadSpec 
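One way to confirm that the merge-on-read write modes actually took effect is to inspect the table properties. A minimal sketch, assuming the same Spark session and polaris catalog used above (SHOW TBLPROPERTIES is standard Spark SQL):

-- List all table properties and check the merge-on-read write modes
SHOW TBLPROPERTIES polaris.my_namespace.MergeOnReadSpec;

-- Or fetch a single property
SHOW TBLPROPERTIES polaris.my_namespace.MergeOnReadSpec ('write.update.mode');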
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  
- should create new data files on insert for merge-on-read
  + Given SQL:
INSERT INTO
  TABLE polaris.my_namespace.MergeOnReadSpec (id, label, partitionKey, date, timestamp)
VALUES
  (
    0,
    'label_0',
    0,
    cast(date_format('2024-11-15', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:19.792',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    1,
    'label_1',
    1,
    cast(date_format('2024-11-14', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:19.992',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    2,
    'label_2',
    2,
    cast(date_format('2024-11-13', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:20.192',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    3,
    'label_3',
    3,
    cast(date_format('2024-11-12', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:20.392',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    4,
    'label_4',
    4,
    cast(date_format('2024-11-11', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:20.592',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    5,
    'label_5',
    0,
    cast(date_format('2024-11-10', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:20.792',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    6,
    'label_6',
    1,
    cast(date_format('2024-11-09', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:20.992',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    7,
    'label_7',
    2,
    cast(date_format('2024-11-08', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:21.192',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    8,
    'label_8',
    3,
    cast(date_format('2024-11-07', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:21.392',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    9,
    'label_9',
    4,
    cast(date_format('2024-11-06', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:21.592',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    10,
    'label_10',
    0,
    cast(date_format('2024-11-05', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:21.792',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    11,
    'label_11',
    1,
    cast(date_format('2024-11-04', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:21.992',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    12,
    'label_12',
    2,
    cast(date_format('2024-11-03', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:22.192',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    13,
    'label_13',
    3,
    cast(date_format('2024-11-02', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:22.392',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    14,
    'label_14',
    4,
    cast(date_format('2024-11-01', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:22.592',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    15,
    'label_15',
    0,
    cast(date_format('2024-10-31', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:22.792',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    16,
    'label_16',
    1,
    cast(date_format('2024-10-30', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:22.992',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    17,
    'label_17',
    2,
    cast(date_format('2024-10-29', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:23.192',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    18,
    'label_18',
    3,
    cast(date_format('2024-10-28', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:23.392',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ),
  (
    19,
    'label_19',
    4,
    cast(date_format('2024-10-27', 'yyyy-MM-dd') as date),
    cast(
      date_format(
        '2024-11-15 16:48:23.592',
        'yyyy-MM-dd HH:mm:ss.SSS'
      ) as timestamp
    )
  ) 
  + When we execute it 
  + Then there are now 5 data files:
/partitionKey=0/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00001.parquet
/partitionKey=1/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00002.parquet
/partitionKey=2/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00004.parquet
/partitionKey=3/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00003.parquet
/partitionKey=4/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00005.parquet 
  + And the table contains:
Datum(0,label_0,0,2024-11-15,2024-11-15 16:48:19.792)
Datum(1,label_1,1,2024-11-14,2024-11-15 16:48:19.992)
Datum(2,label_2,2,2024-11-13,2024-11-15 16:48:20.192)
Datum(3,label_3,3,2024-11-12,2024-11-15 16:48:20.392)
Datum(4,label_4,4,2024-11-11,2024-11-15 16:48:20.592)
Datum(5,label_5,0,2024-11-10,2024-11-15 16:48:20.792)
Datum(6,label_6,1,2024-11-09,2024-11-15 16:48:20.992)
Datum(7,label_7,2,2024-11-08,2024-11-15 16:48:21.192)
Datum(8,label_8,3,2024-11-07,2024-11-15 16:48:21.392)
Datum(9,label_9,4,2024-11-06,2024-11-15 16:48:21.592)
Datum(10,label_10,0,2024-11-05,2024-11-15 16:48:21.792)
Datum(11,label_11,1,2024-11-04,2024-11-15 16:48:21.992)
Datum(12,label_12,2,2024-11-03,2024-11-15 16:48:22.192)
Datum(13,label_13,3,2024-11-02,2024-11-15 16:48:22.392)
Datum(14,label_14,4,2024-11-01,2024-11-15 16:48:22.592)
Datum(15,label_15,0,2024-10-31,2024-11-15 16:48:22.792)
Datum(16,label_16,1,2024-10-30,2024-11-15 16:48:22.992)
Datum(17,label_17,2,2024-10-29,2024-11-15 16:48:23.192)
Datum(18,label_18,3,2024-10-28,2024-11-15 16:48:23.392)
Datum(19,label_19,4,2024-10-27,2024-11-15 16:48:23.592)
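
The file listing above can be cross-checked from SQL. A minimal sketch, assuming Iceberg's metadata tables are exposed through the polaris catalog (in the files metadata table, content = 0 marks data files; 1 and 2 mark delete files):

-- List the data files produced by the INSERT; we expect one per partitionKey value
SELECT file_path, record_count
FROM polaris.my_namespace.MergeOnReadSpec.files
WHERE content = 0
ORDER BY file_path;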
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  
- should not rewrite existing data files on update for merge-on-read
  + Given SQL:
UPDATE
  polaris.my_namespace.MergeOnReadSpec
SET
  label = 'label_1X'
WHERE
  id = 1 
  + When we execute it 
  + Then there are now 7 files (6 data files plus 1 position-delete file):
/partitionKey=0/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00001.parquet
/partitionKey=1/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00002.parquet
/partitionKey=1/00000-735-e229a473-b510-4706-8c06-f8d51561a0c8-00001-deletes.parquet
/partitionKey=1/00000-735-e229a473-b510-4706-8c06-f8d51561a0c8-00001.parquet
/partitionKey=2/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00004.parquet
/partitionKey=3/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00003.parquet
/partitionKey=4/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00005.parquet 
  + And the table contains:
Datum(0,label_0,0,2024-11-15,2024-11-15 16:48:19.792)
Datum(1,label_1X,1,2024-11-14,2024-11-15 16:48:19.992)
Datum(2,label_2,2,2024-11-13,2024-11-15 16:48:20.192)
Datum(3,label_3,3,2024-11-12,2024-11-15 16:48:20.392)
Datum(4,label_4,4,2024-11-11,2024-11-15 16:48:20.592)
Datum(5,label_5,0,2024-11-10,2024-11-15 16:48:20.792)
Datum(6,label_6,1,2024-11-09,2024-11-15 16:48:20.992)
Datum(7,label_7,2,2024-11-08,2024-11-15 16:48:21.192)
Datum(8,label_8,3,2024-11-07,2024-11-15 16:48:21.392)
Datum(9,label_9,4,2024-11-06,2024-11-15 16:48:21.592)
Datum(10,label_10,0,2024-11-05,2024-11-15 16:48:21.792)
Datum(11,label_11,1,2024-11-04,2024-11-15 16:48:21.992)
Datum(12,label_12,2,2024-11-03,2024-11-15 16:48:22.192)
Datum(13,label_13,3,2024-11-02,2024-11-15 16:48:22.392)
Datum(14,label_14,4,2024-11-01,2024-11-15 16:48:22.592)
Datum(15,label_15,0,2024-10-31,2024-11-15 16:48:22.792)
Datum(16,label_16,1,2024-10-30,2024-11-15 16:48:22.992)
Datum(17,label_17,2,2024-10-29,2024-11-15 16:48:23.192)
Datum(18,label_18,3,2024-10-28,2024-11-15 16:48:23.392)
Datum(19,label_19,4,2024-10-27,2024-11-15 16:48:23.592)
 
  + And the '*-deletes.parquet' file /partitionKey=1/00000-735-e229a473-b510-4706-8c06-f8d51561a0c8-00001-deletes.parquet contains a reference to /data/partitionKey=1/00000-731-a12be8a3-6603-4583-bbf7-dc605b4b0dd4-0-00002.parquet, which still contains:
Datum(1,label_1,1,2024-11-14,2024-11-15 16:48:19.992)
Datum(6,label_6,1,2024-11-09,2024-11-15 16:48:20.992)
Datum(11,label_11,1,2024-11-04,2024-11-15 16:48:21.992)
Datum(16,label_16,1,2024-10-30,2024-11-15 16:48:22.992)
 
  + And the new parquet file (/partitionKey=1/00000-735-e229a473-b510-4706-8c06-f8d51561a0c8-00001.parquet) contains:
Datum(1,label_1X,1,2024-11-14,2024-11-15 16:48:19.992)
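
The delete file and the rewritten row can also be inspected through the Iceberg metadata tables. A minimal sketch, assuming the same catalog; the position_deletes metadata table additionally assumes a reasonably recent Iceberg release:

-- The merge-on-read UPDATE should have added one position-delete file (content = 1)
SELECT content, file_path, record_count
FROM polaris.my_namespace.MergeOnReadSpec.delete_files;

-- The delete file's row-level reference: which data file and row position was invalidated
SELECT file_path, pos
FROM polaris.my_namespace.MergeOnReadSpec.position_deletes;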
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  
- should read an updated table using merge-on-read
  + Given a table that has been updated 
  + When we read from it 
  + Then the table still contains 20 records 
  + And there are no new data files 
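A read applies the position-delete file at scan time, so a plain query already reflects the update without any rewrite. A minimal sketch of that check, assuming the same session:

-- Still 20 live rows, with the updated label visible
SELECT count(*) FROM polaris.my_namespace.MergeOnReadSpec;
SELECT label FROM polaris.my_namespace.MergeOnReadSpec WHERE id = 1;

-- Reads do not commit snapshots, so the snapshot list should be unchanged
SELECT snapshot_id, operation FROM polaris.my_namespace.MergeOnReadSpec.snapshots;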
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +  