diff --git a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java index 3146a3763cce..94ec7606b588 100644 --- a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java @@ -26,6 +26,9 @@ public class UUIDUtil { private static final SecureRandom SECURE_RANDOM = new SecureRandom(); + private static final byte[] HEX_DIGITS = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; private UUIDUtil() {} @@ -82,6 +85,106 @@ public static ByteBuffer convertToByteBuffer(UUID value, ByteBuffer reuse) { return buffer; } + /** + * Parses the ASCII bytes of a canonical UUID string (36 bytes, layout {@code 8-4-4-4-12}) + * directly into the 16-byte big-endian representation, without allocating an intermediate {@link + * String} or {@link UUID}. + * + *

This is intended for hot write paths where the UUID is already available as the ASCII bytes + * of its string form (for example {@code UTF8String#getBytes()}). The output is byte-for-byte + * identical to {@code convertToByteBuffer(UUID.fromString(s), reuse)}. + * + * @param uuidStringBytes ASCII bytes of a canonical UUID string (length 36) + * @param reuse a 16-byte buffer to reuse, or null to allocate a new one + * @return a big-endian buffer holding the 16-byte UUID representation + */ + public static ByteBuffer convertToByteBuffer(byte[] uuidStringBytes, ByteBuffer reuse) { + Preconditions.checkArgument( + uuidStringBytes.length == 36, + "Invalid UUID string: expected 36 ASCII bytes, got %s", + uuidStringBytes.length); + checkDash(uuidStringBytes, 8); + checkDash(uuidStringBytes, 13); + checkDash(uuidStringBytes, 18); + checkDash(uuidStringBytes, 23); + + long mostSigBits = hexToLong(uuidStringBytes, 0, 8); + mostSigBits <<= 16; + mostSigBits |= hexToLong(uuidStringBytes, 9, 13); + mostSigBits <<= 16; + mostSigBits |= hexToLong(uuidStringBytes, 14, 18); + + long leastSigBits = hexToLong(uuidStringBytes, 19, 23); + leastSigBits <<= 48; + leastSigBits |= hexToLong(uuidStringBytes, 24, 36); + + ByteBuffer buffer = reuse != null ? reuse : ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + buffer.putLong(0, mostSigBits); + buffer.putLong(8, leastSigBits); + return buffer; + } + + /** + * Renders the 16-byte big-endian UUID representation as the ASCII bytes of its canonical string + * form (36 bytes, layout {@code 8-4-4-4-12}), without allocating an intermediate {@link UUID} or + * {@link String}. + * + *

This is intended for hot read paths: the result can be wrapped with {@code + * UTF8String#fromBytes} without an intermediate {@link String}. The output is byte-for-byte + * identical to {@code convert(uuidBytes).toString().getBytes(US_ASCII)}. Two longs are read + * relatively from the buffer's current position, mirroring {@link #convert(ByteBuffer)}. + * + * @param uuidBytes a buffer positioned at the 16 UUID bytes (big-endian) + * @param reuse a 36-byte array to reuse, or null to allocate a new one + * @return a 36-byte array holding the ASCII bytes of the canonical UUID string + */ + public static byte[] convertToStringBytes(ByteBuffer uuidBytes, byte[] reuse) { + Preconditions.checkArgument( + reuse == null || reuse.length == 36, + "Invalid reuse buffer: expected 36 bytes, got %s", + reuse == null ? 0 : reuse.length); + long mostSigBits = uuidBytes.getLong(); + long leastSigBits = uuidBytes.getLong(); + + byte[] out = reuse != null ? reuse : new byte[36]; + formatHex(out, 0, mostSigBits >>> 32, 8); + out[8] = '-'; + formatHex(out, 9, mostSigBits >>> 16, 4); + out[13] = '-'; + formatHex(out, 14, mostSigBits, 4); + out[18] = '-'; + formatHex(out, 19, leastSigBits >>> 48, 4); + out[23] = '-'; + formatHex(out, 24, leastSigBits, 12); + return out; + } + + private static void checkDash(byte[] bytes, int pos) { + Preconditions.checkArgument( + bytes[pos] == '-', "Invalid UUID string: expected '-' at position %s", pos); + } + + private static long hexToLong(byte[] bytes, int start, int end) { + long result = 0; + for (int i = start; i < end; i += 1) { + int digit = Character.digit((char) (bytes[i] & 0xFF), 16); + Preconditions.checkArgument( + digit >= 0, "Invalid UUID string: not a hex digit at position %s", i); + result = (result << 4) | digit; + } + + return result; + } + + private static void formatHex(byte[] out, int offset, long value, int digits) { + long bits = value; + for (int i = digits - 1; i >= 0; i -= 1) { + out[offset + i] = HEX_DIGITS[(int) (bits & 0xF)]; + bits >>>= 4; + } + } + /** * Generate a RFC 9562 UUIDv7. * diff --git a/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java b/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java index c5f85c2f20b3..db0efc69444b 100644 --- a/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java +++ b/api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java @@ -19,7 +19,11 @@ package org.apache.iceberg.util; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import java.util.UUID; import org.junit.jupiter.api.Test; @@ -31,4 +35,123 @@ public void uuidV7HasVersionAndVariant() { assertThat(uuid.version()).isEqualTo(7); assertThat(uuid.variant()).isEqualTo(2); } + + @Test + public void convertStringBytesMatchesUuidConversion() { + for (int i = 0; i < 100; i += 1) { + UUID uuid = UUID.randomUUID(); + byte[] stringBytes = uuid.toString().getBytes(StandardCharsets.US_ASCII); + + byte[] fromString = UUIDUtil.convertToByteBuffer(stringBytes, null).array(); + byte[] fromUuid = UUIDUtil.convertToByteBuffer(uuid, null).array(); + + assertThat(fromString).containsExactly(fromUuid).containsExactly(UUIDUtil.convert(uuid)); + } + } + + @Test + public void convertStringBytesHandlesEdgeValues() { + for (String value : + new String[] { + "00000000-0000-0000-0000-000000000000", + "ffffffff-ffff-ffff-ffff-ffffffffffff", + "12345678-90ab-cdef-1234-567890abcdef" + }) { + UUID uuid = UUID.fromString(value); + byte[] stringBytes = value.getBytes(StandardCharsets.US_ASCII); + + assertThat(UUIDUtil.convertToByteBuffer(stringBytes, null).array()) + .containsExactly(UUIDUtil.convert(uuid)); + } + } + + @Test + public void convertStringBytesAcceptsUppercaseHex() { + UUID uuid = UUID.randomUUID(); + byte[] upper = uuid.toString().toUpperCase(Locale.ROOT).getBytes(StandardCharsets.US_ASCII); + + assertThat(UUIDUtil.convertToByteBuffer(upper, null).array()) + .containsExactly(UUIDUtil.convert(uuid)); + } + + @Test + public void convertStringBytesReusesBuffer() { + UUID uuid = UUID.randomUUID(); + byte[] stringBytes = uuid.toString().getBytes(StandardCharsets.US_ASCII); + + ByteBuffer reuse = ByteBuffer.allocate(16); + ByteBuffer result = UUIDUtil.convertToByteBuffer(stringBytes, reuse); + + assertThat(result).isSameAs(reuse); + assertThat(result.array()).containsExactly(UUIDUtil.convert(uuid)); + } + + @Test + public void convertToStringBytesMatchesUuidToString() { + for (int i = 0; i < 100; i += 1) { + UUID uuid = UUID.randomUUID(); + ByteBuffer raw = UUIDUtil.convertToByteBuffer(uuid, null); + + byte[] stringBytes = UUIDUtil.convertToStringBytes(raw, null); + + assertThat(new String(stringBytes, StandardCharsets.US_ASCII)).isEqualTo(uuid.toString()); + } + } + + @Test + public void convertToStringBytesIsRoundTrippable() { + UUID uuid = UUID.randomUUID(); + + byte[] stringBytes = + UUIDUtil.convertToStringBytes(UUIDUtil.convertToByteBuffer(uuid, null), null); + byte[] raw = UUIDUtil.convertToByteBuffer(stringBytes, null).array(); + + assertThat(UUIDUtil.convert(raw)).isEqualTo(uuid); + } + + @Test + public void convertToStringBytesReusesBuffer() { + UUID uuid = UUID.randomUUID(); + + byte[] reuse = new byte[36]; + byte[] result = UUIDUtil.convertToStringBytes(UUIDUtil.convertToByteBuffer(uuid, null), reuse); + + assertThat(result).isSameAs(reuse); + assertThat(new String(result, StandardCharsets.US_ASCII)).isEqualTo(uuid.toString()); + } + + @Test + public void convertToByteBufferRejectsMalformedInput() { + assertThatThrownBy( + () -> + UUIDUtil.convertToByteBuffer("too-short".getBytes(StandardCharsets.US_ASCII), null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("expected 36 ASCII bytes"); + + // valid length, but the dash positions hold non-dash characters + assertThatThrownBy( + () -> + UUIDUtil.convertToByteBuffer( + "000000000000000000000000000000000000".getBytes(StandardCharsets.US_ASCII), + null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("expected '-'"); + + // valid layout, but a non-hex character in a hex group + assertThatThrownBy( + () -> + UUIDUtil.convertToByteBuffer( + "zzzzzzzz-0000-0000-0000-000000000000".getBytes(StandardCharsets.US_ASCII), + null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("not a hex digit"); + } + + @Test + public void convertToStringBytesRejectsWrongReuseLength() { + ByteBuffer raw = UUIDUtil.convertToByteBuffer(UUID.randomUUID(), null); + assertThatThrownBy(() -> UUIDUtil.convertToStringBytes(raw, new byte[16])) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("expected 36 bytes"); + } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 670537fbf872..52e8aa6d561e 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -186,7 +186,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; ByteBuffer buffer = ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index 7f9810e4c60c..27029e757460 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -20,7 +20,6 @@ import java.nio.ByteBuffer; import java.util.List; -import java.util.UUID; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; import org.apache.iceberg.orc.OrcValueWriter; @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes // would then overwrite previous values - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString())); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null); ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index f9300099fe25..e9efe4562409 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -415,7 +415,9 @@ private static class UUIDReader extends PrimitiveReader { @Override @SuppressWarnings("ByteBufferBackingArray") public UTF8String read(UTF8String ignored) { - return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes( + UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null)); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index a1ed6c66f337..99a74d5c06b8 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -25,7 +25,6 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.UUID; import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; @@ -436,8 +435,7 @@ private UUIDWriter(ColumnDescriptor desc) { @Override public void write(int repetitionLevel, UTF8String string) { - UUID uuid = UUID.fromString(string.toString()); - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer)); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 7e65535f5ecb..81dc3c9bd5cc 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -141,7 +141,8 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { decoder.readFixed(buffer.array(), 0, 16); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index bb8218bd83df..01870518c4b4 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -23,7 +23,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.List; -import java.util.UUID; import org.apache.avro.io.Encoder; import org.apache.avro.util.Utf8; import org.apache.iceberg.avro.ValueWriter; @@ -101,10 +100,8 @@ private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") public void write(UTF8String s, Encoder encoder) throws IOException { - // TODO: direct conversion from string to byte buffer - UUID uuid = UUID.fromString(s.toString()); // calling array() is safe because the buffer is always allocated by the thread-local - encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array()); + encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array()); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 670537fbf872..52e8aa6d561e 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -186,7 +186,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; ByteBuffer buffer = ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index 7f9810e4c60c..27029e757460 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -20,7 +20,6 @@ import java.nio.ByteBuffer; import java.util.List; -import java.util.UUID; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; import org.apache.iceberg.orc.OrcValueWriter; @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes // would then overwrite previous values - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString())); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null); ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 87c97cc7a663..062ddff91760 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -378,7 +378,9 @@ private static class UUIDReader extends PrimitiveReader { @Override @SuppressWarnings("ByteBufferBackingArray") public UTF8String read(UTF8String ignored) { - return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes( + UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null)); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index a1ed6c66f337..99a74d5c06b8 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -25,7 +25,6 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.UUID; import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; @@ -436,8 +435,7 @@ private UUIDWriter(ColumnDescriptor desc) { @Override public void write(int repetitionLevel, UTF8String string) { - UUID uuid = UUID.fromString(string.toString()); - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer)); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 7e65535f5ecb..81dc3c9bd5cc 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -141,7 +141,8 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { decoder.readFixed(buffer.array(), 0, 16); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index bb8218bd83df..01870518c4b4 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -23,7 +23,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.List; -import java.util.UUID; import org.apache.avro.io.Encoder; import org.apache.avro.util.Utf8; import org.apache.iceberg.avro.ValueWriter; @@ -101,10 +100,8 @@ private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") public void write(UTF8String s, Encoder encoder) throws IOException { - // TODO: direct conversion from string to byte buffer - UUID uuid = UUID.fromString(s.toString()); // calling array() is safe because the buffer is always allocated by the thread-local - encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array()); + encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array()); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 670537fbf872..52e8aa6d561e 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -186,7 +186,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; ByteBuffer buffer = ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index 7f9810e4c60c..27029e757460 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -20,7 +20,6 @@ import java.nio.ByteBuffer; import java.util.List; -import java.util.UUID; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; import org.apache.iceberg.orc.OrcValueWriter; @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes // would then overwrite previous values - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString())); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null); ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 28a9a31c6a6e..141df685f2b6 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -395,7 +395,9 @@ private static class UUIDReader extends PrimitiveReader { @Override @SuppressWarnings("ByteBufferBackingArray") public UTF8String read(UTF8String ignored) { - return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes( + UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null)); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index ba816efc0ac8..c6411cd88472 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,7 +26,6 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.UUID; import java.util.stream.IntStream; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; @@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) { @Override public void write(int repetitionLevel, UTF8String string) { - UUID uuid = UUID.fromString(string.toString()); - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer)); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 7bcb3e3fae01..729a27ee294d 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -146,7 +146,8 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { decoder.readFixed(buffer.array(), 0, 16); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index d0d3483a7690..9075ca397044 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -23,7 +23,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.List; -import java.util.UUID; import org.apache.avro.io.Encoder; import org.apache.avro.util.Utf8; import org.apache.iceberg.avro.ValueWriter; @@ -107,10 +106,8 @@ private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") public void write(UTF8String s, Encoder encoder) throws IOException { - // TODO: direct conversion from string to byte buffer - UUID uuid = UUID.fromString(s.toString()); // calling array() is safe because the buffer is always allocated by the thread-local - encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array()); + encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array()); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 67664ac6c753..ca23c75538b7 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -193,7 +193,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; ByteBuffer buffer = ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index 7f9810e4c60c..27029e757460 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -20,7 +20,6 @@ import java.nio.ByteBuffer; import java.util.List; -import java.util.UUID; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; import org.apache.iceberg.orc.OrcValueWriter; @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { // ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so // can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes // would then overwrite previous values - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString())); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null); ((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index a19ed8060737..0ea70e3b0a26 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -397,7 +397,9 @@ private static class UUIDReader extends PrimitiveReader { @Override @SuppressWarnings("ByteBufferBackingArray") public UTF8String read(UTF8String ignored) { - return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes( + UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null)); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index ba816efc0ac8..c6411cd88472 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,7 +26,6 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.UUID; import java.util.stream.IntStream; import java.util.stream.Stream; import org.apache.iceberg.FieldMetrics; @@ -456,8 +455,7 @@ private UUIDWriter(ColumnDescriptor desc) { @Override public void write(int repetitionLevel, UTF8String string) { - UUID uuid = UUID.fromString(string.toString()); - ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()); + ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer)); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 7bcb3e3fae01..729a27ee294d 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -146,7 +146,8 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { decoder.readFixed(buffer.array(), 0, 16); - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); + // a fresh array is required because UTF8String.fromBytes wraps it without copying + return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null)); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index d0d3483a7690..9075ca397044 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -23,7 +23,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.List; -import java.util.UUID; import org.apache.avro.io.Encoder; import org.apache.avro.util.Utf8; import org.apache.iceberg.avro.ValueWriter; @@ -107,10 +106,8 @@ private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") public void write(UTF8String s, Encoder encoder) throws IOException { - // TODO: direct conversion from string to byte buffer - UUID uuid = UUID.fromString(s.toString()); // calling array() is safe because the buffer is always allocated by the thread-local - encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array()); + encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array()); } }