Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions api/src/main/java/org/apache/iceberg/util/UUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

public class UUIDUtil {
// Shared source of cryptographically strong random numbers.
private static final SecureRandom SECURE_RANDOM = new SecureRandom();
// Lowercase hexadecimal digits as ASCII bytes, indexed by nibble value (0-15); formatHex uses
// this table to render each 4-bit group of a UUID.
private static final byte[] HEX_DIGITS = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};

// Private no-op constructor: prevents instantiation from outside this class.
private UUIDUtil() {}

Expand Down Expand Up @@ -82,6 +85,106 @@ public static ByteBuffer convertToByteBuffer(UUID value, ByteBuffer reuse) {
return buffer;
}

/**
 * Converts the ASCII bytes of a canonical UUID string (36 bytes, grouped as {@code 8-4-4-4-12})
 * into the 16-byte big-endian UUID representation, without creating an intermediate {@link
 * String} or {@link UUID}.
 *
 * <p>Meant for hot write paths that already hold the UUID as the ASCII bytes of its string form
 * (for example {@code UTF8String#getBytes()}). The input is validated strictly: it must be
 * exactly 36 bytes long, bytes 8, 13, 18 and 23 must be {@code '-'}, and every other byte must be
 * a hexadecimal digit (upper or lower case). For such input the result is byte-for-byte identical
 * to {@code convertToByteBuffer(UUID.fromString(s), reuse)}.
 *
 * <p>The two longs are written with absolute puts at offsets 0 and 8, so this method does not
 * move the position of the returned buffer. A supplied {@code reuse} buffer has its byte order
 * set to big-endian.
 *
 * @param uuidStringBytes ASCII bytes of a canonical UUID string (length 36)
 * @param reuse a 16-byte buffer to write into, or null to allocate a new one
 * @return {@code reuse} when it is non-null, otherwise a newly allocated 16-byte buffer, holding
 *     the big-endian UUID representation
 * @throws IllegalArgumentException if the length, a dash position, or a hex digit is invalid
 */
public static ByteBuffer convertToByteBuffer(byte[] uuidStringBytes, ByteBuffer reuse) {
  int length = uuidStringBytes.length;
  Preconditions.checkArgument(
      length == 36, "Invalid UUID string: expected 36 ASCII bytes, got %s", length);

  // the four separators sit at offsets 8, 13, 18 and 23, i.e. exactly 5 bytes apart
  for (int dashPos = 8; dashPos <= 23; dashPos += 5) {
    checkDash(uuidStringBytes, dashPos);
  }

  // groups are parsed left to right so that the first invalid digit is the one reported
  long group1 = hexToLong(uuidStringBytes, 0, 8);
  long group2 = hexToLong(uuidStringBytes, 9, 13);
  long group3 = hexToLong(uuidStringBytes, 14, 18);
  long group4 = hexToLong(uuidStringBytes, 19, 23);
  long group5 = hexToLong(uuidStringBytes, 24, 36);

  // 8 + 4 + 4 hex digits fill the high long; 4 + 12 hex digits fill the low long
  long high = (group1 << 32) | (group2 << 16) | group3;
  long low = (group4 << 48) | group5;

  ByteBuffer target;
  if (reuse == null) {
    target = ByteBuffer.allocate(16);
  } else {
    target = reuse;
  }

  target.order(ByteOrder.BIG_ENDIAN);
  target.putLong(0, high);
  target.putLong(8, low);
  return target;
}

/**
 * Renders the 16-byte big-endian UUID representation as the ASCII bytes of its canonical string
 * form (36 bytes, layout {@code 8-4-4-4-12}, lowercase hex), without allocating an intermediate
 * {@link UUID} or {@link String}.
 *
 * <p>This is intended for hot read paths: the result can be wrapped with {@code
 * UTF8String#fromBytes} without an intermediate {@link String}. The output is byte-for-byte
 * identical to {@code convert(uuidBytes).toString().getBytes(US_ASCII)}.
 *
 * <p>Two longs are read relatively from the buffer's current position, mirroring {@link
 * #convert(ByteBuffer)}, so on success the buffer's position is advanced by 16. The reads use the
 * buffer's current byte order, which must be big-endian for the result to be correct. Both
 * arguments are validated before anything is read, so a rejected call leaves the buffer's
 * position untouched.
 *
 * @param uuidBytes a buffer positioned at the 16 UUID bytes (big-endian)
 * @param reuse a 36-byte array to reuse, or null to allocate a new one
 * @return {@code reuse} when it is non-null, otherwise a new 36-byte array, holding the ASCII
 *     bytes of the canonical UUID string
 * @throws IllegalArgumentException if {@code reuse} is non-null and not 36 bytes long, or if
 *     {@code uuidBytes} has fewer than 16 bytes remaining
 */
public static byte[] convertToStringBytes(ByteBuffer uuidBytes, byte[] reuse) {
  Preconditions.checkArgument(
      reuse == null || reuse.length == 36,
      "Invalid reuse buffer: expected 36 bytes, got %s",
      reuse == null ? 0 : reuse.length);
  // reject short input up front: otherwise the first getLong() could succeed and move the
  // position before the second one fails with a bare BufferUnderflowException
  Preconditions.checkArgument(
      uuidBytes.remaining() >= 16,
      "Invalid UUID bytes: expected at least 16 remaining bytes, got %s",
      uuidBytes.remaining());

  long mostSigBits = uuidBytes.getLong();
  long leastSigBits = uuidBytes.getLong();

  byte[] out = reuse != null ? reuse : new byte[36];
  // formatHex only renders the lowest `digits` nibbles, so each group is shifted down to the
  // low end of the long first; the dashes go to offsets 8, 13, 18 and 23
  formatHex(out, 0, mostSigBits >>> 32, 8);
  out[8] = '-';
  formatHex(out, 9, mostSigBits >>> 16, 4);
  out[13] = '-';
  formatHex(out, 14, mostSigBits, 4);
  out[18] = '-';
  formatHex(out, 19, leastSigBits >>> 48, 4);
  out[23] = '-';
  formatHex(out, 24, leastSigBits, 12);
  return out;
}

// Verifies that the byte at the given offset is the '-' separator of a canonical UUID string;
// fails the argument check otherwise.
private static void checkDash(byte[] bytes, int pos) {
  boolean isDash = '-' == bytes[pos];
  Preconditions.checkArgument(isDash, "Invalid UUID string: expected '-' at position %s", pos);
}

// Parses the hex digits in bytes[start, end) into a long, most significant digit first.
// Upper and lower case digits are accepted; any other byte fails the argument check with the
// offending position in the message.
private static long hexToLong(byte[] bytes, int start, int end) {
  long accumulated = 0;
  int index = start;
  while (index < end) {
    // mask to 0..255 so bytes >= 0x80 are never sign-extended into unrelated characters
    char ch = (char) (bytes[index] & 0xFF);
    int nibble = Character.digit(ch, 16);
    Preconditions.checkArgument(
        nibble >= 0, "Invalid UUID string: not a hex digit at position %s", index);
    accumulated = (accumulated << 4) | nibble;
    index += 1;
  }

  return accumulated;
}

// Writes the lowest `digits` nibbles of value as lowercase hex ASCII into
// out[offset, offset + digits), most significant nibble first.
private static void formatHex(byte[] out, int offset, long value, int digits) {
  long remaining = value;
  int index = offset + digits;
  // fill from the last position backwards, consuming the least significant nibble each step
  while (index > offset) {
    index -= 1;
    out[index] = HEX_DIGITS[(int) (remaining & 0xF)];
    remaining >>>= 4;
  }
}

/**
* Generate a RFC 9562 UUIDv7.
*
Expand Down
123 changes: 123 additions & 0 deletions api/src/test/java/org/apache/iceberg/util/TestUUIDUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
package org.apache.iceberg.util;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.UUID;
import org.junit.jupiter.api.Test;

Expand All @@ -31,4 +35,123 @@ public void uuidV7HasVersionAndVariant() {
assertThat(uuid.version()).isEqualTo(7);
assertThat(uuid.variant()).isEqualTo(2);
}

@Test
public void convertStringBytesMatchesUuidConversion() {
  // the direct ASCII parser must agree with both UUID-based conversions on random values
  int remaining = 100;
  while (remaining > 0) {
    UUID expected = UUID.randomUUID();
    byte[] ascii = expected.toString().getBytes(StandardCharsets.US_ASCII);

    byte[] parsed = UUIDUtil.convertToByteBuffer(ascii, null).array();
    byte[] viaUuid = UUIDUtil.convertToByteBuffer(expected, null).array();

    assertThat(parsed).containsExactly(viaUuid);
    assertThat(parsed).containsExactly(UUIDUtil.convert(expected));
    remaining -= 1;
  }
}

@Test
public void convertStringBytesHandlesEdgeValues() {
  // all-zero, all-ones, and a value that mixes decimal and letter digits
  String[] edgeValues = {
    "00000000-0000-0000-0000-000000000000",
    "ffffffff-ffff-ffff-ffff-ffffffffffff",
    "12345678-90ab-cdef-1234-567890abcdef"
  };

  for (int i = 0; i < edgeValues.length; i += 1) {
    String text = edgeValues[i];
    UUID expected = UUID.fromString(text);
    byte[] ascii = text.getBytes(StandardCharsets.US_ASCII);

    byte[] parsed = UUIDUtil.convertToByteBuffer(ascii, null).array();
    assertThat(parsed).containsExactly(UUIDUtil.convert(expected));
  }
}

@Test
public void convertStringBytesAcceptsUppercaseHex() {
  // a fixed value containing every letter digit a-f: a random UUID is not reproducible and is
  // not guaranteed to contain the digits whose case handling is being tested
  String lower = "12345678-90ab-cdef-1234-567890abcdef";
  UUID uuid = UUID.fromString(lower);
  byte[] upper = lower.toUpperCase(Locale.ROOT).getBytes(StandardCharsets.US_ASCII);

  assertThat(UUIDUtil.convertToByteBuffer(upper, null).array())
      .containsExactly(UUIDUtil.convert(uuid));

  // mixed case within a single value must parse to the same bytes
  byte[] mixed = "12345678-90Ab-cDeF-1234-567890aBcDeF".getBytes(StandardCharsets.US_ASCII);

  assertThat(UUIDUtil.convertToByteBuffer(mixed, null).array())
      .containsExactly(UUIDUtil.convert(uuid));
}

@Test
public void convertStringBytesReusesBuffer() {
  UUID expected = UUID.randomUUID();
  byte[] ascii = expected.toString().getBytes(StandardCharsets.US_ASCII);
  ByteBuffer supplied = ByteBuffer.allocate(16);

  ByteBuffer returned = UUIDUtil.convertToByteBuffer(ascii, supplied);

  // the supplied buffer itself must come back, filled with the UUID bytes
  assertThat(returned).isSameAs(supplied);
  byte[] written = returned.array();
  assertThat(written).containsExactly(UUIDUtil.convert(expected));
}

@Test
public void convertToStringBytesMatchesUuidToString() {
  // the rendered ASCII bytes must spell exactly what UUID#toString produces
  int remaining = 100;
  while (remaining > 0) {
    UUID expected = UUID.randomUUID();
    ByteBuffer binary = UUIDUtil.convertToByteBuffer(expected, null);

    byte[] rendered = UUIDUtil.convertToStringBytes(binary, null);
    String text = new String(rendered, StandardCharsets.US_ASCII);

    assertThat(text).isEqualTo(expected.toString());
    remaining -= 1;
  }
}

@Test
public void convertToStringBytesIsRoundTrippable() {
  UUID original = UUID.randomUUID();

  // binary -> ASCII string bytes -> binary must reproduce the original UUID
  ByteBuffer binary = UUIDUtil.convertToByteBuffer(original, null);
  byte[] ascii = UUIDUtil.convertToStringBytes(binary, null);
  ByteBuffer reparsed = UUIDUtil.convertToByteBuffer(ascii, null);

  UUID roundTripped = UUIDUtil.convert(reparsed.array());
  assertThat(roundTripped).isEqualTo(original);
}

@Test
public void convertToStringBytesReusesBuffer() {
  UUID expected = UUID.randomUUID();
  byte[] supplied = new byte[36];
  ByteBuffer binary = UUIDUtil.convertToByteBuffer(expected, null);

  byte[] returned = UUIDUtil.convertToStringBytes(binary, supplied);

  // the supplied array itself must come back, holding the canonical string
  assertThat(returned).isSameAs(supplied);
  String text = new String(returned, StandardCharsets.US_ASCII);
  assertThat(text).isEqualTo(expected.toString());
}

@Test
public void convertToByteBufferRejectsMalformedInput() {
  // wrong length
  byte[] tooShort = "too-short".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(tooShort, null))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 ASCII bytes");

  // valid length, but the dash positions hold non-dash characters
  byte[] noDashes =
      "000000000000000000000000000000000000".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(noDashes, null))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected '-'");

  // valid layout, but a non-hex character in a hex group
  byte[] nonHex =
      "zzzzzzzz-0000-0000-0000-000000000000".getBytes(StandardCharsets.US_ASCII);
  assertThatThrownBy(() -> UUIDUtil.convertToByteBuffer(nonHex, null))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("not a hex digit");
}

@Test
public void convertToStringBytesRejectsWrongReuseLength() {
  // a reuse array must be exactly 36 bytes; 16 is the binary size, not the string size
  byte[] wrongSize = new byte[16];
  ByteBuffer binary = UUIDUtil.convertToByteBuffer(UUID.randomUUID(), null);

  assertThatThrownBy(() -> UUIDUtil.convertToStringBytes(binary, wrongSize))
      .isInstanceOf(IllegalArgumentException.class)
      .hasMessageContaining("expected 36 bytes");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) {
BytesColumnVector bytesVector = (BytesColumnVector) vector;
ByteBuffer buffer =
ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]);
return UTF8String.fromString(UUIDUtil.convert(buffer).toString());
// a fresh array is required because UTF8String.fromBytes wraps it without copying
return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null);
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,9 @@ private static class UUIDReader extends PrimitiveReader<UTF8String> {
@Override
@SuppressWarnings("ByteBufferBackingArray")
public UTF8String read(UTF8String ignored) {
return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString());
// a fresh array is required because UTF8String.fromBytes wraps it without copying
return UTF8String.fromBytes(
UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.IntStream;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
Expand Down Expand Up @@ -436,8 +435,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException {

decoder.readFixed(buffer.array(), 0, 16);

return UTF8String.fromString(UUIDUtil.convert(buffer).toString());
// a fresh array is required because UTF8String.fromBytes wraps it without copying
return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.UUID;
import org.apache.avro.io.Encoder;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.ValueWriter;
Expand Down Expand Up @@ -101,10 +100,8 @@ private UUIDWriter() {}
@Override
@SuppressWarnings("ByteBufferBackingArray")
public void write(UTF8String s, Encoder encoder) throws IOException {
// TODO: direct conversion from string to byte buffer
UUID uuid = UUID.fromString(s.toString());
// calling array() is safe because the buffer is always allocated by the thread-local
encoder.writeFixed(UUIDUtil.convertToByteBuffer(uuid, BUFFER.get()).array());
encoder.writeFixed(UUIDUtil.convertToByteBuffer(s.getBytes(), BUFFER.get()).array());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ public UTF8String nonNullRead(ColumnVector vector, int row) {
BytesColumnVector bytesVector = (BytesColumnVector) vector;
ByteBuffer buffer =
ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]);
return UTF8String.fromString(UUIDUtil.convert(buffer).toString());
// a fresh array is required because UTF8String.fromBytes wraps it without copying
return UTF8String.fromBytes(UUIDUtil.convertToStringBytes(buffer, null));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.orc.OrcValueWriter;
Expand Down Expand Up @@ -88,7 +87,7 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) {
// ((BytesColumnVector) output).setRef(..) just stores a reference to the passed byte[], so
// can't use a ThreadLocal ByteBuffer here like in other places because subsequent writes
// would then overwrite previous values
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(UUID.fromString(data.toString()));
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(data.getBytes(), null);
((BytesColumnVector) output).setRef(rowId, buffer.array(), 0, buffer.array().length);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,9 @@ private static class UUIDReader extends PrimitiveReader<UTF8String> {
@Override
@SuppressWarnings("ByteBufferBackingArray")
public UTF8String read(UTF8String ignored) {
return UTF8String.fromString(UUIDUtil.convert(column.nextBinary().toByteBuffer()).toString());
// a fresh array is required because UTF8String.fromBytes wraps it without copying
return UTF8String.fromBytes(
UUIDUtil.convertToStringBytes(column.nextBinary().toByteBuffer(), null));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.IntStream;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry;
Expand Down Expand Up @@ -436,8 +435,7 @@ private UUIDWriter(ColumnDescriptor desc) {

@Override
public void write(int repetitionLevel, UTF8String string) {
UUID uuid = UUID.fromString(string.toString());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(uuid, BUFFER.get());
ByteBuffer buffer = UUIDUtil.convertToByteBuffer(string.getBytes(), BUFFER.get());
column.writeBinary(repetitionLevel, Binary.fromReusedByteBuffer(buffer));
}
}
Expand Down
Loading