Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
Expand Down Expand Up @@ -207,10 +208,54 @@ private Expression bindLiteralOperation(BoundTerm<T> boundTerm) {
}
}

// TODO: translate truncate(col) == value to startsWith(value)
return bindLiteralPredicate(boundTerm, lit);
}

/**
 * Builds the bound predicate for a literal comparison, rewriting string-truncate equality when an
 * exactly-equivalent predicate on the source column exists.
 *
 * <p>For EQ/NOT_EQ on a {@link BoundTransform} term with a {@link CharSequence} literal, the
 * rewrite kind is looked up via {@link Transforms#stringTruncateRewrite}. A null result means the
 * term is not a string truncate transform, and the predicate is bound unchanged.
 *
 * @param boundTerm the bound term on the left-hand side of the comparison
 * @param lit the literal to compare against
 * @return the rewritten expression, or a {@link BoundLiteralPredicate} over {@code boundTerm}
 */
private Expression bindLiteralPredicate(BoundTerm<T> boundTerm, Literal<T> lit) {
  // Rewrite EQ/NOT_EQ on a string truncate transform term to an exactly-equivalent predicate on
  // the source column so metrics, dictionary, and partition pruning can use the column directly.
  boolean isEquality = op() == Operation.EQ || op() == Operation.NOT_EQ;
  if (isEquality && boundTerm instanceof BoundTransform && lit.value() instanceof CharSequence) {
    BoundTransform<?, ?> transformTerm = (BoundTransform<?, ?>) boundTerm;
    CharSequence value = (CharSequence) lit.value();

    // String truncation width is defined in Unicode code points, not UTF-16 code units, so the
    // literal must be measured the same way. CharSequence.length() over-counts literals that
    // contain supplementary characters (surrogate pairs), which would turn a satisfiable
    // equality into alwaysFalse or an exact match into a prefix match.
    int literalLength = Character.codePointCount(value, 0, value.length());

    Transforms.StringTruncateRewrite rewrite =
        Transforms.stringTruncateRewrite(transformTerm, literalLength);
    if (rewrite != null) {
      return rewriteStringTruncate(transformTerm, lit, rewrite);
    }
  }

  return new BoundLiteralPredicate<>(op(), boundTerm, lit);
}

/**
 * Replaces an EQ/NOT_EQ predicate over a string {@code truncate} term with the matching predicate
 * over the term's untransformed source reference.
 *
 * <p>With {@code W} the truncate width and {@code v} the literal, {@code truncate[W](col) == v}
 * maps as follows:
 *
 * <ul>
 *   <li>{@code NONE} ({@code len(v) > W}): never matches, so EQ becomes {@code alwaysFalse}
 *   <li>{@code STARTS_WITH} ({@code len(v) == W}): {@code col STARTS_WITH v}
 *   <li>{@code EXACT} ({@code len(v) < W}): {@code col == v}
 * </ul>
 *
 * <p>NOT_EQ produces the negated form of each case.
 *
 * @param transformTerm the bound truncate term whose source reference is the new predicate term
 * @param lit the literal carried over unchanged to the rewritten predicate
 * @param rewrite the rewrite kind selected for this literal and width
 * @return the rewritten expression
 * @throws IllegalStateException if {@code rewrite} is not a known rewrite kind
 */
@SuppressWarnings("unchecked")
private Expression rewriteStringTruncate(
    BoundTransform<?, ?> transformTerm,
    Literal<T> lit,
    Transforms.StringTruncateRewrite rewrite) {
  boolean equality = op() == Operation.EQ;
  BoundTerm<T> source = (BoundTerm<T>) transformTerm.ref();

  Operation rewrittenOp;
  switch (rewrite) {
    case NONE:
      // no source value can truncate to a literal longer than the width
      if (equality) {
        return Expressions.alwaysFalse();
      }
      return Expressions.alwaysTrue();
    case STARTS_WITH:
      rewrittenOp = equality ? Operation.STARTS_WITH : Operation.NOT_STARTS_WITH;
      break;
    case EXACT:
      rewrittenOp = equality ? Operation.EQ : Operation.NOT_EQ;
      break;
    default:
      throw new IllegalStateException("Unexpected string truncate rewrite: " + rewrite);
  }

  return new BoundLiteralPredicate<>(rewrittenOp, source, lit);
}

private Expression bindInOperation(BoundTerm<T> boundTerm) {
List<Literal<T>> convertedLiterals =
Lists.newArrayList(
Expand Down
41 changes: 41 additions & 0 deletions api/src/main/java/org/apache/iceberg/transforms/Transforms.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.regex.Pattern;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.BoundTransform;
import org.apache.iceberg.types.Type;

/**
Expand Down Expand Up @@ -280,4 +281,44 @@ public static <T> Transform<T, T> truncate(int width) {
public static <T> Transform<T, Void> alwaysNull() {
// Delegates to VoidTransform.get(); the returned transform's result type is Void.
return VoidTransform.get();
}

/**
* Describes how an equality predicate on a string {@code truncate} transform term can be
* rewritten to an exactly-equivalent predicate on the untransformed source column.
*
* <ul>
* <li>{@link #NONE}: the literal is longer than the truncate width, so {@code truncate(col) ==
* v} can never match (and {@code != v} always matches)
* <li>{@link #STARTS_WITH}: the literal length equals the truncate width, so {@code
* truncate(col) == v} is equivalent to {@code col STARTS_WITH v}
* <li>{@link #EXACT}: the literal is shorter than the truncate width, so {@code truncate(col)
* == v} is equivalent to {@code col == v}
* </ul>
*/
public enum StringTruncateRewrite {
/** Literal longer than the truncate width: equality on the truncated value can never match. */
NONE,
/** Literal length equal to the truncate width: equality is a prefix match on the source. */
STARTS_WITH,
/** Literal shorter than the truncate width: equality is an exact match on the source. */
EXACT
}

/**
 * Determines which source-column predicate is exactly equivalent to an equality predicate on a
 * string {@code truncate} transform term.
 *
 * <p>Predicate binding calls this so that metrics, dictionary, and partition pruning can work on
 * the source column directly rather than on an opaque transform term. A null result tells the
 * caller to leave the predicate unchanged.
 *
 * @param term a bound transform term, e.g. {@code truncate[W](col)}
 * @param literalLength the length of the equality literal, in the same unit as the truncate
 *     width it is compared against
 * @return the rewrite kind, or null if {@code term} is not a truncate transform producing a
 *     string
 */
public static StringTruncateRewrite stringTruncateRewrite(
    BoundTransform<?, ?> term, int literalLength) {
  // only truncate transforms qualify
  if (!(term.transform() instanceof Truncate)) {
    return null;
  }

  // and only when the term's type is string
  if (term.type().typeId() != Type.TypeID.STRING) {
    return null;
  }

  Truncate<?> truncate = (Truncate<?>) term.transform();
  return Truncate.lengthRewrite(literalLength, truncate.width());
}
}
88 changes: 66 additions & 22 deletions api/src/main/java/org/apache/iceberg/transforms/Truncate.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,25 @@ public Integer width() {
return width;
}

/**
 * Maps the relation between a literal's length and the truncate width to a rewrite kind.
 *
 * <p>Keeping the {@code < width / == width / > width} decision in one place lets {@link
 * TruncateString#project} / {@link TruncateString#projectStrict} (projecting a {@code
 * startsWith} predicate onto a truncate partition) and predicate binding (rewriting {@code
 * truncate(col) == v} to an equivalent predicate on the source column) share the same
 * classification.
 *
 * @param literalLength the length of the literal
 * @param width the truncate width
 * @return {@code NONE} when the literal is longer than the width, {@code STARTS_WITH} when the
 *     lengths are equal, and {@code EXACT} when the literal is shorter
 */
static Transforms.StringTruncateRewrite lengthRewrite(int literalLength, int width) {
  if (literalLength > width) {
    return Transforms.StringTruncateRewrite.NONE;
  }

  return literalLength == width
      ? Transforms.StringTruncateRewrite.STARTS_WITH
      : Transforms.StringTruncateRewrite.EXACT;
}

@Override
public T apply(T value) {
throw new UnsupportedOperationException(
Expand Down Expand Up @@ -328,22 +347,34 @@ public UnboundPredicate<CharSequence> project(
BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate();
switch (pred.op()) {
case STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.equal(name, pred.literal().value());
switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) {
case EXACT:
return Expressions.predicate(pred.op(), name, pred.literal().value());
case STARTS_WITH:
return Expressions.equal(name, pred.literal().value());
default:
return ProjectionUtil.truncateArray(name, pred, this);
}

return ProjectionUtil.truncateArray(name, pred, this);

case NOT_STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.notEqual(name, pred.literal().value());
switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) {
case EXACT:
return Expressions.predicate(pred.op(), name, pred.literal().value());
case STARTS_WITH:
return Expressions.notEqual(name, pred.literal().value());
default:
return null;
}

return null;
case EQ:
case NOT_EQ:
// when the literal is shorter than the truncate width, truncate(col) == v is
// equivalent to col == v, so the predicate projects exactly onto the partition value
if (Truncate.lengthRewrite(pred.literal().value().length(), width())
== Transforms.StringTruncateRewrite.EXACT) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
}
return ProjectionUtil.truncateArray(name, pred, this);

default:
return ProjectionUtil.truncateArray(name, pred, this);
Expand All @@ -367,22 +398,35 @@ public UnboundPredicate<CharSequence> projectStrict(
BoundLiteralPredicate<CharSequence> pred = predicate.asLiteralPredicate();
switch (pred.op()) {
case STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.equal(name, pred.literal().value());
switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) {
case EXACT:
return Expressions.predicate(pred.op(), name, pred.literal().value());
case STARTS_WITH:
return Expressions.equal(name, pred.literal().value());
default:
return null;
}

return null;

case NOT_STARTS_WITH:
if (pred.literal().value().length() < width()) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
} else if (pred.literal().value().length() == width()) {
return Expressions.notEqual(name, pred.literal().value());
switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) {
case EXACT:
return Expressions.predicate(pred.op(), name, pred.literal().value());
case STARTS_WITH:
return Expressions.notEqual(name, pred.literal().value());
default:
return Expressions.predicate(
pred.op(), name, apply(pred.literal().value()).toString());
}

return Expressions.predicate(pred.op(), name, apply(pred.literal().value()).toString());
case EQ:
case NOT_EQ:
// when the literal is shorter than the truncate width, truncate(col) == v is
// equivalent to col == v, so the predicate projects exactly onto the partition value
if (Truncate.lengthRewrite(pred.literal().value().length(), width())
== Transforms.StringTruncateRewrite.EXACT) {
return Expressions.predicate(pred.op(), name, pred.literal().value());
}
return ProjectionUtil.truncateArrayStrict(name, pred, this);

default:
return ProjectionUtil.truncateArrayStrict(name, pred, this);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,78 @@ public void testPredicateBindingForStringPrefixComparisons() {
}
}

@Test
public void testStringTruncateBindingRewrite() {
  StructType schema =
      StructType.of(
          required(20, "s", Types.StringType.get()), required(21, "i", Types.IntegerType.get()));

  // EQ with a literal shorter than the width binds to s == literal
  BoundPredicate<?> eqShorter =
      assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "abc").bind(schema));
  assertThat(eqShorter.op()).isEqualTo(EQ);
  assertThat(eqShorter.term()).isInstanceOf(BoundReference.class);
  assertThat(eqShorter.ref().fieldId()).isEqualTo(20);
  assertThat(String.valueOf(eqShorter.asLiteralPredicate().literal().value())).isEqualTo("abc");

  // EQ with a literal exactly as long as the width binds to s startsWith literal
  BoundPredicate<?> eqSameLength =
      assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "abcd").bind(schema));
  assertThat(eqSameLength.op()).isEqualTo(STARTS_WITH);
  assertThat(eqSameLength.term()).isInstanceOf(BoundReference.class);
  assertThat(String.valueOf(eqSameLength.asLiteralPredicate().literal().value()))
      .isEqualTo("abcd");

  // EQ with a literal longer than the width cannot be satisfied
  assertThat(Expressions.equal(Expressions.truncate("s", 4), "abcde").bind(schema))
      .as("equal beyond truncate width should be alwaysFalse")
      .isEqualTo(Expressions.alwaysFalse());

  // NOT_EQ mirrors each EQ case with the negated operation
  BoundPredicate<?> notEqShorter =
      assertAndUnwrap(Expressions.notEqual(Expressions.truncate("s", 4), "abc").bind(schema));
  assertThat(notEqShorter.op()).isEqualTo(NOT_EQ);
  assertThat(notEqShorter.term()).isInstanceOf(BoundReference.class);

  BoundPredicate<?> notEqSameLength =
      assertAndUnwrap(Expressions.notEqual(Expressions.truncate("s", 4), "abcd").bind(schema));
  assertThat(notEqSameLength.op()).isEqualTo(NOT_STARTS_WITH);
  assertThat(notEqSameLength.term()).isInstanceOf(BoundReference.class);

  assertThat(Expressions.notEqual(Expressions.truncate("s", 4), "abcde").bind(schema))
      .as("notEqual beyond truncate width should be alwaysTrue")
      .isEqualTo(Expressions.alwaysTrue());

  // the empty literal is shorter than any positive width, so it binds to s == ""
  BoundPredicate<?> eqEmpty =
      assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "").bind(schema));
  assertThat(eqEmpty.op()).isEqualTo(EQ);
  assertThat(eqEmpty.term()).isInstanceOf(BoundReference.class);
  assertThat(String.valueOf(eqEmpty.asLiteralPredicate().literal().value())).isEmpty();

  // truncate on an integer column keeps its transform term
  BoundPredicate<?> eqIntTruncate =
      assertAndUnwrap(Expressions.equal(Expressions.truncate("i", 10), 5).bind(schema));
  assertThat(eqIntTruncate.op()).isEqualTo(EQ);
  assertThat(eqIntTruncate.term()).isInstanceOf(BoundTransform.class);

  // a bucket transform keeps its transform term
  BoundPredicate<?> eqBucket =
      assertAndUnwrap(Expressions.equal(Expressions.bucket("s", 16), 3).bind(schema));
  assertThat(eqBucket.op()).isEqualTo(EQ);
  assertThat(eqBucket.term()).isInstanceOf(BoundTransform.class);

  // operations other than EQ/NOT_EQ keep their transform term
  BoundPredicate<?> lessThan =
      assertAndUnwrap(Expressions.lessThan(Expressions.truncate("s", 4), "abcd").bind(schema));
  assertThat(lessThan.op()).isEqualTo(LT);
  assertThat(lessThan.term()).isInstanceOf(BoundTransform.class);

  BoundPredicate<?> prefixOnTruncate =
      assertAndUnwrap(Expressions.startsWith(Expressions.truncate("s", 4), "abcd").bind(schema));
  assertThat(prefixOnTruncate.op()).isEqualTo(STARTS_WITH);
  assertThat(prefixOnTruncate.term()).isInstanceOf(BoundTransform.class);
}

@Test
public void testLiteralConversion() {
StructType struct = StructType.of(required(15, "d", Types.DecimalType.of(9, 2)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
package org.apache.iceberg.transforms;

import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound;
import static org.apache.iceberg.expressions.Expressions.equal;
import static org.apache.iceberg.expressions.Expressions.startsWith;
import static org.apache.iceberg.expressions.Expressions.truncate;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.assertj.core.api.Assertions.assertThat;

Expand All @@ -28,6 +30,7 @@
import org.apache.iceberg.TestHelpers;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.BoundReference;
import org.apache.iceberg.expressions.Evaluator;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.False;
Expand Down Expand Up @@ -75,6 +78,44 @@ public void testTruncateString() {
.isTrue();
}

@Test
@SuppressWarnings("unchecked")
public void testTruncateEqualityRewrite() {
  // literal as long as the width: truncate(col, 4) == "abab" binds to col startsWith "abab"
  Expression sameLengthFilter = equal(truncate(COLUMN, 4), "abab");
  BoundPredicate<String> boundPrefix =
      (BoundPredicate<String>) Binder.bind(SCHEMA.asStruct(), sameLengthFilter, false);
  assertThat(boundPrefix.op()).isEqualTo(Expression.Operation.STARTS_WITH);
  assertThat(boundPrefix.term()).isInstanceOf(BoundReference.class);

  Evaluator sameLengthEval = new Evaluator(SCHEMA.asStruct(), sameLengthFilter);
  assertThat(sameLengthEval.eval(TestHelpers.Row.of("ababXYZ")))
      .as("truncate(ababXYZ,4) == abab")
      .isTrue();
  assertThat(sameLengthEval.eval(TestHelpers.Row.of("abab")))
      .as("truncate(abab,4) == abab")
      .isTrue();
  assertThat(sameLengthEval.eval(TestHelpers.Row.of("abXX")))
      .as("truncate(abXX,4) != abab")
      .isFalse();

  // literal shorter than the width: truncate(col, 6) == "abc" binds to col == "abc", an exact
  // match that must reject longer strings a startsWith would accept
  Expression shorterFilter = equal(truncate(COLUMN, 6), "abc");
  BoundPredicate<String> boundExact =
      (BoundPredicate<String>) Binder.bind(SCHEMA.asStruct(), shorterFilter, false);
  assertThat(boundExact.op()).isEqualTo(Expression.Operation.EQ);
  assertThat(boundExact.term()).isInstanceOf(BoundReference.class);

  Evaluator shorterEval = new Evaluator(SCHEMA.asStruct(), shorterFilter);
  assertThat(shorterEval.eval(TestHelpers.Row.of("abc"))).as("truncate(abc,6) == abc").isTrue();
  assertThat(shorterEval.eval(TestHelpers.Row.of("abcd")))
      .as("truncate(abcd,6) != abc (exact, not prefix)")
      .isFalse();

  // literal longer than the width: truncate(col, 2) == "abcde" has no possible match
  Expression longerFilter = equal(truncate(COLUMN, 2), "abcde");
  Expression boundLonger = Binder.bind(SCHEMA.asStruct(), longerFilter, false);
  assertThat(boundLonger).isInstanceOf(False.class);
}

private void assertProjectionInclusive(
PartitionSpec spec,
UnboundPredicate<?> filter,
Expand Down
Loading