diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java index 75ca9d5835bc..81c641ffdc62 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java @@ -26,6 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -207,10 +208,54 @@ private Expression bindLiteralOperation(BoundTerm boundTerm) { } } - // TODO: translate truncate(col) == value to startsWith(value) + return bindLiteralPredicate(boundTerm, lit); + } + + private Expression bindLiteralPredicate(BoundTerm boundTerm, Literal lit) { + // Rewrite EQ/NOT_EQ on a string truncate transform term to an exactly-equivalent predicate on + // the source column so metrics, dictionary, and partition pruning can use the column directly. + if ((op() == Operation.EQ || op() == Operation.NOT_EQ) + && boundTerm instanceof BoundTransform + && lit.value() instanceof CharSequence) { + BoundTransform transformTerm = (BoundTransform) boundTerm; + Transforms.StringTruncateRewrite rewrite = + Transforms.stringTruncateRewrite(transformTerm, ((CharSequence) lit.value()).length()); + if (rewrite != null) { + return rewriteStringTruncate(transformTerm, lit, rewrite); + } + } + return new BoundLiteralPredicate<>(op(), boundTerm, lit); } + /** + * Rewrites an EQ/NOT_EQ predicate whose term is a string {@code truncate} transform to an + * exactly-equivalent predicate on the untransformed source column. + * + *

For {@code truncate[W](col) == v}: {@code len(v) > W} is unsatisfiable; {@code len(v) == W} + * is equivalent to {@code col STARTS_WITH v}; {@code len(v) < W} is equivalent to {@code col == + * v}. NOT_EQ is the exact negation. + */ + @SuppressWarnings("unchecked") + private Expression rewriteStringTruncate( + BoundTransform transformTerm, + Literal lit, + Transforms.StringTruncateRewrite rewrite) { + boolean isEq = op() == Operation.EQ; + BoundTerm sourceRef = (BoundTerm) transformTerm.ref(); + switch (rewrite) { + case NONE: + return isEq ? Expressions.alwaysFalse() : Expressions.alwaysTrue(); + case STARTS_WITH: + return new BoundLiteralPredicate<>( + isEq ? Operation.STARTS_WITH : Operation.NOT_STARTS_WITH, sourceRef, lit); + case EXACT: + return new BoundLiteralPredicate<>(isEq ? Operation.EQ : Operation.NOT_EQ, sourceRef, lit); + default: + throw new IllegalStateException("Unexpected string truncate rewrite: " + rewrite); + } + } + private Expression bindInOperation(BoundTerm boundTerm) { List> convertedLiterals = Lists.newArrayList( diff --git a/api/src/main/java/org/apache/iceberg/transforms/Transforms.java b/api/src/main/java/org/apache/iceberg/transforms/Transforms.java index a3a6a3f6321d..3eab92142d32 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Transforms.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Transforms.java @@ -23,6 +23,7 @@ import java.util.regex.Pattern; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.BoundTransform; import org.apache.iceberg.types.Type; /** @@ -280,4 +281,44 @@ public static Transform truncate(int width) { public static Transform alwaysNull() { return VoidTransform.get(); } + + /** + * Describes how an equality predicate on a string {@code truncate} transform term can be + * rewritten to an exactly-equivalent predicate on the untransformed source column. + * + *

<ul> + *
<li>{@link #NONE}: the literal is longer than the truncate width, so {@code truncate(col) == + * v} can never match (and {@code != v} always matches) + *
<li>{@link #STARTS_WITH}: the literal length equals the truncate width, so {@code + * truncate(col) == v} is equivalent to {@code col STARTS_WITH v} + *
<li>{@link #EXACT}: the literal is shorter than the truncate width, so {@code truncate(col) + * == v} is equivalent to {@code col == v} + * </ul>
+ */ + public enum StringTruncateRewrite { + NONE, + STARTS_WITH, + EXACT + } + + /** + * Returns how an equality predicate on a string {@code truncate} transform term can be rewritten + * to an exactly-equivalent predicate on the untransformed source column, or null if the term is + * not a string truncate transform (in which case the predicate must be left unchanged). + * + *

This is used during predicate binding so that metrics, dictionary, and partition pruning can + * use the source column directly instead of an opaque transform term. + * + * @param term a bound transform term, e.g. {@code truncate[W](col)} + * @param literalLength the length of the equality literal + * @return the rewrite kind, or null if {@code term} is not a string truncate transform + */ + public static StringTruncateRewrite stringTruncateRewrite( + BoundTransform term, int literalLength) { + if (!(term.transform() instanceof Truncate) || term.type().typeId() != Type.TypeID.STRING) { + return null; + } + + return Truncate.lengthRewrite(literalLength, ((Truncate) term.transform()).width()); + } } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java index a111e4ca394b..619a8723b0fa 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java @@ -78,6 +78,25 @@ public Integer width() { return width; } + /** + * Classifies a string truncate predicate by the literal length relative to the truncate width. + * + *

This is the single source of truth for the {@code < width / == width / > width} decision. It + * is shared by {@link TruncateString#project} / {@link TruncateString#projectStrict} (projecting + * a {@code startsWith} predicate onto a truncate partition) and by predicate binding (rewriting + * {@code truncate(col) == v} to an equivalent predicate on the source column) so the two can + * never diverge. + */ + static Transforms.StringTruncateRewrite lengthRewrite(int literalLength, int width) { + if (literalLength < width) { + return Transforms.StringTruncateRewrite.EXACT; + } else if (literalLength == width) { + return Transforms.StringTruncateRewrite.STARTS_WITH; + } else { + return Transforms.StringTruncateRewrite.NONE; + } + } + @Override public T apply(T value) { throw new UnsupportedOperationException( @@ -328,22 +347,34 @@ public UnboundPredicate project( BoundLiteralPredicate pred = predicate.asLiteralPredicate(); switch (pred.op()) { case STARTS_WITH: - if (pred.literal().value().length() < width()) { - return Expressions.predicate(pred.op(), name, pred.literal().value()); - } else if (pred.literal().value().length() == width()) { - return Expressions.equal(name, pred.literal().value()); + switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) { + case EXACT: + return Expressions.predicate(pred.op(), name, pred.literal().value()); + case STARTS_WITH: + return Expressions.equal(name, pred.literal().value()); + default: + return ProjectionUtil.truncateArray(name, pred, this); } - return ProjectionUtil.truncateArray(name, pred, this); - case NOT_STARTS_WITH: - if (pred.literal().value().length() < width()) { - return Expressions.predicate(pred.op(), name, pred.literal().value()); - } else if (pred.literal().value().length() == width()) { - return Expressions.notEqual(name, pred.literal().value()); + switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) { + case EXACT: + return Expressions.predicate(pred.op(), name, 
pred.literal().value()); + case STARTS_WITH: + return Expressions.notEqual(name, pred.literal().value()); + default: + return null; } - return null; + case EQ: + case NOT_EQ: + // when the literal is shorter than the truncate width, truncate(col) == v is + // equivalent to col == v, so the predicate projects exactly onto the partition value + if (Truncate.lengthRewrite(pred.literal().value().length(), width()) + == Transforms.StringTruncateRewrite.EXACT) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } + return ProjectionUtil.truncateArray(name, pred, this); default: return ProjectionUtil.truncateArray(name, pred, this); @@ -367,22 +398,35 @@ public UnboundPredicate projectStrict( BoundLiteralPredicate pred = predicate.asLiteralPredicate(); switch (pred.op()) { case STARTS_WITH: - if (pred.literal().value().length() < width()) { - return Expressions.predicate(pred.op(), name, pred.literal().value()); - } else if (pred.literal().value().length() == width()) { - return Expressions.equal(name, pred.literal().value()); + switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) { + case EXACT: + return Expressions.predicate(pred.op(), name, pred.literal().value()); + case STARTS_WITH: + return Expressions.equal(name, pred.literal().value()); + default: + return null; } - return null; - case NOT_STARTS_WITH: - if (pred.literal().value().length() < width()) { - return Expressions.predicate(pred.op(), name, pred.literal().value()); - } else if (pred.literal().value().length() == width()) { - return Expressions.notEqual(name, pred.literal().value()); + switch (Truncate.lengthRewrite(pred.literal().value().length(), width())) { + case EXACT: + return Expressions.predicate(pred.op(), name, pred.literal().value()); + case STARTS_WITH: + return Expressions.notEqual(name, pred.literal().value()); + default: + return Expressions.predicate( + pred.op(), name, apply(pred.literal().value()).toString()); } - return 
Expressions.predicate(pred.op(), name, apply(pred.literal().value()).toString()); + case EQ: + case NOT_EQ: + // when the literal is shorter than the truncate width, truncate(col) == v is + // equivalent to col == v, so the predicate projects exactly onto the partition value + if (Truncate.lengthRewrite(pred.literal().value().length(), width()) + == Transforms.StringTruncateRewrite.EXACT) { + return Expressions.predicate(pred.op(), name, pred.literal().value()); + } + return ProjectionUtil.truncateArrayStrict(name, pred, this); default: return ProjectionUtil.truncateArrayStrict(name, pred, this); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java index a07c8fd1569d..ee0aa84d072f 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java @@ -122,6 +122,78 @@ public void testPredicateBindingForStringPrefixComparisons() { } } + @Test + public void testStringTruncateBindingRewrite() { + StructType struct = + StructType.of( + required(20, "s", Types.StringType.get()), required(21, "i", Types.IntegerType.get())); + + // len(value) < width: equivalent to s == value + BoundPredicate exact = + assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "abc").bind(struct)); + assertThat(exact.op()).isEqualTo(EQ); + assertThat(exact.term()).isInstanceOf(BoundReference.class); + assertThat(exact.ref().fieldId()).isEqualTo(20); + assertThat(String.valueOf(exact.asLiteralPredicate().literal().value())).isEqualTo("abc"); + + // len(value) == width: equivalent to s startsWith value + BoundPredicate prefix = + assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "abcd").bind(struct)); + assertThat(prefix.op()).isEqualTo(STARTS_WITH); + assertThat(prefix.term()).isInstanceOf(BoundReference.class); + 
assertThat(String.valueOf(prefix.asLiteralPredicate().literal().value())).isEqualTo("abcd"); + + // len(value) > width: truncate(s) == value is unsatisfiable + assertThat(Expressions.equal(Expressions.truncate("s", 4), "abcde").bind(struct)) + .as("equal beyond truncate width should be alwaysFalse") + .isEqualTo(Expressions.alwaysFalse()); + + // NOT_EQ is the exact negation + BoundPredicate notExact = + assertAndUnwrap(Expressions.notEqual(Expressions.truncate("s", 4), "abc").bind(struct)); + assertThat(notExact.op()).isEqualTo(NOT_EQ); + assertThat(notExact.term()).isInstanceOf(BoundReference.class); + + BoundPredicate notPrefix = + assertAndUnwrap(Expressions.notEqual(Expressions.truncate("s", 4), "abcd").bind(struct)); + assertThat(notPrefix.op()).isEqualTo(NOT_STARTS_WITH); + assertThat(notPrefix.term()).isInstanceOf(BoundReference.class); + + assertThat(Expressions.notEqual(Expressions.truncate("s", 4), "abcde").bind(struct)) + .as("notEqual beyond truncate width should be alwaysTrue") + .isEqualTo(Expressions.alwaysTrue()); + + // empty string is shorter than any (positive) width: equivalent to s == "" + BoundPredicate empty = + assertAndUnwrap(Expressions.equal(Expressions.truncate("s", 4), "").bind(struct)); + assertThat(empty.op()).isEqualTo(EQ); + assertThat(empty.term()).isInstanceOf(BoundReference.class); + assertThat(String.valueOf(empty.asLiteralPredicate().literal().value())).isEmpty(); + + // non-string truncate is left unchanged (no startsWith analogue) + BoundPredicate intTruncate = + assertAndUnwrap(Expressions.equal(Expressions.truncate("i", 10), 5).bind(struct)); + assertThat(intTruncate.op()).isEqualTo(EQ); + assertThat(intTruncate.term()).isInstanceOf(BoundTransform.class); + + // non-truncate transform is left unchanged + BoundPredicate bucket = + assertAndUnwrap(Expressions.equal(Expressions.bucket("s", 16), 3).bind(struct)); + assertThat(bucket.op()).isEqualTo(EQ); + assertThat(bucket.term()).isInstanceOf(BoundTransform.class); + + // 
other operators are left unchanged + BoundPredicate lt = + assertAndUnwrap(Expressions.lessThan(Expressions.truncate("s", 4), "abcd").bind(struct)); + assertThat(lt.op()).isEqualTo(LT); + assertThat(lt.term()).isInstanceOf(BoundTransform.class); + + BoundPredicate sw = + assertAndUnwrap(Expressions.startsWith(Expressions.truncate("s", 4), "abcd").bind(struct)); + assertThat(sw.op()).isEqualTo(STARTS_WITH); + assertThat(sw.term()).isInstanceOf(BoundTransform.class); + } + @Test public void testLiteralConversion() { StructType struct = StructType.of(required(15, "d", Types.DecimalType.of(9, 2))); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java index a9a992ea2093..fd99e3861094 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java @@ -19,7 +19,9 @@ package org.apache.iceberg.transforms; import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.expressions.Expressions.truncate; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.assertj.core.api.Assertions.assertThat; @@ -28,6 +30,7 @@ import org.apache.iceberg.TestHelpers; import org.apache.iceberg.expressions.Binder; import org.apache.iceberg.expressions.BoundPredicate; +import org.apache.iceberg.expressions.BoundReference; import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.False; @@ -75,6 +78,44 @@ public void testTruncateString() { .isTrue(); } + @Test + @SuppressWarnings("unchecked") + public void testTruncateEqualityRewrite() { + // len(value) == width: truncate(col, 4) == "abab" is rewritten to col startsWith "abab" + 
BoundPredicate prefix = + (BoundPredicate) + Binder.bind(SCHEMA.asStruct(), equal(truncate(COLUMN, 4), "abab"), false); + assertThat(prefix.op()).isEqualTo(Expression.Operation.STARTS_WITH); + assertThat(prefix.term()).isInstanceOf(BoundReference.class); + + Evaluator prefixEval = new Evaluator(SCHEMA.asStruct(), equal(truncate(COLUMN, 4), "abab")); + assertThat(prefixEval.eval(TestHelpers.Row.of("ababXYZ"))) + .as("truncate(ababXYZ,4) == abab") + .isTrue(); + assertThat(prefixEval.eval(TestHelpers.Row.of("abab"))).as("truncate(abab,4) == abab").isTrue(); + assertThat(prefixEval.eval(TestHelpers.Row.of("abXX"))) + .as("truncate(abXX,4) != abab") + .isFalse(); + + // len(value) < width: truncate(col, 6) == "abc" is rewritten to col == "abc" (exact), + // which must NOT match longer strings the way startsWith would + BoundPredicate exact = + (BoundPredicate) + Binder.bind(SCHEMA.asStruct(), equal(truncate(COLUMN, 6), "abc"), false); + assertThat(exact.op()).isEqualTo(Expression.Operation.EQ); + assertThat(exact.term()).isInstanceOf(BoundReference.class); + + Evaluator exactEval = new Evaluator(SCHEMA.asStruct(), equal(truncate(COLUMN, 6), "abc")); + assertThat(exactEval.eval(TestHelpers.Row.of("abc"))).as("truncate(abc,6) == abc").isTrue(); + assertThat(exactEval.eval(TestHelpers.Row.of("abcd"))) + .as("truncate(abcd,6) != abc (exact, not prefix)") + .isFalse(); + + // len(value) > width: truncate(col, 2) == "abcde" can never match + Expression none = Binder.bind(SCHEMA.asStruct(), equal(truncate(COLUMN, 2), "abcde"), false); + assertThat(none).isInstanceOf(False.class); + } + private void assertProjectionInclusive( PartitionSpec spec, UnboundPredicate filter, diff --git a/core/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluatorWithTransforms.java b/core/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluatorWithTransforms.java index 41a14667eae1..1ae3f5c6729b 100644 --- 
a/core/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluatorWithTransforms.java +++ b/core/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluatorWithTransforms.java @@ -590,6 +590,46 @@ public void testStringNotStartsWith() { .isTrue(); } + @Test + public void testStringTruncateEquality() { + // str column: lower bound "abc", upper bound "abe" + + // len == width -> rewritten to startsWith; "xyz" is outside ["abc", "abe"] so the file is + // skipped. Before the binding rewrite this returned true (BoundTransform term was opaque). + assertThat(shouldRead(equal(truncate("str", 3), "xyz"))) + .as("Should skip: truncate(str,3) == xyz rewritten to startsWith outside bounds") + .isFalse(); + + assertThat(shouldRead(equal(truncate("str", 3), "abc"))) + .as("Should read: startsWith prefix within bounds") + .isTrue(); + + // len > width -> truncate(str,2) == "abcde" is unsatisfiable + assertThat(shouldRead(equal(truncate("str", 2), "abcde"))) + .as("Should skip: equal beyond truncate width is alwaysFalse") + .isFalse(); + + // len < width -> rewritten to exact equality; "ab" sorts below lower bound "abc" + assertThat(shouldRead(equal(truncate("str", 5), "ab"))) + .as("Should skip: exact equality below lower bound") + .isFalse(); + + // len < width -> exact equality; "abd" is within ["abc", "abe"] + assertThat(shouldRead(equal(truncate("str", 5), "abd"))) + .as("Should read: exact equality within bounds") + .isTrue(); + + // NOT_EQ, len > width -> alwaysTrue + assertThat(shouldRead(notEqual(truncate("str", 2), "abcde"))) + .as("Should read: notEqual beyond truncate width is alwaysTrue") + .isTrue(); + + // NOT_EQ, len == width -> notStartsWith; may contain null so cannot be pruned + assertThat(shouldRead(notEqual(truncate("str", 3), "abc"))) + .as("Should read: notStartsWith on column that may contain null") + .isTrue(); + } + @Test public void testIntegerIn() { assertThat(shouldRead(in(day("ts"), INT_MIN_VALUE - 25, INT_MIN_VALUE - 
24))) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 3cb46b309d82..621491587001 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -1005,11 +1005,19 @@ public void testTransformFilter() { assumeThat(format).isEqualTo(FileFormat.PARQUET); boolean shouldRead = - new ParquetMetricsRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value"), true) + new ParquetMetricsRowGroupFilter(SCHEMA, equal(truncate("id", 2), 12345), true) .shouldRead(parquetSchema, rowGroupMetadata); assertThat(shouldRead) - .as("Should read: filter contains non-reference evaluate as True") + .as("Should read: opaque transform term (non-string truncate) evaluates as true") .isTrue(); + + shouldRead = + new ParquetMetricsRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value"), true) + .shouldRead(parquetSchema, rowGroupMetadata); + assertThat(shouldRead) + .as( + "Should not read: string truncate equality is unsatisfiable when the literal is longer than the truncate width") + .isFalse(); } @TestTemplate diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java index 22f8068c0fa3..cf972d0f9268 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java @@ -1267,11 +1267,19 @@ SCHEMA, greaterThanOrEqual("decimal_fixed", BigDecimal.ZERO)) @TestTemplate public void testTransformFilter() { boolean shouldRead = - new ParquetDictionaryRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value")) + new ParquetDictionaryRowGroupFilter(SCHEMA, equal(truncate("id", 2), 12345)) 
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); assertThat(shouldRead) - .as("Should read: filter contains non-reference evaluate as True") + .as("Should read: opaque transform term (non-string truncate) evaluates as true") .isTrue(); + + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + assertThat(shouldRead) + .as( + "Should not read: string truncate equality is unsatisfiable when the literal is longer than the truncate width") + .isFalse(); } @TestTemplate