From 29c791e603d89f7fd7c8f2c2566815547d8655fc Mon Sep 17 00:00:00 2001 From: "liquan.llq" Date: Fri, 15 May 2026 21:31:40 +0800 Subject: [PATCH 1/2] Core: Fix NPE in RemoveOrphanFiles with prefix_listing for root table location(#16350) --- .../apache/iceberg/util/FileSystemWalker.java | 15 ++- .../iceberg/util/TestFileSystemWalker.java | 110 ++++++++++++++++++ 2 files changed, 119 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java b/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java index 9cd4906952cc..479c8e221e12 100644 --- a/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java +++ b/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java @@ -160,18 +160,21 @@ public static void listDirRecursivelyWithHadoop( * @return {@code true} if the path is hidden, {@code false} otherwise */ private static boolean isHiddenPath(String baseDir, Path path, PathFilter pathFilter) { - boolean isHiddenPath = false; Path currentPath = path; - while (currentPath.getParent().toString().contains(baseDir)) { + Path parent = currentPath.getParent(); + // Walk up the path hierarchy while the parent directory is still within baseDir. + // Null-check the parent to avoid NPE when the walk reaches the storage root + // (e.g., an S3 bucket root such as "s3://bucket/"), whose getParent() returns null. + while (parent != null && parent.toString().contains(baseDir)) { if (!pathFilter.accept(currentPath)) { - isHiddenPath = true; - break; + return true; } - currentPath = currentPath.getParent(); + currentPath = parent; + parent = currentPath.getParent(); } - return isHiddenPath; + return false; } /** diff --git a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java index a748a462158c..929fa51d4e7f 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java +++ b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java @@ -19,6 +19,7 @@ package org.apache.iceberg.util; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatCode; import java.io.IOException; import java.nio.file.Files; @@ -33,6 +34,10 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.hadoop.HadoopFileIO; import org.apache.iceberg.io.FileInfo; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SupportsPrefixOperations; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; @@ -242,4 +247,109 @@ private void listFilesRecursively( assertThat(remainingDirs).isEmpty(); } + + @Test + public void testListDirRecursivelyWithFileIOBucketRootBaseDir() { + List entries = + ImmutableList.of( + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/data/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/top.parquet", 1L, 0L)); + SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); + + List foundFiles = Lists.newArrayList(); + Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); + + assertThatCode( + () -> + FileSystemWalker.listDirRecursivelyWithFileIO( + mockIO, "s3://bucket/", null, fileFilter, foundFiles::add)) + .doesNotThrowAnyException(); + + assertThat(foundFiles) + .containsExactlyInAnyOrder( + "s3://bucket/data/a.parquet", "s3://bucket/data/b.parquet", "s3://bucket/top.parquet"); + } + + /** + * Regression test: with a bucket-root base directory, files living under a hidden top-level + * directory (e.g. {@code _temporary}) must still be filtered out, and no NPE should occur. + */ + @Test + public void testListDirRecursivelyWithFileIOBucketRootFiltersHiddenDir() { + List entries = + ImmutableList.of( + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_temporary/attempt_x/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/.staging/c.parquet", 1L, 0L)); + SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); + + List foundFiles = Lists.newArrayList(); + Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); + + assertThatCode( + () -> + FileSystemWalker.listDirRecursivelyWithFileIO( + mockIO, "s3://bucket/", null, fileFilter, foundFiles::add)) + .doesNotThrowAnyException(); + + assertThat(foundFiles).containsExactly("s3://bucket/data/a.parquet"); + } + + /** + * Regression test: when the base directory is passed without a trailing slash (e.g. {@code + * oss://bucket}), the walk should still work correctly without NPE. + */ + @Test + public void testListDirRecursivelyWithFileIOBucketRootNoTrailingSlash() { + List entries = + ImmutableList.of( + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_hidden/b.parquet", 1L, 0L)); + SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); + + List foundFiles = Lists.newArrayList(); + Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); + + assertThatCode( + () -> + FileSystemWalker.listDirRecursivelyWithFileIO( + mockIO, "s3://bucket", null, fileFilter, foundFiles::add)) + .doesNotThrowAnyException(); + + assertThat(foundFiles).containsExactly("s3://bucket/data/a.parquet"); + } + + private static class StaticPrefixFileIO implements SupportsPrefixOperations { + private final Iterable entries; + + StaticPrefixFileIO(Iterable entries) { + this.entries = entries; + } + + @Override + public Iterable listPrefix(String prefix) { + return entries; + } + + @Override + public void deletePrefix(String prefix) { + throw new UnsupportedOperationException(); + } + + @Override + public InputFile newInputFile(String path) { + throw new UnsupportedOperationException(); + } + + @Override + public OutputFile newOutputFile(String path) { + throw new UnsupportedOperationException(); + } + + @Override + public void deleteFile(String path) { + throw new UnsupportedOperationException(); + } + } } From f529f9b1873fb9a5559158e42665d48541317005 Mon Sep 17 00:00:00 2001 From: "liquan.llq" Date: Sat, 16 May 2026 12:55:05 +0800 Subject: [PATCH 2/2] Refactor TestFileSystemWalker as suggested --- .../iceberg/util/TestFileSystemWalker.java | 85 ++++++++----------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java index 929fa51d4e7f..ba03c073c65a 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java +++ b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java @@ -19,7 +19,7 @@ package org.apache.iceberg.util; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatCode; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import java.nio.file.Files; @@ -250,23 +250,12 @@ private void listFilesRecursively( @Test public void testListDirRecursivelyWithFileIOBucketRootBaseDir() { - List entries = - ImmutableList.of( - new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), - new FileInfo("s3://bucket/data/b.parquet", 1L, 0L), - new FileInfo("s3://bucket/top.parquet", 1L, 0L)); - SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); - - List foundFiles = Lists.newArrayList(); - Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); - - assertThatCode( - () -> - FileSystemWalker.listDirRecursivelyWithFileIO( - mockIO, "s3://bucket/", null, fileFilter, foundFiles::add)) - .doesNotThrowAnyException(); - - assertThat(foundFiles) + assertThat( + listWithMockFileIO( + "s3://bucket/", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/data/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/top.parquet", 1L, 0L))) .containsExactlyInAnyOrder( "s3://bucket/data/a.parquet", "s3://bucket/data/b.parquet", "s3://bucket/top.parquet"); } @@ -277,23 +266,13 @@ public void testListDirRecursivelyWithFileIOBucketRootBaseDir() { */ @Test public void testListDirRecursivelyWithFileIOBucketRootFiltersHiddenDir() { - List entries = - ImmutableList.of( - new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), - new FileInfo("s3://bucket/_temporary/attempt_x/b.parquet", 1L, 0L), - new FileInfo("s3://bucket/.staging/c.parquet", 1L, 0L)); - SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); - - List foundFiles = Lists.newArrayList(); - Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); - - assertThatCode( - () -> - FileSystemWalker.listDirRecursivelyWithFileIO( - mockIO, "s3://bucket/", null, fileFilter, foundFiles::add)) - .doesNotThrowAnyException(); - - assertThat(foundFiles).containsExactly("s3://bucket/data/a.parquet"); + assertThat( + listWithMockFileIO( + "s3://bucket/", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_temporary/attempt_x/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/.staging/c.parquet", 1L, 0L))) + .containsExactly("s3://bucket/data/a.parquet"); } /** @@ -302,22 +281,32 @@ public void testListDirRecursivelyWithFileIOBucketRootFiltersHiddenDir() { */ @Test public void testListDirRecursivelyWithFileIOBucketRootNoTrailingSlash() { - List entries = - ImmutableList.of( - new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), - new FileInfo("s3://bucket/_hidden/b.parquet", 1L, 0L)); - SupportsPrefixOperations mockIO = new StaticPrefixFileIO(entries); - - List foundFiles = Lists.newArrayList(); - Predicate fileFilter = fileInfo -> fileInfo.location().endsWith(".parquet"); + assertThat( + listWithMockFileIO( + "s3://bucket", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_hidden/b.parquet", 1L, 0L))) + .containsExactly("s3://bucket/data/a.parquet"); + } - assertThatCode( + @Test + public void testListDirRecursivelyWithFileIONullFileLocation() { + assertThatThrownBy( () -> - FileSystemWalker.listDirRecursivelyWithFileIO( - mockIO, "s3://bucket", null, fileFilter, foundFiles::add)) - .doesNotThrowAnyException(); + listWithMockFileIO( + "s3://bucket/", + new FileInfo(null, 1L, 0L), + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Can not create a Path from a null string"); + } - assertThat(foundFiles).containsExactly("s3://bucket/data/a.parquet"); + private static List listWithMockFileIO(String baseDir, FileInfo... entries) { + SupportsPrefixOperations mockIO = new StaticPrefixFileIO(ImmutableList.copyOf(entries)); + List foundFiles = Lists.newArrayList(); + FileSystemWalker.listDirRecursivelyWithFileIO( + mockIO, baseDir, null, fileInfo -> true, foundFiles::add); + return foundFiles; } private static class StaticPrefixFileIO implements SupportsPrefixOperations {