diff --git a/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java b/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java index 9cd4906952cc..479c8e221e12 100644 --- a/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java +++ b/core/src/main/java/org/apache/iceberg/util/FileSystemWalker.java @@ -160,18 +160,21 @@ public static void listDirRecursivelyWithHadoop( * @return {@code true} if the path is hidden, {@code false} otherwise */ private static boolean isHiddenPath(String baseDir, Path path, PathFilter pathFilter) { - boolean isHiddenPath = false; Path currentPath = path; - while (currentPath.getParent().toString().contains(baseDir)) { + Path parent = currentPath.getParent(); + // Walk up the path hierarchy while the parent directory is still within baseDir. + // Null-check the parent to avoid NPE when the walk reaches the storage root + // (e.g., an S3 bucket root such as "s3://bucket/"), whose getParent() returns null. + while (parent != null && parent.toString().contains(baseDir)) { if (!pathFilter.accept(currentPath)) { - isHiddenPath = true; - break; + return true; } - currentPath = currentPath.getParent(); + currentPath = parent; + parent = currentPath.getParent(); } - return isHiddenPath; + return false; } /** diff --git a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java index a748a462158c..ba03c073c65a 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java +++ b/core/src/test/java/org/apache/iceberg/util/TestFileSystemWalker.java @@ -19,6 +19,7 @@ package org.apache.iceberg.util; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import java.nio.file.Files; @@ -33,6 +34,10 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.hadoop.HadoopFileIO; import org.apache.iceberg.io.FileInfo; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SupportsPrefixOperations; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; @@ -242,4 +247,98 @@ private void listFilesRecursively( assertThat(remainingDirs).isEmpty(); } + + @Test + public void testListDirRecursivelyWithFileIOBucketRootBaseDir() { + assertThat( + listWithMockFileIO( + "s3://bucket/", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/data/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/top.parquet", 1L, 0L))) + .containsExactlyInAnyOrder( + "s3://bucket/data/a.parquet", "s3://bucket/data/b.parquet", "s3://bucket/top.parquet"); + } + + /** + * Regression test: with a bucket-root base directory, files living under a hidden top-level + * directory (e.g. {@code _temporary}) must still be filtered out, and no NPE should occur. + */ + @Test + public void testListDirRecursivelyWithFileIOBucketRootFiltersHiddenDir() { + assertThat( + listWithMockFileIO( + "s3://bucket/", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_temporary/attempt_x/b.parquet", 1L, 0L), + new FileInfo("s3://bucket/.staging/c.parquet", 1L, 0L))) + .containsExactly("s3://bucket/data/a.parquet"); + } + + /** + * Regression test: when the base directory is passed without a trailing slash (e.g. {@code + * oss://bucket}), the walk should still work correctly without NPE. + */ + @Test + public void testListDirRecursivelyWithFileIOBucketRootNoTrailingSlash() { + assertThat( + listWithMockFileIO( + "s3://bucket", + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L), + new FileInfo("s3://bucket/_hidden/b.parquet", 1L, 0L))) + .containsExactly("s3://bucket/data/a.parquet"); + } + + @Test + public void testListDirRecursivelyWithFileIONullFileLocation() { + assertThatThrownBy( + () -> + listWithMockFileIO( + "s3://bucket/", + new FileInfo(null, 1L, 0L), + new FileInfo("s3://bucket/data/a.parquet", 1L, 0L))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Can not create a Path from a null string"); + } + + private static List listWithMockFileIO(String baseDir, FileInfo... entries) { + SupportsPrefixOperations mockIO = new StaticPrefixFileIO(ImmutableList.copyOf(entries)); + List foundFiles = Lists.newArrayList(); + FileSystemWalker.listDirRecursivelyWithFileIO( + mockIO, baseDir, null, fileInfo -> true, foundFiles::add); + return foundFiles; + } + + private static class StaticPrefixFileIO implements SupportsPrefixOperations { + private final Iterable entries; + + StaticPrefixFileIO(Iterable entries) { + this.entries = entries; + } + + @Override + public Iterable listPrefix(String prefix) { + return entries; + } + + @Override + public void deletePrefix(String prefix) { + throw new UnsupportedOperationException(); + } + + @Override + public InputFile newInputFile(String path) { + throw new UnsupportedOperationException(); + } + + @Override + public OutputFile newOutputFile(String path) { + throw new UnsupportedOperationException(); + } + + @Override + public void deleteFile(String path) { + throw new UnsupportedOperationException(); + } + } }