Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions apps/optimizer-analyzer/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Build script for the Optimizer Analyzer Spring Boot application.
plugins {
  id 'openhouse.springboot-ext-conventions'
  // NOTE: the plugins block cannot reference variables, so this version literal
  // must be kept in sync with springBootVersion below.
  id 'org.springframework.boot' version '2.7.8'
}

// Single source of truth for the Spring Boot dependency version (DRY: it was
// previously repeated on every starter line).
ext.springBootVersion = '2.7.8'

dependencies {
  implementation project(':apps:optimizer')
  implementation "org.springframework.boot:spring-boot-starter:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-webflux:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-data-jpa:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-aop:${springBootVersion}"
  runtimeOnly 'mysql:mysql-connector-java:8.0.33'
  testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
  testImplementation 'com.squareup.okhttp3:mockwebserver:4.10.0'
  // H2 replaces MySQL for in-memory JPA tests.
  testRuntimeOnly 'com.h2database:h2'
}

test {
  useJUnitPlatform()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.linkedin.openhouse.analyzer;

import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.context.annotation.Bean;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;

/** Entry point for the Optimizer Analyzer application. */
@SpringBootApplication
@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.entity")
@EnableJpaRepositories(basePackages = "com.linkedin.openhouse.optimizer.repository")
public class AnalyzerApplication {

  /** Boots the Spring context; the {@link CommandLineRunner} bean below performs the work. */
  public static void main(String[] args) {
    SpringApplication.run(AnalyzerApplication.class, args);
  }

  /**
   * Bridges Spring Boot startup to the analyzer: invokes {@link AnalyzerRunner#analyze()} exactly
   * once per process invocation. Command-line arguments are intentionally ignored.
   */
  @Bean
  public CommandLineRunner run(AnalyzerRunner runner) {
    return ignoredArgs -> runner.analyze();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import com.linkedin.openhouse.optimizer.entity.TableOperationRow;
import com.linkedin.openhouse.optimizer.repository.TableOperationHistoryRepository;
import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository;
import com.linkedin.openhouse.optimizer.repository.TableStatsRepository;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.Pageable;
import org.springframework.stereotype.Component;

/**
* Core analysis loop. Loads {@code table_stats} rows and evaluates each table against every
* registered {@link OperationAnalyzer} in a single pass.
*
* <p>The two sides of the join — current operations and circuit-breaker history — are loaded into
* memory once per run before the table loop. Both are naturally bounded (only tables with active or
* recently failed operations have rows), so holding them in maps is safe at any table scale.
*
* <p>// TODO: Iterate per-database instead of loading all tables at once. This scopes memory usage
* and allows incremental progress. When we go per-db we may still see 10k tables per iteration, but
* that should be fine.
*
* <p>// TODO: Add benchmarking and scale tests. Measure memory footprint at 10k tables per
* iteration to validate the in-memory join approach.
*/
@Slf4j
@Component
@RequiredArgsConstructor
public class AnalyzerRunner {

private final List<OperationAnalyzer> analyzers;
private final TableStatsRepository statsRepo;
private final TableOperationsRepository operationsRepo;
private final TableOperationHistoryRepository historyRepo;

/** Run the full analysis loop once with no filters. */
public void analyze() {
Comment thread
mkuchenbecker marked this conversation as resolved.
analyze(null, null, null, null);
}

/**
* Run the analysis loop, optionally scoped to a specific operation type, database, table name, or
* table UUID. Pass {@code null} for any parameter to skip that filter.
*/
public void analyze(
String operationType, String databaseName, String tableName, String tableUuid) {

List<OperationAnalyzer> activeAnalyzers =
operationType == null
? analyzers
: analyzers.stream()
.filter(a -> a.getOperationType().equals(operationType))
.collect(Collectors.toList());

// Pre-load the small sides of the joins — one query per analyzer type.
// TODO: Move to a query builder (Criteria API or jOOQ) as filter count grows.
Copy link
Copy Markdown
Collaborator Author

@mkuchenbecker mkuchenbecker Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this todo or move it to where we have a null,null,null when querying the history repo.

Map<String, Map<String, TableOperation>> opsByType =
activeAnalyzers.stream()
.collect(
Collectors.toMap(
OperationAnalyzer::getOperationType,
a ->
operationsRepo
.find(a.getOperationType(), null, tableUuid, databaseName, tableName)
.stream()
.filter(e -> e.getTableUuid() != null)
.collect(
Collectors.toMap(
TableOperationRow::getTableUuid,
TableOperation::from,
TableOperation::mostRecent))));

Map<String, Map<String, List<TableOperationHistoryRow>>> historyByType =
activeAnalyzers.stream()
.collect(
Collectors.toMap(
OperationAnalyzer::getOperationType,
a ->
historyRepo.find(a.getOperationType(), null, null, null, Pageable.unpaged())
.stream()
.collect(
Collectors.groupingBy(TableOperationHistoryRow::getTableUuid))));

List<Table> tables =
statsRepo.find(databaseName, tableName, tableUuid).stream()
.filter(row -> row.getTableUuid() != null)
.map(Table::from)
.collect(Collectors.toList());
log.info("Found {} tables in optimizer table_stats", tables.size());

tables.forEach(
table ->
activeAnalyzers.forEach(
analyzer -> {
if (!analyzer.isEnabled(table)) {
return;
}

Optional<TableOperation> currentOp =
Optional.ofNullable(
opsByType.get(analyzer.getOperationType()).get(table.getTableUuid()));
List<TableOperationHistoryRow> history =
historyByType
.get(analyzer.getOperationType())
.getOrDefault(table.getTableUuid(), Collections.emptyList());
Optional<TableOperationHistoryRow> latestHistory = history.stream().findFirst();

if (analyzer.shouldSchedule(table, currentOp, latestHistory)
&& !analyzer.isCircuitBroken(table.getTableUuid(), history)) {
TableOperation op = TableOperation.pending(table, analyzer.getOperationType());
operationsRepo.save(op.toRow());
log.info(
"Created PENDING {} operation for table {}.{}",
analyzer.getOperationType(),
table.getDatabaseId(),
table.getTableId());
}
}));

log.info("Analysis complete");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.time.Duration;
import java.time.Instant;
import java.util.Optional;
import lombok.RequiredArgsConstructor;

/**
* Encapsulates the time-based scheduling logic shared across operation types. An analyzer delegates
* to {@link CadencePolicy} to decide whether to re-issue a recommendation for a table that already
* has an active operation record and/or history.
*
* <p>The SCHEDULED timeout is a key safety mechanism: if a Spark job crashes without reporting
* back, the SCHEDULED row would otherwise block the table forever. When the row has been SCHEDULED
* (or SCHEDULING) longer than {@code scheduledTimeout}, the Analyzer treats it as stale and returns
* {@code true}, causing a new PENDING row to be inserted.
*/
@RequiredArgsConstructor
public class CadencePolicy {

/**
* How long to wait after a successful operation before re-evaluating the table. For example, if
* set to 24 hours and OFD succeeded at 10:00 AM Monday, the table won't be scheduled again until
* after 10:00 AM Tuesday.
*/
private final Duration successRetryInterval;

/**
* How long to wait after a failed operation before retrying. Shorter than the success interval to
* allow quick recovery. For example, if set to 1 hour and OFD failed at 2:00 PM, the table
* becomes eligible for retry at 3:00 PM.
*/
private final Duration failureRetryInterval;

/**
* Maximum time a row can stay in SCHEDULED status before the analyzer treats it as stale and
* overwrites it with a new PENDING row. Handles the case where a Spark job crashes without
* reporting back. For example, if set to 6 hours and a job was submitted at noon but never
* completed, the analyzer will re-schedule the table after 6:00 PM.
*/
private final Duration scheduledTimeout;
Comment thread
mkuchenbecker marked this conversation as resolved.

/**
* Returns {@code true} if a new or refreshed operation record should be upserted.
*
* @param currentOp the existing active operation record, or empty if none exists
* @param latestHistory the most recent history entry for this (table, type), or empty
*/
public boolean shouldSchedule(
Optional<TableOperation> currentOp, Optional<TableOperationHistoryRow> latestHistory) {
if (currentOp.isEmpty()) {
return decideFromHistory(latestHistory);
}
TableOperation op = currentOp.get();
switch (op.getStatus()) {
case "PENDING":
case "SCHEDULING":
return false;
case "SCHEDULED":
if (latestHistory.isEmpty()) {
return pastInterval(op.getScheduledAt(), scheduledTimeout);
}
return decideFromHistoryEntry(latestHistory.get());
default:
return true;
}
}

private boolean decideFromHistory(Optional<TableOperationHistoryRow> latestHistory) {
if (latestHistory.isEmpty()) {
return true;
}
return decideFromHistoryEntry(latestHistory.get());
}

private boolean decideFromHistoryEntry(TableOperationHistoryRow entry) {
switch (entry.getStatus()) {
case "SUCCESS":
return pastInterval(entry.getSubmittedAt(), successRetryInterval);
case "FAILED":
return pastInterval(entry.getSubmittedAt(), failureRetryInterval);
default:
return true;
}
}

private boolean pastInterval(Instant timestamp, Duration interval) {
return timestamp == null || Duration.between(timestamp, Instant.now()).compareTo(interval) > 0;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.util.List;
import java.util.Optional;

/**
 * Strategy interface for a single operation type. Each implementation decides whether a given table
 * needs an operation recommendation upserted in the Optimizer Service.
 */
public interface OperationAnalyzer {

  /** The operation type this analyzer handles (e.g., {@code "ORPHAN_FILES_DELETION"}). */
  String getOperationType();

  /**
   * Returns {@code true} if this operation is opted-in for the given table. Tables that return
   * {@code false} are skipped entirely — no upsert is issued.
   */
  boolean isEnabled(Table table);

  /**
   * Returns {@code true} if a new or refreshed operation record should be upserted.
   *
   * @param table the table entry
   * @param currentOp the existing active operation record, or empty if none exists
   * @param latestHistory the most recent history entry for this (table, type), or empty
   */
  boolean shouldSchedule(
      Table table,
      Optional<TableOperation> currentOp,
      Optional<TableOperationHistoryRow> latestHistory);

  /**
   * Maximum number of consecutive FAILED history entries before the circuit breaker trips and
   * scheduling is suppressed for this (table, operation_type). Override per operation type. Returns
   * 0 to disable the circuit breaker.
   */
  default int getCircuitBreakerThreshold() {
    return 5;
  }

  /**
   * Returns {@code true} if the circuit breaker has tripped for this table. The default
   * implementation trips when the {@code getCircuitBreakerThreshold()} most recent history entries
   * are all FAILED; fewer entries than the threshold can never trip it. Individual analyzers can
   * override this to implement different strategies (e.g., time-based backoff).
   *
   * <p>// TODO: Add circuit breaker reset with exponential backoff so tables can recover
   * automatically after a cooldown period instead of staying tripped permanently.
   *
   * <p>// TODO: Add a communication path to surface tripped circuit breakers to users (e.g.,
   * metrics, alerts, or a dashboard query).
   *
   * @param tableUuid the table whose history to check
   * @param history recent history entries for this (table, type), newest first
   */
  default boolean isCircuitBroken(String tableUuid, List<TableOperationHistoryRow> history) {
    int windowSize = getCircuitBreakerThreshold();
    if (windowSize <= 0 || history.size() < windowSize) {
      return false;
    }
    // Any non-FAILED entry inside the window means the breaker stays closed.
    for (TableOperationHistoryRow entry : history.subList(0, windowSize)) {
      if (!"FAILED".equals(entry.getStatus())) {
        return false;
      }
    }
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.time.Duration;
import java.util.Optional;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/** Analyzer for the {@code ORPHAN_FILES_DELETION} operation type. */
@Component
public class OrphanFilesDeletionAnalyzer implements OperationAnalyzer {

  static final String OPERATION_TYPE = "ORPHAN_FILES_DELETION";
  static final String OFD_ENABLED_PROPERTY = "maintenance.optimizer.ofd.enabled";

  private final CadencePolicy cadencePolicy;

  /**
   * Spring-managed constructor: builds the cadence policy from hour-granularity configuration
   * properties, each with a sensible default (24h after success, 1h after failure, 6h stale
   * timeout).
   */
  @Autowired
  public OrphanFilesDeletionAnalyzer(
      @Value("${ofd.success-retry-hours:24}") long successRetryHours,
      @Value("${ofd.failure-retry-hours:1}") long failureRetryHours,
      @Value("${ofd.scheduled-timeout-hours:6}") long scheduledTimeoutHours) {
    Duration successInterval = Duration.ofHours(successRetryHours);
    Duration failureInterval = Duration.ofHours(failureRetryHours);
    Duration staleTimeout = Duration.ofHours(scheduledTimeoutHours);
    this.cadencePolicy = new CadencePolicy(successInterval, failureInterval, staleTimeout);
  }

  /** Package-private for tests that supply a pre-built {@link CadencePolicy}. */
  OrphanFilesDeletionAnalyzer(CadencePolicy cadencePolicy) {
    this.cadencePolicy = cadencePolicy;
  }

  @Override
  public String getOperationType() {
    return OPERATION_TYPE;
  }

  /** OFD is strictly opt-in: only the exact string "true" in table properties enables it. */
  @Override
  public boolean isEnabled(Table table) {
    String flag = table.getTableProperties().get(OFD_ENABLED_PROPERTY);
    return "true".equals(flag);
  }

  /** Scheduling cadence is fully delegated to the shared {@link CadencePolicy}. */
  @Override
  public boolean shouldSchedule(
      Table table,
      Optional<TableOperation> currentOp,
      Optional<TableOperationHistoryRow> latestHistory) {
    return cadencePolicy.shouldSchedule(currentOp, latestHistory);
  }
}
Loading