Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions apps/optimizer-analyzer/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Build script for the Optimizer Analyzer Spring Boot application.
plugins {
  id 'openhouse.springboot-ext-conventions'
  // NOTE: the plugins block cannot reference variables, so this version literal
  // must be kept in sync with springBootVersion below.
  id 'org.springframework.boot' version '2.7.8'
}

// Single source of truth for the Spring Boot dependency version (DRY: it was
// previously repeated on every starter line).
ext.springBootVersion = '2.7.8'

dependencies {
  implementation project(':apps:optimizer')
  implementation "org.springframework.boot:spring-boot-starter:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-webflux:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-data-jpa:${springBootVersion}"
  implementation "org.springframework.boot:spring-boot-starter-aop:${springBootVersion}"
  runtimeOnly 'mysql:mysql-connector-java:8.0.33'
  testImplementation "org.springframework.boot:spring-boot-starter-test:${springBootVersion}"
  testImplementation 'com.squareup.okhttp3:mockwebserver:4.10.0'
  // H2 replaces MySQL for in-memory JPA tests.
  testRuntimeOnly 'com.h2database:h2'
}

test {
  useJUnitPlatform()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.linkedin.openhouse.analyzer;

import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.context.annotation.Bean;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;

/** Entry point for the Optimizer Analyzer application. */
@SpringBootApplication
@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.entity")
@EnableJpaRepositories(basePackages = "com.linkedin.openhouse.optimizer.repository")
public class AnalyzerApplication {

  /** Boots the Spring context; the {@link CommandLineRunner} bean below performs the work. */
  public static void main(String[] args) {
    SpringApplication.run(AnalyzerApplication.class, args);
  }

  /**
   * Bridges Spring Boot startup to the analyzer: invokes {@link AnalyzerRunner#analyze()} exactly
   * once per process invocation. Command-line arguments are intentionally ignored.
   */
  @Bean
  public CommandLineRunner run(AnalyzerRunner runner) {
    return ignoredArgs -> runner.analyze();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import com.linkedin.openhouse.optimizer.entity.TableOperationRow;
import com.linkedin.openhouse.optimizer.repository.TableOperationHistoryRepository;
import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository;
import com.linkedin.openhouse.optimizer.repository.TableStatsRepository;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.Pageable;
import org.springframework.stereotype.Component;

/**
* Core analysis loop. Loads {@code table_stats} rows and evaluates each table against every
* registered {@link OperationAnalyzer} in a single pass.
*
* <p>The two sides of the join — current operations and circuit-breaker history — are loaded into
* memory once per run before the table loop. Both are naturally bounded (only tables with active or
* recently failed operations have rows), so holding them in maps is safe at any table scale.
*
* <p>// TODO: Iterate per-database instead of loading all tables at once. This scopes memory usage
* and allows incremental progress. When we go per-db we may still see 10k tables per iteration, but
* that should be fine.
*
* <p>// TODO: Add benchmarking and scale tests. Measure memory footprint at 10k tables per
* iteration to validate the in-memory join approach.
*/
@Slf4j
@Component
@RequiredArgsConstructor
public class AnalyzerRunner {

private final List<OperationAnalyzer> analyzers;
private final TableStatsRepository statsRepo;
private final TableOperationsRepository operationsRepo;
private final TableOperationHistoryRepository historyRepo;

/** Run the full analysis loop once with no filters. */
public void analyze() {
Comment thread
mkuchenbecker marked this conversation as resolved.
analyze(null, null, null, null);
}

/**
* Run the analysis loop, optionally scoped to a specific operation type, database, table name, or
* table UUID. Pass {@code null} for any parameter to skip that filter.
*/
public void analyze(
String operationType, String databaseName, String tableName, String tableUuid) {

List<OperationAnalyzer> activeAnalyzers =
operationType == null
? analyzers
: analyzers.stream()
.filter(a -> a.getOperationType().equals(operationType))
.collect(Collectors.toList());

// Pre-load the small sides of the joins — one query per analyzer type.
// TODO: Move to a query builder (Criteria API or jOOQ) as filter count grows.
Copy link
Copy Markdown
Collaborator Author

@mkuchenbecker mkuchenbecker Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this todo or move it to where we have a null,null,null when querying the history repo.

Map<String, Map<String, TableOperation>> opsByType =
activeAnalyzers.stream()
.collect(
Collectors.toMap(
OperationAnalyzer::getOperationType,
a ->
operationsRepo
.find(a.getOperationType(), null, tableUuid, databaseName, tableName)
.stream()
.filter(e -> e.getTableUuid() != null)
.collect(
Collectors.toMap(
TableOperationRow::getTableUuid,
TableOperation::from,
TableOperation::mostRecent))));

Map<String, Map<String, List<TableOperationHistoryRow>>> historyByType =
activeAnalyzers.stream()
.collect(
Collectors.toMap(
OperationAnalyzer::getOperationType,
a ->
historyRepo.find(a.getOperationType(), null, null, null, Pageable.unpaged())
.stream()
.collect(
Collectors.groupingBy(TableOperationHistoryRow::getTableUuid))));

List<Table> tables =
statsRepo.find(databaseName, tableName, tableUuid).stream()
.filter(row -> row.getTableUuid() != null)
.map(Table::from)
.collect(Collectors.toList());
log.info("Found {} tables in optimizer table_stats", tables.size());

tables.forEach(
table ->
activeAnalyzers.forEach(
analyzer -> {
if (!analyzer.isEnabled(table)) {
return;
}

Optional<TableOperation> currentOp =
Optional.ofNullable(
opsByType.get(analyzer.getOperationType()).get(table.getTableUuid()));
List<TableOperationHistoryRow> history =
historyByType
.get(analyzer.getOperationType())
.getOrDefault(table.getTableUuid(), Collections.emptyList());
Optional<TableOperationHistoryRow> latestHistory = history.stream().findFirst();

if (analyzer.shouldSchedule(table, currentOp, latestHistory)
&& !analyzer.isCircuitBroken(table.getTableUuid(), history)) {
TableOperation op = TableOperation.pending(table, analyzer.getOperationType());
operationsRepo.save(op.toRow());
log.info(
"Created PENDING {} operation for table {}.{}",
analyzer.getOperationType(),
table.getDatabaseId(),
table.getTableId());
}
}));

log.info("Analysis complete");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.time.Duration;
import java.time.Instant;
import java.util.Optional;
import lombok.RequiredArgsConstructor;

/**
* Encapsulates the time-based scheduling logic shared across operation types. An analyzer delegates
* to {@link CadencePolicy} to decide whether to re-issue a recommendation for a table that already
* has an active operation record and/or history.
*
* <p>The SCHEDULED timeout is a key safety mechanism: if a Spark job crashes without reporting
* back, the SCHEDULED row would otherwise block the table forever. When the row has been SCHEDULED
* (or SCHEDULING) longer than {@code scheduledTimeout}, the Analyzer treats it as stale and returns
* {@code true}, causing a new PENDING row to be inserted.
*/
@RequiredArgsConstructor
public class CadencePolicy {

/**
* How long to wait after a successful operation before re-evaluating the table. For example, if
* set to 24 hours and OFD succeeded at 10:00 AM Monday, the table won't be scheduled again until
* after 10:00 AM Tuesday.
*/
private final Duration successRetryInterval;

/**
* How long to wait after a failed operation before retrying. Shorter than the success interval to
* allow quick recovery. For example, if set to 1 hour and OFD failed at 2:00 PM, the table
* becomes eligible for retry at 3:00 PM.
*/
private final Duration failureRetryInterval;

/**
* Maximum time a row can stay in SCHEDULED status before the analyzer treats it as stale and
* overwrites it with a new PENDING row. Handles the case where a Spark job crashes without
* reporting back. For example, if set to 6 hours and a job was submitted at noon but never
* completed, the analyzer will re-schedule the table after 6:00 PM.
*/
private final Duration scheduledTimeout;
Comment thread
mkuchenbecker marked this conversation as resolved.

/**
* Returns {@code true} if a new or refreshed operation record should be upserted.
*
* @param currentOp the existing active operation record, or empty if none exists
* @param latestHistory the most recent history entry for this (table, type), or empty
*/
public boolean shouldSchedule(
Optional<TableOperation> currentOp, Optional<TableOperationHistoryRow> latestHistory) {
if (currentOp.isEmpty()) {
return decideFromHistory(latestHistory);
}
TableOperation op = currentOp.get();
switch (op.getStatus()) {
case "PENDING":
case "SCHEDULING":
return false;
case "SCHEDULED":
if (latestHistory.isEmpty()) {
return pastInterval(op.getScheduledAt(), scheduledTimeout);
}
return decideFromHistoryEntry(latestHistory.get());
default:
return true;
}
}

private boolean decideFromHistory(Optional<TableOperationHistoryRow> latestHistory) {
if (latestHistory.isEmpty()) {
return true;
}
return decideFromHistoryEntry(latestHistory.get());
}

private boolean decideFromHistoryEntry(TableOperationHistoryRow entry) {
switch (entry.getStatus()) {
case "SUCCESS":
return pastInterval(entry.getSubmittedAt(), successRetryInterval);
case "FAILED":
return pastInterval(entry.getSubmittedAt(), failureRetryInterval);
default:
return true;
}
}

private boolean pastInterval(Instant timestamp, Duration interval) {
return timestamp == null || Duration.between(timestamp, Instant.now()).compareTo(interval) > 0;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.util.List;
import java.util.Optional;

/**
 * Strategy interface for a single operation type. Each implementation decides whether a given table
 * needs an operation recommendation upserted in the Optimizer Service.
 */
public interface OperationAnalyzer {

  /** The operation type this analyzer handles (e.g., {@code "ORPHAN_FILES_DELETION"}). */
  String getOperationType();

  /**
   * Returns {@code true} if this operation is opted-in for the given table. Tables that return
   * {@code false} are skipped entirely — no upsert is issued.
   */
  boolean isEnabled(Table table);

  /**
   * Returns {@code true} if a new or refreshed operation record should be upserted.
   *
   * @param table the table entry
   * @param currentOp the existing active operation record, or empty if none exists
   * @param latestHistory the most recent history entry for this (table, type), or empty
   */
  boolean shouldSchedule(
      Table table,
      Optional<TableOperation> currentOp,
      Optional<TableOperationHistoryRow> latestHistory);

  /**
   * Maximum number of consecutive FAILED history entries before the circuit breaker trips and
   * scheduling is suppressed for this (table, operation_type). Override per operation type. Returns
   * 0 to disable the circuit breaker.
   */
  default int getCircuitBreakerThreshold() {
    return 5;
  }

  /**
   * Returns {@code true} if the circuit breaker has tripped for this table. The default
   * implementation trips when the {@code getCircuitBreakerThreshold()} most recent history entries
   * are all FAILED; fewer entries than the threshold can never trip it. Individual analyzers can
   * override this to implement different strategies (e.g., time-based backoff).
   *
   * <p>// TODO: Add circuit breaker reset with exponential backoff so tables can recover
   * automatically after a cooldown period instead of staying tripped permanently.
   *
   * <p>// TODO: Add a communication path to surface tripped circuit breakers to users (e.g.,
   * metrics, alerts, or a dashboard query).
   *
   * @param tableUuid the table whose history to check
   * @param history recent history entries for this (table, type), newest first
   */
  default boolean isCircuitBroken(String tableUuid, List<TableOperationHistoryRow> history) {
    int windowSize = getCircuitBreakerThreshold();
    if (windowSize <= 0 || history.size() < windowSize) {
      return false;
    }
    // Any non-FAILED entry inside the window means the breaker stays closed.
    for (TableOperationHistoryRow entry : history.subList(0, windowSize)) {
      if (!"FAILED".equals(entry.getStatus())) {
        return false;
      }
    }
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package com.linkedin.openhouse.analyzer;

import com.linkedin.openhouse.analyzer.model.Table;
import com.linkedin.openhouse.analyzer.model.TableOperation;
import com.linkedin.openhouse.optimizer.entity.TableOperationHistoryRow;
import java.time.Duration;
import java.util.Optional;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/** Analyzer for the {@code ORPHAN_FILES_DELETION} operation type. */
@Component
public class OrphanFilesDeletionAnalyzer implements OperationAnalyzer {

  static final String OPERATION_TYPE = "ORPHAN_FILES_DELETION";
  static final String OFD_ENABLED_PROPERTY = "maintenance.optimizer.ofd.enabled";

  private final CadencePolicy cadencePolicy;

  /**
   * Spring-managed constructor: builds the cadence policy from hour-granularity configuration
   * properties, each with a sensible default (24h after success, 1h after failure, 6h stale
   * timeout).
   */
  @Autowired
  public OrphanFilesDeletionAnalyzer(
      @Value("${ofd.success-retry-hours:24}") long successRetryHours,
      @Value("${ofd.failure-retry-hours:1}") long failureRetryHours,
      @Value("${ofd.scheduled-timeout-hours:6}") long scheduledTimeoutHours) {
    Duration successInterval = Duration.ofHours(successRetryHours);
    Duration failureInterval = Duration.ofHours(failureRetryHours);
    Duration staleTimeout = Duration.ofHours(scheduledTimeoutHours);
    this.cadencePolicy = new CadencePolicy(successInterval, failureInterval, staleTimeout);
  }

  /** Package-private for tests that supply a pre-built {@link CadencePolicy}. */
  OrphanFilesDeletionAnalyzer(CadencePolicy cadencePolicy) {
    this.cadencePolicy = cadencePolicy;
  }

  @Override
  public String getOperationType() {
    return OPERATION_TYPE;
  }

  /** OFD is strictly opt-in: only the exact string "true" in table properties enables it. */
  @Override
  public boolean isEnabled(Table table) {
    String flag = table.getTableProperties().get(OFD_ENABLED_PROPERTY);
    return "true".equals(flag);
  }

  /** Scheduling cadence is fully delegated to the shared {@link CadencePolicy}. */
  @Override
  public boolean shouldSchedule(
      Table table,
      Optional<TableOperation> currentOp,
      Optional<TableOperationHistoryRow> latestHistory) {
    return cadencePolicy.shouldSchedule(currentOp, latestHistory);
  }
}
Loading