Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions apps/backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ DEALBOT_MAINTENANCE_WINDOW_MINUTES=20
DEALS_PER_SP_PER_HOUR=2
DATASET_CREATIONS_PER_SP_PER_HOUR=1
RETRIEVALS_PER_SP_PER_HOUR=1
# data_set_lifecycle_check canary: creates a throwaway data set and terminates it each tick
# (defaults: enabled on calibration, disabled on mainnet).
# DATASET_LIFECYCLE_CHECK_ENABLED=true
DATASET_LIFECYCLE_CHECKS_PER_SP_PER_HOUR=1
# 10m: create + upload + terminate + pdpEndEpoch poll (code default when unset: 360).
DATA_SET_LIFECYCLE_CHECK_JOB_TIMEOUT_SECONDS=600
PG_BOSS_LOCAL_CONCURRENCY=20
JOB_SCHEDULER_POLL_SECONDS=300
JOB_WORKER_POLL_SECONDS=60
Expand Down
7 changes: 7 additions & 0 deletions apps/backend/src/common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,10 @@ export const ZERO_ADDRESS = "0x0000000000000000000000000000000000000000";
// 5 MiB expressed in bytes (5 * 1024 * 1024).
export const MAX_BLOCK_SIZE = 5 * 1024 * 1024;

// Result of stringToHex("dev"); the helper is imported outside this view.
// NOTE(review): presumably a hex-encoded tag marking dev-environment data — confirm against usages.
export const DEV_TAG = stringToHex("dev");

/**
 * Metadata key stamped on every throwaway data set created by the
 * `data_set_lifecycle_check` job.
 *
 * The key is fixed; the value stored under it is a per-run nonce. Operators use the
 * key as the stable handle to list and sweep leaked sets, i.e. sets left behind by
 * runs where creation succeeded but termination failed.
 */
export const LIFECYCLE_CHECK_METADATA_KEY = "dealbotLifecycleCheck";
39 changes: 39 additions & 0 deletions apps/backend/src/config/app.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,12 @@ export const configValidationSchema = Joi.object({
// Per-hour limits are guardrails to avoid excessive background load.
DEALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(4),
DATASET_CREATIONS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1),
DATASET_LIFECYCLE_CHECKS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1),
RETRIEVALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(2),
// Enables the data_set_lifecycle_check canary job. The network-dependent default (true on
// calibration, false on mainnet) is resolved in loadConfig; here we only validate the
// type when explicitly set. See docs/checks/data-set-lifecycle-check.md.
DATASET_LIFECYCLE_CHECK_ENABLED: Joi.boolean().optional(),
// Polling interval for pg-boss scheduler (lower = more responsive, higher = less DB chatter).
JOB_SCHEDULER_POLL_SECONDS: Joi.number().min(60).default(300),
JOB_WORKER_POLL_SECONDS: Joi.number().min(5).default(60),
Expand All @@ -93,6 +98,7 @@ export const configValidationSchema = Joi.object({
DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes)
RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds)
DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs
DATA_SET_LIFECYCLE_CHECK_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes: covers create + seed-piece upload + terminate + pdpEndEpoch poll
// Seconds to hold the process alive after pg-boss drain completes, so Prometheus
// captures at least one scrape of the terminal counter increments emitted during
// shutdown. Default 35 covers the 30s ServiceMonitor interval plus a 5s buffer.
Expand Down Expand Up @@ -226,6 +232,17 @@ export interface IJobsConfig {
* Target number of dataset creation runs per storage provider per hour.
*/
dataSetCreationsPerSpPerHour: number;
/**
* Enables the `data_set_lifecycle_check` canary job, which creates a
* throwaway data set and immediately terminates it in a single tick.
*
* Defaults to true on calibration and false on mainnet.
*/
dataSetLifecycleCheckEnabled: boolean;
/**
* Target number of dataset lifecycle check runs per storage provider per hour.
*/
dataSetLifecycleChecksPerSpPerHour: number;
/**
* How often the scheduler polls Postgres for due jobs (seconds).
*
Expand Down Expand Up @@ -284,6 +301,13 @@ export interface IJobsConfig {
* Uses AbortController to actively cancel job execution.
*/
dataSetCreationJobTimeoutSeconds: number;
/**
* Maximum runtime (seconds) for data-set lifecycle check jobs before forced abort.
*
* Bounds the create-with-seed-piece upload, the terminateService call, and the
* `pdpEndEpoch != 0` confirmation poll. Uses AbortController to actively cancel execution.
*/
dataSetLifecycleCheckJobTimeoutSeconds: number;
/**
* Maximum runtime (seconds) for retrieval jobs before forced abort.
*
Expand Down Expand Up @@ -473,6 +497,17 @@ export function loadConfig(): IConfig {
dealsPerSpPerHour: Number.parseFloat(process.env.DEALS_PER_SP_PER_HOUR || "4"),
retrievalsPerSpPerHour: Number.parseFloat(process.env.RETRIEVALS_PER_SP_PER_HOUR || "2"),
dataSetCreationsPerSpPerHour: Number.parseFloat(process.env.DATASET_CREATIONS_PER_SP_PER_HOUR || "1"),
dataSetLifecycleCheckEnabled: (() => {
  // Explicit on/off override for the data_set_lifecycle_check canary job.
  // Trim once so the blank check and the comparison below see the same value.
  const raw = process.env.DATASET_LIFECYCLE_CHECK_ENABLED?.trim();
  if (raw == null || raw.length === 0) {
    // Default: enabled on calibration, disabled on mainnet.
    return (process.env.NETWORK || "calibration") === "calibration";
  }
  // Compare case-insensitively: the Joi.boolean() rule for this variable accepts
  // "TRUE"/"True" as well as "true", so a strict === "true" check would silently
  // disable the job for values that passed validation.
  return raw.toLowerCase() === "true";
})(),
dataSetLifecycleChecksPerSpPerHour: Number.parseFloat(
process.env.DATASET_LIFECYCLE_CHECKS_PER_SP_PER_HOUR || "1",
),
schedulerPollSeconds: Number.parseInt(process.env.JOB_SCHEDULER_POLL_SECONDS || "300", 10),
workerPollSeconds: Number.parseInt(process.env.JOB_WORKER_POLL_SECONDS || "60", 10),
pgbossLocalConcurrency: Number.parseInt(process.env.PG_BOSS_LOCAL_CONCURRENCY || "20", 10),
Expand All @@ -484,6 +519,10 @@ export function loadConfig(): IConfig {
dealJobTimeoutSeconds: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10),
retrievalJobTimeoutSeconds: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10),
dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10),
dataSetLifecycleCheckJobTimeoutSeconds: Number.parseInt(
process.env.DATA_SET_LIFECYCLE_CHECK_JOB_TIMEOUT_SECONDS || "360",
10,
),
shutdownFinalScrapeDelaySeconds: Number.parseInt(process.env.SHUTDOWN_FINAL_SCRAPE_DELAY_SECONDS || "35", 10),
pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)),
maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export type JobType =
| "deal"
| "retrieval"
| "data_set_creation"
| "data_set_lifecycle_check"
| "pull_check"
| "providers_refresh"
| "data_retention_poll"
Expand Down
84 changes: 84 additions & 0 deletions apps/backend/src/deal/deal.service.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { DealAddonsService } from "../deal-addons/deal-addons.service.js";
import { DealPreprocessingResult } from "../deal-addons/types.js";
import {
DataSetCreationCheckMetrics,
DataSetLifecycleCheckMetrics,
DataStorageCheckMetrics,
RetrievalCheckMetrics,
} from "../metrics-prometheus/check-metrics.service.js";
Expand Down Expand Up @@ -169,6 +170,10 @@ describe("DealService", () => {
observeCheckDuration: vi.fn(),
recordStatus: vi.fn(),
};
// Stand-in registered for the DataSetLifecycleCheckMetrics provider in the testing modules.
// Both methods are vi.fn() spies so tests can assert on the recorded status and duration.
const mockDataSetLifecycleCheckMetrics = {
observeCheckDuration: vi.fn(),
recordStatus: vi.fn(),
};

beforeEach(async () => {
const module: TestingModule = await Test.createTestingModule({
Expand All @@ -184,6 +189,7 @@ describe("DealService", () => {
{ provide: DataStorageCheckMetrics, useValue: mockDataStorageMetrics },
{ provide: RetrievalCheckMetrics, useValue: mockRetrievalMetrics },
{ provide: DataSetCreationCheckMetrics, useValue: mockDataSetCreationMetrics },
{ provide: DataSetLifecycleCheckMetrics, useValue: mockDataSetLifecycleCheckMetrics },
{ provide: ClickhouseService, useValue: { insert: vi.fn(), probeLocation: "test" } },
{ provide: DatasetLivenessService, useValue: mockDatasetLivenessService },
],
Expand Down Expand Up @@ -1068,6 +1074,7 @@ describe("DealService", () => {
{ provide: DataStorageCheckMetrics, useValue: mockDataStorageMetrics },
{ provide: RetrievalCheckMetrics, useValue: mockRetrievalMetrics },
{ provide: DataSetCreationCheckMetrics, useValue: mockDataSetCreationMetrics },
{ provide: DataSetLifecycleCheckMetrics, useValue: mockDataSetLifecycleCheckMetrics },
{ provide: ClickhouseService, useValue: { insert: vi.fn(), probeLocation: "test" } },
{ provide: DatasetLivenessService, useValue: mockDatasetLivenessService },
],
Expand Down Expand Up @@ -1445,6 +1452,83 @@ describe("DealService", () => {
});
});

// Suite for DealService.runDataSetLifecycleCheck: one success path (create + terminate)
// and one path where the abort signal has already fired before the call.
describe("runDataSetLifecycleCheck", () => {
beforeEach(() => {
// Every test in this suite sees an approved provider with id 1n.
vi.spyOn(mockWalletSdkService, "getProviderInfo").mockReturnValue({
id: 1n,
name: "sp",
isApproved: true,
} as any);
});

it("creates a throwaway data set, terminates it, and records only lifecycle metrics", async () => {
const terminateMock = vi.fn().mockResolvedValue("0xhash");
// Minimal Synapse double: createContext yields data set 9n, terminateDataSet resolves
// to a tx hash, and the receipt wait reports success.
const synapseMock = {
storage: {
createContext: vi.fn().mockResolvedValue({ dataSetId: 9n }),
terminateDataSet: terminateMock,
},
client: { waitForTransactionReceipt: vi.fn().mockResolvedValue({ status: "success" }) },
};
vi.spyOn(service as any, "createSynapseInstance").mockImplementation(() => synapseMock as unknown as Synapse);
// Fake the upload: hand the onProgress callback to triggerUploadProgress (a helper defined
// outside this view), then resolve with a fixed piece result.
(executeUpload as Mock).mockImplementation(async (_s, _d, _r, options) => {
await triggerUploadProgress(options?.onProgress);
return { pieceCid: "bafk-seed", pieceId: 1, transactionHash: "0xhash" };
});

// getDataSet: first probe inside ensureDataSetTerminated, then the confirmation poll.
// Order matters: the first call reports "not terminated" (0n), the second a non-zero end epoch.
mockWarmStorageService.getDataSet.mockResolvedValueOnce({ pdpEndEpoch: 0n });
mockWarmStorageService.getDataSet.mockResolvedValueOnce({ pdpEndEpoch: 4321n });

// Third argument is the abort signal (none here).
// NOTE(review): the fourth argument (5_000) is presumably a poll/timeout bound in ms — confirm against the service signature.
const result = await service.runDataSetLifecycleCheck(
"0xaaa",
{ dealbotLifecycleCheck: "nonce-1" },
undefined,
5_000,
);

// The marker metadata must be forwarded to context creation unchanged.
expect(synapseMock.storage.createContext).toHaveBeenCalledWith(
expect.objectContaining({ metadata: { dealbotLifecycleCheck: "nonce-1" } }),
);
expect(terminateMock).toHaveBeenCalledWith({ dataSetId: 9n });
// The returned end epoch comes from the second getDataSet response.
expect(result).toEqual({ dataSetId: 9n, pdpEndEpoch: 4321n });
expect(mockDataSetLifecycleCheckMetrics.recordStatus).toHaveBeenCalledWith(
expect.objectContaining({ checkType: "dataSetLifecycleCheck" }),
"success",
);
expect(mockDataSetLifecycleCheckMetrics.observeCheckDuration).toHaveBeenCalledWith(
expect.objectContaining({ checkType: "dataSetLifecycleCheck" }),
expect.any(Number),
);
// The create step must NOT record dataSetCreation metrics (those belong to data_set_creation).
expect(mockDataSetCreationMetrics.recordStatus).not.toHaveBeenCalled();
expect(mockDataSetCreationMetrics.observeCheckDuration).not.toHaveBeenCalled();
// No Deal rows exist for the throwaway set, so no cleanup is attempted.
expect(dealRepoMock.save).not.toHaveBeenCalled();
});

it("records failure.timedout and rethrows when the signal is already aborted", async () => {
const createContextMock = vi.fn().mockResolvedValue({ dataSetId: 9n });
const synapseMock = {
storage: { createContext: createContextMock, terminateDataSet: vi.fn() },
client: { waitForTransactionReceipt: vi.fn() },
};
vi.spyOn(service as any, "createSynapseInstance").mockImplementation(() => synapseMock as unknown as Synapse);

// Abort before the call so the service sees an already-aborted signal on entry.
const controller = new AbortController();
controller.abort(new Error("Data set lifecycle check job timeout (600s)"));

// Only asserts that the call rejects; the specific error type/message is not pinned here.
await expect(
service.runDataSetLifecycleCheck("0xaaa", { dealbotLifecycleCheck: "nonce-2" }, controller.signal, 5_000),
).rejects.toThrow();

// An aborted run is reported under the lifecycle check's "failure.timedout" status.
expect(mockDataSetLifecycleCheckMetrics.recordStatus).toHaveBeenCalledWith(
expect.objectContaining({ checkType: "dataSetLifecycleCheck" }),
"failure.timedout",
);
});
});

describe("createDeal isLive guard", () => {
it("throws DealJobTerminatedDataSetError when data set is PDP-terminated; no metrics or save", async () => {
const providerInfo: PDPProviderEx = {
Expand Down
Loading