From 2297309b64fc30c464a49c216a757da1b985bfb6 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Thu, 2 Jul 2026 11:55:33 +0200 Subject: [PATCH 01/12] feature - add execution observations and adapter coverage checks (#66, #67) --- docs/language/reference/execution_context.md | 44 ++ docs/language/reference/inspection.md | 2 +- docs/release_notes/v0_1.md | 4 + docs/rfcs/032_execution_observations.md | 17 +- .../rfcs/033_adapter_requirements_coverage.md | 10 +- docs/rfcs/README.md | 4 +- src/evidence.incn | 138 +++++ src/inspect.incn | 3 +- src/lib.incn | 12 +- src/prism/mod.incn | 5 + src/session/errors.incn | 1 + src/session/mod.incn | 2 +- src/session/types.incn | 570 +++++++++++++++++- tests/test_execution_observations.incn | 185 ++++++ 14 files changed, 955 insertions(+), 42 deletions(-) create mode 100644 tests/test_execution_observations.incn diff --git a/docs/language/reference/execution_context.md b/docs/language/reference/execution_context.md index e818af8..7eb4e96 100644 --- a/docs/language/reference/execution_context.md +++ b/docs/language/reference/execution_context.md @@ -45,6 +45,37 @@ All read APIs return `LazyFrame[T]`. They create deferred logical work; they do - `execute(...)` proves the plan can bind, lower, and run. - `collect(...)` performs that same work and materializes a local `DataFrame[T]`. +## Execution observations + +Observed execution methods preserve the ordinary session contracts while also returning runtime evidence. They are the +author-facing surface for RFC 032 execution observations. 
+ +| API | Returns | Role | +| -------------------------------- | ------------------------ | -------------------------------------------------------------------- | +| `session.execute_observed(data)` | `ObservedLazyFrame[T]` | Execute and return `data`, `observation`, and `error` fields | +| `session.collect_observed(data)` | `ObservedDataFrame[T]` | Collect and return `data`, `observation`, and `error` fields | +| `session.write_observed(data, target)` | `ObservedWrite` | Write and return `observation` plus an optional `error` | + +The ordinary `execute`, `collect`, and `write` methods use the same execution path internally and keep returning +`Result[...]` values for compact application code. Use the observed variants when an audit, governance, debugging, or +verification flow needs a durable execution attempt record. + +An `ExecutionObservation` records the operation, status, backend name, optional adapter version, requested and observed +semantic profile IDs, plan target, execution-attempt target, client-session context target, Unix nanosecond wall-clock +start/end values from `std.datetime.runtime.SystemTime`, monotonic duration nanoseconds from +`std.datetime.runtime.Instant`, row count or byte count when materialization supplies them, optional trace IDs, +diagnostics, and linked coverage records when present. Observation records do not contain row payloads or backend logs +by default. + +```incan +observed = session.collect_observed(summary) + +assert observed.observation.status == ExecutionObservationStatus.Success +match observed.data: + Some(df) => println(df.preview_text()) + None => println(observed.observation.diagnostics[0].message) +``` + ## Write surface | API | Returns | Notes | @@ -57,6 +88,19 @@ All read APIs return `LazyFrame[T]`. They create deferred logical work; they do These writes are Session-owned. They do not bypass the execution context even when the input is deferred. 
+## Adapter coverage + +`session.check_coverage(requirements)` accepts explicit `AdapterRequirement` records and returns one +`AdapterCoverageRecord` per requirement. This is the current RFC 033 coverage surface. It does not infer requirements +from every plan shape yet; callers must pass the requirements they want evaluated. + +Coverage states are conservative: + +- `covered` means the selected adapter is known to cover that requirement family. +- `partially_covered` means support depends on the concrete function, plan shape, or restriction. +- `uncovered` means the selected adapter is known not to provide that guarantee. +- `unknown` means InQL has not classified coverage; consumers must not treat it as enforced behavior. + ## Active-session convenience | API | Returns | Purpose | diff --git a/docs/language/reference/inspection.md b/docs/language/reference/inspection.md index d39f560..6a2652d 100644 --- a/docs/language/reference/inspection.md +++ b/docs/language/reference/inspection.md @@ -72,4 +72,4 @@ def inspect_paid_spend(orders: LazyFrame[Order]) -> None: Inspection is read-only and plan-local. It does not execute the plan, inspect DataFusion physical plans, read catalog metadata, emit files, or make governance decisions. -The first implementation computes local Prism plan graph, schema flow, lineage graph, public version/schema metadata attachments, diagnostics shape, and unsupported-evidence markers. Semantic profiles, ingress mappings, client-session context, frontend coverage, execution observations, adapter coverage, quality observations, policy checkpoints, governed bundles, and external exchange bridges remain owned by their RFCs and are not silently inferred by this API. +The first implementation computes local Prism plan graph, schema flow, lineage graph, public version/schema metadata attachments, diagnostics shape, and unsupported-evidence markers. 
Session execution observations and explicit adapter coverage checks are exposed through the execution context rather than through plan inspection. Semantic profiles, ingress mappings, client-session context, frontend coverage, quality observations, policy checkpoints, governed bundles, and external exchange bridges remain owned by their RFCs and are not silently inferred by this API. diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index 83cbd14..3df56fd 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -41,6 +41,10 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Substrait internals:** RFC 002 helpers are now split into focused owner modules for relation building, plan assembly, inspection, schema registry, extension bookkeeping, and expression lowering instead of one `substrait.plan` godmodule. - **Prism:** `LazyFrame` lowering applies safe canonical rewrites (`Filter(true)` elimination and adjacent `Limit`/`Project`/`OrderBy` collapse) before RFC 002 plan emission. - **Inspection:** RFCs 028–031 now have a first local evidence spine for Prism-backed `LazyFrame` plans. `inspect_plan(...)` and `inspect_lineage(...)` expose semantic targets, output schema, authored and rewritten Prism node records, lineage edges, artifact-family summaries, metadata attachment records, diagnostics shape, and explicit unsupported-evidence markers without executing or backend-binding the plan. +- **Execution observations and adapter coverage:** RFC 032 adds observed `Session` variants for `execute`, `collect`, + and `write` so callers can capture structured runtime evidence for success and failure attempts. RFC 033 adds adapter + requirement and coverage records plus `Session.check_coverage(requirements)` for explicit capability checks with + covered, partially covered, uncovered, and unknown states. 
- **Execution:** Session-oriented read, execute, and write (reference backend per RFC 004), with `collect(...)` now producing structured `DataFrame` materialization metadata plus preview text instead of treating rendered text as the canonical contract. Session execution dispatch now routes through a backend adapter boundary over Substrait plans; DataFusion remains the first adapter rather than being encoded directly into Session state. - **Session API:** `Session.write(data, target)` now accepts typed sink descriptors such as `csv_sink(uri)` and `parquet_sink(uri)`, while the file-specific `write_csv(...)` and `write_parquet(...)` helpers remain as convenience methods. - **Documentation:** Current package behavior is documented under `docs/language/`, while RFCs remain design records rather than implementation diaries. diff --git a/docs/rfcs/032_execution_observations.md b/docs/rfcs/032_execution_observations.md index f082f25..db23591 100644 --- a/docs/rfcs/032_execution_observations.md +++ b/docs/rfcs/032_execution_observations.md @@ -1,6 +1,6 @@ # InQL RFC 032: Execution observations -- **Status:** Draft +- **Status:** In Progress - **Created:** 2026-05-29 - **Author(s):** Danny Meijer (@dannymeijer) - **Related:** @@ -42,11 +42,11 @@ After a plan executes, users and tools need evidence about what was attempted an An author can collect data and then inspect the observation: ```incan -result = session.collect(summary) -observation = result.execution_observation() +observed = session.collect_observed(summary) +observation = observed.observation -assert observation.plan_id == inspect_plan(summary).plan_id -assert observation.status == "success" +assert observation.plan_target.target_id == inspect_plan(summary).plan_id +assert observation.status == ExecutionObservationStatus.Success ``` Execution evidence explains the run. It does not replace plan inspection. 
@@ -81,6 +81,13 @@ Quality observations, adapter coverage records, semantic profile records, and ev Existing session execution remains valid. Implementations may initially emit partial observations, but unsupported fields must be explicit rather than silently omitted when consumers request them. +The first implementation adds observed variants for `execute`, `collect`, and `write` while preserving the ordinary +`Result[...]`-returning session APIs. Observed variants return success and failure observations. Wall-clock fields are +Unix nanoseconds from Incan's `std.datetime.runtime.SystemTime`, and duration uses monotonic elapsed nanoseconds from +`std.datetime.runtime.Instant`. The model exposes adapter version, requested and observed semantic profile IDs, byte +count, and trace IDs explicitly; the initial DataFusion path reports `None` or empty values for those fields rather than +fabricating unavailable evidence. + ## Alternatives considered - **Use backend logs only.** Rejected because logs are not stable semantic evidence and may be sensitive. 
diff --git a/docs/rfcs/033_adapter_requirements_coverage.md b/docs/rfcs/033_adapter_requirements_coverage.md index 4ed5eeb..f78e8fc 100644 --- a/docs/rfcs/033_adapter_requirements_coverage.md +++ b/docs/rfcs/033_adapter_requirements_coverage.md @@ -1,6 +1,6 @@ # InQL RFC 033: Adapter requirements and coverage -- **Status:** Draft +- **Status:** In Progress - **Created:** 2026-05-29 - **Author(s):** Danny Meijer (@dannymeijer) - **Related:** @@ -55,10 +55,10 @@ for requirement in inspection.adapter_requirements(): print(requirement.capability, requirement.guarantee_level) ``` -A session can then report whether the selected adapter covers them: +A session can then report whether the selected adapter covers explicit requirements: ```incan -coverage = session.check_coverage(summary) +coverage = session.check_coverage([requirement]) ``` If coverage is unknown for a requirement whose guarantee level is required, tools should not present that as enforced behavior. @@ -104,6 +104,10 @@ Function registry entries, semi-structured functions, extensions, quality assert Existing adapters may initially report unknown coverage for capabilities they do not declare. Consumers must distinguish unknown from covered. +The first implementation provides the adapter requirement and coverage record vocabulary plus +`Session.check_coverage(requirements)`. Requirement inference from arbitrary inspection records is still open +RFC 033 work; the current API evaluates requirements that callers pass explicitly. + ## Alternatives considered - **Fail only at backend runtime.** Rejected because users need pre-execution visibility when possible. 
diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index d2997d9..c689711 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -38,8 +38,8 @@ InQL uses its **own** RFC series (starting at 000), independent of the [Incan la | [029][rfc-029] | In Progress | Typed metadata attachments | | | [030][rfc-030] | In Progress | Prism lineage graph | | | [031][rfc-031] | In Progress | Local inspection APIs and artifacts | | -| [032][rfc-032] | Draft | Execution observations | | -| [033][rfc-033] | Draft | Adapter requirements and coverage | | +| [032][rfc-032] | In Progress | Execution observations | | +| [033][rfc-033] | In Progress | Adapter requirements and coverage | | | [034][rfc-034] | Draft | Quality assertions and observations | | | [035][rfc-035] | Draft | Governed attributes and policy checkpoints | | | [036][rfc-036] | Draft | Governed plan bundle | | diff --git a/src/evidence.incn b/src/evidence.incn index 0df228c..2438db8 100644 --- a/src/evidence.incn +++ b/src/evidence.incn @@ -121,6 +121,78 @@ pub enum EvidenceArtifactStatus(str): Unsupported = "unsupported" +@derive(Clone) +pub enum ExecutionOperationKind(str): + """Session operation families that can emit execution observations.""" + + Execute = "execute" + Collect = "collect" + Write = "write" + + +@derive(Clone) +pub enum ExecutionObservationStatus(str): + """Terminal status for one observed execution attempt.""" + + Success = "success" + Failure = "failure" + Cancelled = "cancelled" + Skipped = "skipped" + Unsupported = "unsupported" + + +@derive(Clone) +pub enum ExecutionDiagnosticSeverity(str): + """Severity class for one structured execution diagnostic.""" + + Info = "info" + Warning = "warning" + Error = "error" + + +@derive(Clone) +pub enum AdapterRequirementCapability(str): + """Adapter capability families that can be required by a semantic plan or evidence policy.""" + + ExtensionFunction = "extension_function" + VariantSemantics = "variant_semantics" + DecimalSemantics = 
"decimal_semantics" + NullSemantics = "null_semantics" + LineagePreservation = "lineage_preservation" + AuditEmission = "audit_emission" + RowFilter = "row_filter" + ColumnMask = "column_mask" + AggregateThreshold = "aggregate_threshold" + RegionBinding = "region_binding" + OrderedExecution = "ordered_execution" + SnapshotCapture = "snapshot_capture" + CanonicalDigest = "canonical_digest" + CrossRelationReconciliation = "cross_relation_reconciliation" + IncrementalWatermark = "incremental_watermark" + VerificationEventStream = "verification_event_stream" + WaiverRecording = "waiver_recording" + CryptographicQueryProof = "cryptographic_query_proof" + + +@derive(Clone) +pub enum AdapterRequirementGuarantee(str): + """How strongly one semantic target depends on an adapter capability.""" + + Required = "required" + Preferred = "preferred" + Optional = "optional" + + +@derive(Clone) +pub enum AdapterCoverageState(str): + """Adapter coverage state for one explicit requirement.""" + + Covered = "covered" + PartiallyCovered = "partially_covered" + Uncovered = "uncovered" + Unknown = "unknown" + + @derive(Clone) pub model SemanticTarget: """ @@ -140,6 +212,72 @@ pub model SemanticTarget: pub ordinal: Option[int] +@derive(Clone) +pub model ExecutionDiagnostic: + """One structured diagnostic attached to an execution observation or coverage record.""" + + pub severity: ExecutionDiagnosticSeverity + pub code: str + pub message: str + pub target: Option[SemanticTarget] + + +@derive(Clone) +pub model AdapterRequirement: + """One explicit adapter capability requirement anchored to a semantic target.""" + + pub requirement_id: str + pub target: SemanticTarget + pub capability: AdapterRequirementCapability + pub guarantee: AdapterRequirementGuarantee + pub reason: str + pub evidence_refs: list[str] + + +@derive(Clone) +pub model AdapterCoverageRecord: + """One adapter coverage answer for an explicit adapter requirement.""" + + pub coverage_id: str + pub requirement: 
AdapterRequirement + pub adapter_name: str + pub adapter_version: Option[str] + pub semantic_profile_id: Option[str] + pub state: AdapterCoverageState + pub diagnostics: list[ExecutionDiagnostic] + pub evidence_refs: list[str] + + +@derive(Clone) +pub model ExecutionObservation: + """ + Runtime evidence for one session execution attempt. + + Wall-clock timestamps are Unix nanoseconds from `std.datetime.runtime.SystemTime`; duration is monotonic elapsed + nanoseconds from `std.datetime.runtime.Instant`, so consumers can sort attempts and measure runtime separately. + """ + + pub observation_id: str + pub attempt_target: SemanticTarget + pub plan_target: SemanticTarget + pub context_targets: list[SemanticTarget] + pub operation: ExecutionOperationKind + pub status: ExecutionObservationStatus + pub backend_name: str + pub adapter_version: Option[str] + pub requested_semantic_profile_id: Option[str] + pub observed_semantic_profile_id: Option[str] + pub started_at_unix_nanoseconds: int + pub ended_at_unix_nanoseconds: int + pub duration_nanoseconds: int + pub row_count: Option[int] + pub byte_count: Option[int] + pub trace_ids: list[str] + pub diagnostics: list[ExecutionDiagnostic] + pub coverage_records: list[AdapterCoverageRecord] + pub evidence_refs: list[str] + + @derive(Clone) pub model MetadataPayload: """Schema-versioned typed payload stored as a compact string value for this evidence slice.""" diff --git a/src/inspect.incn b/src/inspect.incn index 0a93bed..e5a5e49 100644 --- a/src/inspect.incn +++ b/src/inspect.incn @@ -20,6 +20,7 @@ from evidence import ( UnsupportedEvidence, ) from metadata import inql_version +from prism import prism_plan_id from prism.output_columns import authored_output_schema, rewritten_output_schema from prism.rewrite import derive_rewritten_view, rewrite_explain from prism.store import node_at, reachable_node_ids @@ -1019,7 +1020,7 @@ def _field_index(schema: list[ScalarColumnSpec], name: str) -> int: def _plan_id(store_id: 
PrismStoreId, tip_id: int) -> str: """Build a local deterministic plan id for one Prism store/tip snapshot.""" - return f"prism:store-{store_id.0}:tip-{tip_id}" + return prism_plan_id(store_id, tip_id) def _target_id(plan_id: str, kind: str, path: str) -> str: diff --git a/src/lib.incn b/src/lib.incn index c526c18..ff079c5 100644 --- a/src/lib.incn +++ b/src/lib.incn @@ -408,7 +408,17 @@ pub from backends import ( pub from metadata import inql_version pub from session.domain import SinkKind, SinkTarget, csv_sink, parquet_sink, sink_kind_name pub from evidence import ( + AdapterCoverageRecord, + AdapterCoverageState, + AdapterRequirement, + AdapterRequirementCapability, + AdapterRequirementGuarantee, EvidenceArtifactStatus, + ExecutionDiagnostic, + ExecutionDiagnosticSeverity, + ExecutionObservation, + ExecutionObservationStatus, + ExecutionOperationKind, InspectionArtifact, LineageConfidence, LineageEdge, @@ -433,7 +443,7 @@ pub from inspect import ( inspect_lineage, inspect_plan, ) -pub from session.types import Session, SessionBuilder +pub from session.types import ObservedDataFrame, ObservedLazyFrame, ObservedWrite, Session, SessionBuilder pub from session.errors import SessionError, SessionErrorKind, format_session_diagnostic, report_session_error pub from substrait.errors import SubstraitLoweringError, SubstraitLoweringErrorKind pub from substrait.schema import ( diff --git a/src/prism/mod.incn b/src/prism/mod.incn index 716b3d3..1ed17a1 100644 --- a/src/prism/mod.incn +++ b/src/prism/mod.incn @@ -345,6 +345,11 @@ pub def prism_cursor_output_columns[T with Clone](cursor: PrismCursor[T]) -> lis return cursor.planned_columns() +pub def prism_plan_id(store_id: PrismStoreId, tip_id: int) -> str: + """Return the stable local evidence plan id for one authored Prism tip.""" + return f"prism:store-{store_id.0}:tip-{tip_id}" + + pub def lower_prism_tip(store_id: PrismStoreId, tip_id: int) -> Rel: """Lower one authored Prism tip through canonical rewrite view to the 
`Rel` boundary.""" return lower_prism_tip_impl(store_id, tip_id) diff --git a/src/session/errors.incn b/src/session/errors.incn index e80092b..6d5e1b7 100644 --- a/src/session/errors.incn +++ b/src/session/errors.incn @@ -20,6 +20,7 @@ pub enum SessionErrorKind(str): UnknownScalarColumn = "unknown_scalar_column" +@derive(Clone) pub model SessionError: """Typed error envelope for Session-facing APIs.""" diff --git a/src/session/mod.incn b/src/session/mod.incn index 745175e..e85d13c 100644 --- a/src/session/mod.incn +++ b/src/session/mod.incn @@ -12,6 +12,6 @@ Module layout: - `session.datafusion_backend`: DataFusion adapter implementation """ -pub from session.types import Session, SessionBuilder +pub from session.types import ObservedDataFrame, ObservedLazyFrame, ObservedWrite, Session, SessionBuilder pub from session.errors import SessionError, SessionErrorKind, format_session_diagnostic, report_session_error pub from session.domain import SinkKind, SinkTarget, csv_sink, parquet_sink, sink_kind_name diff --git a/src/session/types.incn b/src/session/types.incn index d275340..0548692 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -1,6 +1,7 @@ """Public Session API and execution-context types.""" from rust::substrait::proto import Plan +from std.datetime.runtime import Instant, SystemTime from backends import ( BackendSelection, DataFusion, @@ -15,6 +16,19 @@ from backends import ( parquet_source, ) from dataset import BoundedDataSet, DataFrame, LazyFrame, lazy_frame_named_table +from evidence import ( + AdapterCoverageRecord, + AdapterCoverageState, + AdapterRequirement, + AdapterRequirementCapability, + ExecutionDiagnostic, + ExecutionDiagnosticSeverity, + ExecutionObservation, + ExecutionObservationStatus, + ExecutionOperationKind, + SemanticTarget, + SemanticTargetKind, +) pub from session.errors import SessionError, SessionErrorKind, format_session_diagnostic, report_session_error from session.errors import invalid_registration, invalid_sink 
from session.domain import LogicalName, SourceUri, SinkUri, SinkTarget, csv_sink, parquet_sink @@ -30,9 +44,17 @@ from session.active import ( ) from session.backend_dispatch import backend_collect_plan, backend_execute_plan, backend_write_plan from session.backend_types import BackendError, BackendErrorKind, BackendRegistration +from prism import prism_plan_id from substrait.errors import SubstraitLoweringError, SubstraitLoweringErrorKind from substrait.schema_registry import register_named_table_schema -from substrait.inspect import relation_output_columns, root_rel, read_named_table_name +from substrait.inspect import relation_output_columns, root_rel, read_kind_name, read_named_table_name + + +model _ExecutionClockStart: + """Wall-clock and monotonic clock snapshot for one observed Session operation.""" + + started_at_unix_nanoseconds: int + started_monotonic: Instant @derive(Clone) @@ -45,6 +67,29 @@ pub class SessionRegistration: return SessionRegistration(logical_name=self.logical_name, source=self.source) +pub class ObservedLazyFrame[T with Clone]: + """Observed `Session.execute(...)` result with data on success and diagnostics on failure.""" + + pub data: Option[LazyFrame[T]] + pub observation: ExecutionObservation + pub error: Option[SessionError] + + +pub class ObservedDataFrame[T with Clone]: + """Observed `Session.collect(...)` result with data on success and diagnostics on failure.""" + + pub data: Option[DataFrame[T]] + pub observation: ExecutionObservation + pub error: Option[SessionError] + + +pub class ObservedWrite: + """Observed `Session.write(...)` result with diagnostics when the write fails.""" + + pub observation: ExecutionObservation + pub error: Option[SessionError] + + pub class Session: """Public execution-context entrypoint.""" @@ -166,30 +211,153 @@ pub class Session: def execute[T with Clone](self, data: LazyFrame[T]) -> Result[LazyFrame[T], SessionError]: """Validate and execute one lazy plan while preserving deferred carrier 
shape.""" - plan = _plan_from_lazy_frame(data)? - _validate_named_table_binding(self._registrations, plan)? - - match backend_execute_plan(self._backend, _to_backend_registrations(self._registrations), plan): - Ok(_) => return Ok(data) - Err(err) => return Err(_session_error_from_backend_error(err)) + observed = self.execute_observed(data) + match observed.data: + Some(frame) => return Ok(frame) + None => return Err(_observed_error_or_internal(observed.error)) + + def execute_observed[T with Clone](self, data: LazyFrame[T]) -> ObservedLazyFrame[T]: + """Execute one lazy plan and return structured observation evidence for success or failure.""" + clock = _start_execution_clock() + backend_name = self.backend_name() + plan_target = _lazy_frame_plan_target(data.clone()) + # Keep lowering and validation failures anchored to the authored Prism plan target. Backend errors still attach + # to the same target, but they are produced after the adapter attempt has started. + match _plan_from_lazy_frame(data.clone()): + Err(err) => + return ObservedLazyFrame( + data=None, + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Execute, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(plan) => + match _validate_named_table_binding(self._registrations, plan.clone()): + Err(err) => + return ObservedLazyFrame( + data=None, + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Execute, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(_) => + match backend_execute_plan(self._backend, _to_backend_registrations(self._registrations), plan): + Ok(_) => + return ObservedLazyFrame( + data=Some(data), + observation=_success_observation( + plan_target, + ExecutionOperationKind.Execute, + backend_name, + clock, + None, + ), + error=None, + ) + Err(err) => + session_err = _session_error_from_backend_error(err) + return ObservedLazyFrame( + data=None, + observation=_failure_observation( + plan_target, + 
ExecutionOperationKind.Execute, + backend_name, + clock, + session_err.clone(), + ), + error=Some(session_err), + ) def collect[T with Clone](self, data: LazyFrame[T]) -> Result[DataFrame[T], SessionError]: """Validate and execute one lazy plan, returning a structured materialized DataFrame.""" - plan = _plan_from_lazy_frame(data)? - rel = root_rel(plan.clone()) - _validate_named_table_binding(self._registrations, plan)? - - match backend_collect_plan(self._backend, _to_backend_registrations(self._registrations), plan): - Ok(materialization) => - return Ok( - DataFrame( - _type_witness=_empty_type_witness(), - _materialization=materialization, - _substrait_rel=rel, - _planned_columns=relation_output_columns(rel.clone()), + observed = self.collect_observed(data) + match observed.data: + Some(frame) => return Ok(frame) + None => return Err(_observed_error_or_internal(observed.error)) + + def collect_observed[T with Clone](self, data: LazyFrame[T]) -> ObservedDataFrame[T]: + """Collect one lazy plan and return structured observation evidence for success or failure.""" + clock = _start_execution_clock() + backend_name = self.backend_name() + plan_target = _lazy_frame_plan_target(data.clone()) + # Materialization is the first point where row counts are available, so only successful collects record them. 
+ match _plan_from_lazy_frame(data): + Err(err) => + return ObservedDataFrame( + data=None, + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Collect, + backend_name, + clock, + err.clone(), ), + error=Some(err), ) - Err(err) => return Err(_session_error_from_backend_error(err)) + Ok(plan) => + rel = root_rel(plan.clone()) + match _validate_named_table_binding(self._registrations, plan.clone()): + Err(err) => + return ObservedDataFrame( + data=None, + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Collect, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(_) => + match backend_collect_plan(self._backend, _to_backend_registrations(self._registrations), plan): + Ok(materialization) => + row_count = materialization.row_count + return ObservedDataFrame( + data=Some( + DataFrame( + _type_witness=_empty_type_witness(), + _materialization=materialization, + _substrait_rel=rel, + _planned_columns=relation_output_columns(rel.clone()), + ), + ), + observation=_success_observation( + plan_target, + ExecutionOperationKind.Collect, + backend_name, + clock, + Some(row_count), + ), + error=None, + ) + Err(err) => + session_err = _session_error_from_backend_error(err) + return ObservedDataFrame( + data=None, + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Collect, + backend_name, + clock, + session_err.clone(), + ), + error=Some(session_err), + ) + + def check_coverage(self, requirements: list[AdapterRequirement]) -> list[AdapterCoverageRecord]: + """Return adapter coverage records for explicit semantic requirements without inventing requirements.""" + return [_coverage_record_for_requirement(requirement, self.backend_name()) for requirement in requirements] def write_csv[T with Clone](self, data: LazyFrame[T], uri: str) -> Result[None, SessionError]: """Execute one lazy plan and write result rows to a CSV sink URI.""" @@ -201,17 +369,97 @@ pub class Session: def write[T with 
Clone](self, data: BoundedDataSet[T], target: SinkTarget) -> Result[None, SessionError]: """Execute one bounded dataset and write result rows to a typed sink target.""" - return self._write_plan_to_sink(_plan_from_bounded_dataset(data)?, target) + observed = self.write_observed(data, target) + match observed.error: + Some(err) => return Err(err) + None => return Ok(None) + + def write_observed[T with Clone](self, data: BoundedDataSet[T], target: SinkTarget) -> ObservedWrite: + """Write one bounded dataset and return structured observation evidence for success or failure.""" + clock = _start_execution_clock() + backend_name = self.backend_name() + match _plan_from_bounded_dataset(data): + Err(err) => + plan_target = _unavailable_plan_target(ExecutionOperationKind.Write, clock.started_at_unix_nanoseconds) + return ObservedWrite( + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(plan) => return self._write_plan_to_sink_observed(plan, target, clock) def _write_plan_to_sink(self, plan: Plan, target: SinkTarget) -> Result[None, SessionError]: """Run one validated plan through the selected sink writer and normalize runtime/backend errors.""" - sink_uri = _sink_uri_from_text(target.uri)? - _validate_named_table_binding(self._registrations, plan)? 
- registrations = _to_backend_registrations(self._registrations) - - return _write_result_from_backend_result( - backend_write_plan(self._backend, registrations, plan, sink_uri.0, target.sink_kind), - ) + observed = self._write_plan_to_sink_observed(plan, target, _start_execution_clock()) + match observed.error: + Some(err) => return Err(err) + None => return Ok(None) + + def _write_plan_to_sink_observed( + self, + plan: Plan, + target: SinkTarget, + clock: _ExecutionClockStart, + ) -> ObservedWrite: + """Run one validated write and preserve execution evidence for the write path.""" + backend_name = self.backend_name() + plan_target = _substrait_plan_target(plan.clone()) + match _sink_uri_from_text(target.uri): + Err(err) => + return ObservedWrite( + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(sink_uri) => + match _validate_named_table_binding(self._registrations, plan.clone()): + Err(err) => + return ObservedWrite( + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + err.clone(), + ), + error=Some(err), + ) + Ok(_) => + registrations = _to_backend_registrations(self._registrations) + match backend_write_plan(self._backend, registrations, plan, sink_uri.0, target.sink_kind): + Ok(_) => + return ObservedWrite( + observation=_success_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + None, + ), + error=None, + ) + Err(err) => + session_err = _session_error_from_backend_error(err) + return ObservedWrite( + observation=_failure_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + session_err.clone(), + ), + error=Some(session_err), + ) pub class SessionBuilder: @@ -232,6 +480,272 @@ pub class SessionBuilder: return Session(_backend=self._backend.clone(), _registrations=[]) +def _start_execution_clock() -> _ExecutionClockStart: + 
"""Capture wall-clock and monotonic starts for one execution observation.""" + return _ExecutionClockStart( + started_at_unix_nanoseconds=SystemTime.now().unix_nanoseconds(), + started_monotonic=Instant.now(), + ) + + +def _lazy_frame_plan_target[T with Clone](data: LazyFrame[T]) -> SemanticTarget: + """Build the same Prism plan target used by local inspection for a lazy frame.""" + plan_id = prism_plan_id(data._cursor.store_id, data._cursor.tip_id) + return SemanticTarget( + kind=SemanticTargetKind.Plan, + target_id=plan_id, + plan_id=plan_id, + scope_id="plan", + path=plan_id, + name="plan", + node_id=None, + ordinal=None, + ) + + +def _substrait_plan_target(plan: Plan) -> SemanticTarget: + """Build a fallback plan target for bounded carriers that no longer expose a Prism cursor.""" + rel = root_rel(plan.clone()) + source_name = read_named_table_name(rel.clone()) + root_kind = read_kind_name(rel.clone()) + output_columns = relation_output_columns(rel.clone()) + joined_columns = ",".join(output_columns) + target_id = f"substrait:root:{root_kind}:{source_name}:{joined_columns}" + return SemanticTarget( + kind=SemanticTargetKind.Plan, + target_id=target_id, + plan_id=target_id, + scope_id="substrait-plan", + path=f"root/{root_kind}", + name=_substrait_target_name(source_name, root_kind), + node_id=None, + ordinal=None, + ) + + +def _substrait_target_name(source_name: str, root_kind: str) -> str: + """Choose the most specific fallback target name available for one Substrait root.""" + if len(source_name) > 0: + return source_name + return root_kind + + +def _unavailable_plan_target(operation: ExecutionOperationKind, started_at_unix_nanoseconds: int) -> SemanticTarget: + """Build an explicit unavailable-plan target when lowering fails before a plan exists.""" + target_id = f"plan:unavailable:{operation.value()}:{started_at_unix_nanoseconds}" + return SemanticTarget( + kind=SemanticTargetKind.Plan, + target_id=target_id, + plan_id=target_id, + 
scope_id="plan-unavailable", + path=f"unavailable/{operation.value()}", + name="plan_unavailable", + node_id=None, + ordinal=None, + ) + + +def _success_observation( + plan_target: SemanticTarget, + operation: ExecutionOperationKind, + backend_name: str, + clock: _ExecutionClockStart, + row_count: Option[int], +) -> ExecutionObservation: + """Build a success observation with the operation-specific materialization facts available so far.""" + return _execution_observation( + plan_target, + operation, + ExecutionObservationStatus.Success, + backend_name, + clock, + row_count, + [], + [], + ) + + +def _failure_observation( + plan_target: SemanticTarget, + operation: ExecutionOperationKind, + backend_name: str, + clock: _ExecutionClockStart, + err: SessionError, +) -> ExecutionObservation: + """Build a failure observation with one normalized SessionError diagnostic.""" + return _execution_observation( + plan_target.clone(), + operation, + ExecutionObservationStatus.Failure, + backend_name, + clock, + None, + [_execution_diagnostic_from_session_error(plan_target, err)], + [], + ) + + +def _execution_observation( + plan_target: SemanticTarget, + operation: ExecutionOperationKind, + status: ExecutionObservationStatus, + backend_name: str, + clock: _ExecutionClockStart, + row_count: Option[int], + diagnostics: list[ExecutionDiagnostic], + coverage_records: list[AdapterCoverageRecord], +) -> ExecutionObservation: + """Build one completed execution observation from a start-clock snapshot and terminal status.""" + ended_at_unix_nanoseconds = SystemTime.now().unix_nanoseconds() + attempt_target = _execution_attempt_target( + plan_target.clone(), + operation.clone(), + backend_name, + clock.started_at_unix_nanoseconds, + ) + return ExecutionObservation( + observation_id=attempt_target.target_id, + attempt_target=attempt_target, + plan_target=plan_target, + context_targets=[_client_session_target(plan_target.clone(), backend_name)], + operation=operation, + status=status, + 
backend_name=backend_name, + adapter_version=None, + requested_semantic_profile_id=None, + observed_semantic_profile_id=None, + started_at_unix_nanoseconds=clock.started_at_unix_nanoseconds, + ended_at_unix_nanoseconds=ended_at_unix_nanoseconds, + duration_nanoseconds=clock.started_monotonic.elapsed().total_nanoseconds(), + row_count=row_count, + byte_count=None, + trace_ids=[], + diagnostics=diagnostics, + coverage_records=coverage_records, + evidence_refs=[], + ) + + +def _client_session_target(plan_target: SemanticTarget, backend_name: str) -> SemanticTarget: + """Build a redacted session-context target for one observed execution attempt.""" + target_id = f"{plan_target.target_id}:client-session:{backend_name}" + return SemanticTarget( + kind=SemanticTargetKind.ClientSession, + target_id=target_id, + plan_id=plan_target.plan_id, + scope_id=plan_target.scope_id, + path=f"{plan_target.path}/client-session/{backend_name}", + name=f"session:{backend_name}", + node_id=None, + ordinal=None, + ) + + +def _execution_attempt_target( + plan_target: SemanticTarget, + operation: ExecutionOperationKind, + backend_name: str, + started_at_unix_nanoseconds: int, +) -> SemanticTarget: + """Build the semantic target for one concrete execution attempt.""" + operation_name = operation.value() + target_id = f"{plan_target.target_id}:execution-attempt:{operation_name}:{backend_name}:{started_at_unix_nanoseconds}" + return SemanticTarget( + kind=SemanticTargetKind.ExecutionAttempt, + target_id=target_id, + plan_id=plan_target.plan_id, + scope_id=plan_target.scope_id, + path=f"{plan_target.path}/execution-attempt/{operation_name}", + name=f"{operation_name}:{backend_name}", + node_id=None, + ordinal=None, + ) + + +def _execution_diagnostic_from_session_error(plan_target: SemanticTarget, err: SessionError) -> ExecutionDiagnostic: + """Convert one SessionError into an execution diagnostic anchored to the plan target.""" + return ExecutionDiagnostic( + 
severity=ExecutionDiagnosticSeverity.Error, + code=err.kind.value(), + message=err.message, + target=Some(plan_target), + ) + + +def _coverage_record_for_requirement(requirement: AdapterRequirement, backend_name: str) -> AdapterCoverageRecord: + """Build one conservative adapter coverage record for an explicit requirement.""" + state = _coverage_state_for_requirement(requirement.capability.clone(), backend_name) + return AdapterCoverageRecord( + coverage_id=f"{requirement.requirement_id}:coverage:{backend_name}", + requirement=requirement.clone(), + adapter_name=backend_name, + adapter_version=None, + semantic_profile_id=None, + state=state.clone(), + diagnostics=_coverage_diagnostics(requirement, backend_name, state), + evidence_refs=[], + ) + + +def _coverage_state_for_requirement( + capability: AdapterRequirementCapability, + backend_name: str, +) -> AdapterCoverageState: + """Return the known adapter coverage state for one capability family.""" + if backend_name != "datafusion": + return AdapterCoverageState.Unknown + match capability: + AdapterRequirementCapability.RowFilter => return AdapterCoverageState.Covered + AdapterRequirementCapability.OrderedExecution => return AdapterCoverageState.Covered + AdapterRequirementCapability.NullSemantics => return AdapterCoverageState.Covered + AdapterRequirementCapability.ExtensionFunction => return AdapterCoverageState.PartiallyCovered + AdapterRequirementCapability.LineagePreservation => return AdapterCoverageState.Uncovered + AdapterRequirementCapability.AuditEmission => return AdapterCoverageState.Uncovered + _ => return AdapterCoverageState.Unknown + + +def _coverage_diagnostics( + requirement: AdapterRequirement, + backend_name: str, + state: AdapterCoverageState, +) -> list[ExecutionDiagnostic]: + """Return diagnostics that explain non-covered adapter coverage states.""" + match state: + AdapterCoverageState.Covered => return [] + AdapterCoverageState.PartiallyCovered => + return [ExecutionDiagnostic( + 
severity=ExecutionDiagnosticSeverity.Warning, + code="adapter_coverage.partial", + message=f"{backend_name} coverage for {requirement.capability.value()} is function- or plan-shape-specific", + target=Some(requirement.target), + )] + AdapterCoverageState.Uncovered => + return [ExecutionDiagnostic( + severity=ExecutionDiagnosticSeverity.Error, + code="adapter_coverage.uncovered", + message=f"{backend_name} does not currently cover {requirement.capability.value()} as an adapter guarantee", + target=Some(requirement.target), + )] + AdapterCoverageState.Unknown => + return [ExecutionDiagnostic( + severity=ExecutionDiagnosticSeverity.Warning, + code="adapter_coverage.unknown", + message=f"{backend_name} coverage for {requirement.capability.value()} has not been classified", + target=Some(requirement.target), + )] + + +def _observed_error_or_internal(error: Option[SessionError]) -> SessionError: + """Return an observed error or a defensive internal execution error for impossible observed-result shapes.""" + match error: + Some(err) => return err + None => + return SessionError( + kind=SessionErrorKind.BackendExecutionError, + message="observed execution ended without data or error", + ) + + def _active_state_from_session(session: Session) -> ActiveSessionState: """Convert public Session state into the lightweight active-session snapshot model.""" registrations = [ActiveRegistration(logical_name=registration.logical_name, source=registration.source.clone()) for registration in session._registrations] diff --git a/tests/test_execution_observations.incn b/tests/test_execution_observations.incn new file mode 100644 index 0000000..9315079 --- /dev/null +++ b/tests/test_execution_observations.incn @@ -0,0 +1,185 @@ +"""Tests for RFC 032 execution observations and RFC 033 adapter coverage records.""" + +from std.testing import assert_is_ok, fail, fail_t, parametrize +from rust::std::path import Path +from dataset import LazyFrame, lazy_frame_named_table +from evidence import ( 
+ AdapterCoverageState, + AdapterRequirement, + AdapterRequirementCapability, + AdapterRequirementGuarantee, + ExecutionObservationStatus, + ExecutionOperationKind, + SemanticTargetKind, +) +from session import Session, SessionErrorKind, csv_sink + + +@derive(Clone) +model Order: + """Fixture row shape used by observed session execution tests.""" + + id: int + + +const TESTS_DIR: str = "tests/" +const FIXTURE_DIR: str = TESTS_DIR + "fixtures/" +const TARGET_DIR: str = TESTS_DIR + "target/" +const ORDERS_CSV_FIXTURE: str = FIXTURE_DIR + "orders.csv" + + +def _target_uri(name: str) -> str: + """Return one test-target output URI.""" + return TARGET_DIR + name + + +def _capability_from_name(name: str) -> AdapterRequirementCapability: + """Return the adapter capability enum value for one parametrized test name.""" + match name: + "row_filter" => return AdapterRequirementCapability.RowFilter + "extension_function" => return AdapterRequirementCapability.ExtensionFunction + "lineage_preservation" => return AdapterRequirementCapability.LineagePreservation + "cryptographic_query_proof" => return AdapterRequirementCapability.CryptographicQueryProof + _ => return fail_t(f"unexpected capability: {name}") + + +def _coverage_state_from_name(name: str) -> AdapterCoverageState: + """Return the adapter coverage state enum value for one parametrized test name.""" + match name: + "covered" => return AdapterCoverageState.Covered + "partially_covered" => return AdapterCoverageState.PartiallyCovered + "uncovered" => return AdapterCoverageState.Uncovered + "unknown" => return AdapterCoverageState.Unknown + _ => return fail_t(f"unexpected coverage state: {name}") + + +def test_collect_observed__records_success_attempt_and_row_count() -> None: + """collect_observed should return a DataFrame plus a successful execution observation.""" + # -- Arrange -- + mut session = Session.default() + lazy: LazyFrame[Order] = assert_is_ok( + session.read_csv("orders", ORDERS_CSV_FIXTURE), + "orders CSV 
fixture should load", + ) + + # -- Act -- + observed = session.collect_observed(lazy) + + # -- Assert -- + match observed.data: + Some(df) => + observation = observed.observation + assert observation.operation == ExecutionOperationKind.Collect, "collect observation should record collect operation" + assert observation.status == ExecutionObservationStatus.Success, "collect observation should be successful" + assert observation.backend_name == "datafusion", "collect observation should record the selected backend" + assert len(observation.context_targets) == 1, "collect observation should expose session context" + assert observation.context_targets[0].kind == SemanticTargetKind.ClientSession, "session context should use the client-session target kind" + assert observation.plan_target.kind == SemanticTargetKind.Plan, "collect observation should anchor to a plan" + assert observation.plan_target.target_id.startswith("prism:store-"), "lazy collect observations should reuse Prism plan ids" + assert observation.attempt_target.kind == SemanticTargetKind.ExecutionAttempt, "collect observation should expose an execution-attempt target" + assert observation.started_at_unix_nanoseconds != 0, "collect observation should record a wall-clock start" + assert observation.ended_at_unix_nanoseconds != 0, "collect observation should record a wall-clock end" + assert observation.duration_nanoseconds >= 0, "collect observation should record monotonic elapsed time" + assert len(observation.trace_ids) == 0, "collect observation should expose empty trace IDs when no telemetry bridge is configured" + assert len(observation.diagnostics) == 0, "successful collect should not emit diagnostics" + match observation.adapter_version: + Some(_) => fail("default DataFusion observation should not invent an adapter version") + None => pass + match observation.requested_semantic_profile_id: + Some(_) => fail("collect should not invent a requested semantic profile") + None => pass + match 
observation.observed_semantic_profile_id: + Some(_) => fail("collect should not invent an observed semantic profile") + None => pass + match observation.byte_count: + Some(_) => fail("collect should not invent byte-count evidence") + None => pass + match observation.row_count: + Some(row_count) => + assert row_count == df.row_count(), "collect observation row count should match the DataFrame" + None => fail("successful collect should include row-count evidence") + None => fail("collect_observed should include data on success") + + +def test_collect_observed__records_failure_diagnostic() -> None: + """collect_observed should expose failure evidence when execution rejects the plan.""" + # -- Arrange -- + session = Session.default() + lazy: LazyFrame[Order] = lazy_frame_named_table("missing_observed_orders") + + # -- Act -- + observed = session.collect_observed(lazy) + + # -- Assert -- + match observed.data: + Some(_) => fail("failed collect_observed should not include a DataFrame") + None => pass + match observed.error: + Some(err) => + assert err.kind == SessionErrorKind.UnknownTable, "missing table should remain a typed session error" + None => fail("failed collect_observed should expose the SessionError") + observation = observed.observation + assert observation.operation == ExecutionOperationKind.Collect, "failed collect observation should record collect operation" + assert observation.status == ExecutionObservationStatus.Failure, "failed collect observation should record failure" + assert len(observation.diagnostics) == 1, "failed collect should include one structured diagnostic" + assert observation.diagnostics[0].code == "unknown_table", "diagnostic code should mirror the SessionError kind" + match observation.row_count: + Some(_) => fail("failed collect should not report row-count evidence") + None => pass + + +def test_write_observed__records_success_attempt() -> None: + """write_observed should write the sink and return successful execution observation 
evidence.""" + # -- Arrange -- + mut session = Session.default() + output_uri = _target_uri("observed_write_output.csv") + lazy: LazyFrame[Order] = assert_is_ok( + session.read_csv("orders", ORDERS_CSV_FIXTURE), + "orders CSV fixture should load", + ) + + # -- Act -- + observed = session.write_observed(lazy, csv_sink(output_uri)) + + # -- Assert -- + match observed.error: + Some(err) => fail(f"write_observed should succeed: {err.error_message()}") + None => pass + assert observed.observation.operation == ExecutionOperationKind.Write, "write observation should record write operation" + assert observed.observation.status == ExecutionObservationStatus.Success, "write observation should record success" + assert observed.observation.attempt_target.kind == SemanticTargetKind.ExecutionAttempt, "write observation should expose an execution-attempt target" + assert Path.new(output_uri).exists() is true, "write_observed should create the requested output artifact" + + +@parametrize("capability_name, expected_state_name, expected_diagnostics", [("row_filter", "covered", 0), ("extension_function", "partially_covered", 1), ("lineage_preservation", "uncovered", 1), ("cryptographic_query_proof", "unknown", 1)]) +def test_check_coverage__classifies_explicit_adapter_requirements( + capability_name: str, + expected_state_name: str, + expected_diagnostics: int, +) -> None: + """check_coverage should classify explicit requirements without fabricating plan requirements.""" + # -- Arrange -- + mut session = Session.default() + lazy: LazyFrame[Order] = assert_is_ok( + session.read_csv("orders", ORDERS_CSV_FIXTURE), + "orders CSV fixture should load", + ) + observed = session.collect_observed(lazy) + capability = _capability_from_name(capability_name) + expected_state = _coverage_state_from_name(expected_state_name) + requirement = AdapterRequirement( + requirement_id=f"req:{capability.value()}", + target=observed.observation.plan_target, + capability=capability, + 
guarantee=AdapterRequirementGuarantee.Required, + reason="test requirement", + evidence_refs=[], + ) + + # -- Act -- + records = session.check_coverage([requirement]) + + # -- Assert -- + assert len(records) == 1, "check_coverage should return one record per explicit requirement" + assert records[0].state == expected_state, "coverage state should match the conservative DataFusion mapping" + assert len(records[0].diagnostics) == expected_diagnostics, "non-covered coverage states should include diagnostics" From 7c2a23b29e25519a15e6a29ea8cde3665eb5901a Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Thu, 2 Jul 2026 13:29:59 +0200 Subject: [PATCH 02/12] fix - tighten execution observation review coverage (#66, #67) --- src/session/types.incn | 11 +++------- tests/test_execution_observations.incn | 30 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/session/types.incn b/src/session/types.incn index 0548692..3aefd5a 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -68,7 +68,7 @@ pub class SessionRegistration: pub class ObservedLazyFrame[T with Clone]: - """Observed `Session.execute(...)` result with data on success and diagnostics on failure.""" + """`Session.execute_observed(...)` result with data on success and diagnostics on failure.""" pub data: Option[LazyFrame[T]] pub observation: ExecutionObservation @@ -76,7 +76,7 @@ pub class ObservedLazyFrame[T with Clone]: pub class ObservedDataFrame[T with Clone]: - """Observed `Session.collect(...)` result with data on success and diagnostics on failure.""" + """`Session.collect_observed(...)` result with data on success and diagnostics on failure.""" pub data: Option[DataFrame[T]] pub observation: ExecutionObservation @@ -84,7 +84,7 @@ pub class ObservedDataFrame[T with Clone]: pub class ObservedWrite: - """Observed `Session.write(...)` result with diagnostics when the write fails.""" + """`Session.write_observed(...)` result with diagnostics when the write 
fails.""" pub observation: ExecutionObservation pub error: Option[SessionError] @@ -822,11 +822,6 @@ def _plan_from_bounded_dataset[T with Clone](data: BoundedDataSet[T]) -> Result[ return data.try_to_substrait_plan().map_err(_session_error_from_lowering_error) -def _write_result_from_backend_result(write_result: Result[None, BackendError]) -> Result[None, SessionError]: - """Normalize one backend write result into the Session-facing write result contract.""" - return write_result.map_err(_session_error_from_backend_error) - - def _invalid_registration_from_validation_error(err: ValidationError) -> SessionError: """Translate a validated input failure into a Session registration error.""" return invalid_registration(err.to_string()) diff --git a/tests/test_execution_observations.incn b/tests/test_execution_observations.incn index 9315079..fdf410c 100644 --- a/tests/test_execution_observations.incn +++ b/tests/test_execution_observations.incn @@ -53,6 +53,36 @@ def _coverage_state_from_name(name: str) -> AdapterCoverageState: _ => return fail_t(f"unexpected coverage state: {name}") +def test_execute_observed__records_success_attempt_without_materialization() -> None: + """execute_observed should validate and execute a lazy plan without inventing materialization facts.""" + # -- Arrange -- + mut session = Session.default() + lazy: LazyFrame[Order] = assert_is_ok( + session.read_csv("orders", ORDERS_CSV_FIXTURE), + "orders CSV fixture should load", + ) + + # -- Act -- + observed = session.execute_observed(lazy) + + # -- Assert -- + match observed.error: + Some(err) => fail(f"execute_observed should succeed: {err.error_message()}") + None => pass + match observed.data: + Some(frame) => + assert len(frame.planned_columns()) > 0, "execute_observed should preserve the deferred carrier" + None => fail("execute_observed should include the lazy frame on success") + observation = observed.observation + assert observation.operation == ExecutionOperationKind.Execute, "execute 
observation should record execute operation" + assert observation.status == ExecutionObservationStatus.Success, "execute observation should record success" + assert observation.plan_target.target_id.startswith("prism:store-"), "lazy execute observations should reuse Prism plan ids" + assert observation.attempt_target.kind == SemanticTargetKind.ExecutionAttempt, "execute observation should expose an execution-attempt target" + match observation.row_count: + Some(_) => fail("execute should not invent row-count evidence") + None => pass + + def test_collect_observed__records_success_attempt_and_row_count() -> None: """collect_observed should return a DataFrame plus a successful execution observation.""" # -- Arrange -- From 3c3076b95e6e677164e1e77d20939799fd161003 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Thu, 2 Jul 2026 13:37:06 +0200 Subject: [PATCH 03/12] docs - reflow shortcut prose --- .../language/reference/builders/aggregates.md | 12 ++-- docs/language/reference/execution_context.md | 18 ++--- docs/release_notes/v0_1.md | 27 ++------ docs/rfcs/016_core_aggregate_functions.md | 3 +- docs/rfcs/017_aggregate_modifiers.md | 6 +- docs/rfcs/023_approximate_sketch_functions.md | 55 +++++---------- docs/rfcs/025_typed_sketch_logical_values.md | 67 ++++++------------- .../026_semi_structured_variant_values.md | 14 ++-- docs/rfcs/032_execution_observations.md | 7 +- .../rfcs/033_adapter_requirements_coverage.md | 4 +- docs/whitepapers/inql_db.md | 24 ++----- 11 files changed, 69 insertions(+), 168 deletions(-) diff --git a/docs/language/reference/builders/aggregates.md b/docs/language/reference/builders/aggregates.md index 85c0560..8a73cac 100644 --- a/docs/language/reference/builders/aggregates.md +++ b/docs/language/reference/builders/aggregates.md @@ -59,14 +59,10 @@ grouped = orders.group_by([col("customer_id")]).agg([ - `count(...)` accepts zero or one expression; passing multiple expressions is an error. 
- `count_expr(expr)` is a compatibility spelling for `count(expr)`. - `count_distinct(expr)` is compatibility sugar for `count(expr).distinct()`. -- `count_if(predicate)` is compatibility sugar for `count().filter(predicate)`. Rows where the predicate is false or - null do not contribute to the aggregate. +- `count_if(predicate)` is compatibility sugar for `count().filter(predicate)`. Rows where the predicate is false or null do not contribute to the aggregate. - `sum`, `avg`, `min`, and `max` skip null values. They return backend-null results when no non-null input value exists. -- `approx_count_distinct` and `approx_percentile` are approximate aggregate choices. They allow aggregate-local filters - but reject extra `DISTINCT` and ordered input in the portable contract. -- `approx_percentile` output names include percentile and accuracy parameters so two percentile estimates over the same - expression do not collapse into the same output column name. -- `hll_sketch` and `hll_merge` are aggregate-shaped typed sketch helpers. They produce typed sketch state and preserve - sketch family, value domain, precision, and format metadata through the registry and Substrait boundary. +- `approx_count_distinct` and `approx_percentile` are approximate aggregate choices. They allow aggregate-local filters but reject extra `DISTINCT` and ordered input in the portable contract. +- `approx_percentile` output names include percentile and accuracy parameters so two percentile estimates over the same expression do not collapse into the same output column name. +- `hll_sketch` and `hll_merge` are aggregate-shaped typed sketch helpers. They produce typed sketch state and preserve sketch family, value domain, precision, and format metadata through the registry and Substrait boundary. - Unsupported aggregate modifiers fail at lowering or backend planning; they are not ignored. 
- Future `.column` sugar and scoped aggregate symbols should lower to this same surface rather than replacing its semantics. diff --git a/docs/language/reference/execution_context.md b/docs/language/reference/execution_context.md index 7eb4e96..b9d2f5a 100644 --- a/docs/language/reference/execution_context.md +++ b/docs/language/reference/execution_context.md @@ -47,8 +47,7 @@ All read APIs return `LazyFrame[T]`. They create deferred logical work; they do ## Execution observations -Observed execution methods preserve the ordinary session contracts while also returning runtime evidence. They are the -author-facing surface for RFC 032 execution observations. +Observed execution methods preserve the ordinary session contracts while also returning runtime evidence. They are the author-facing surface for RFC 032 execution observations. | API | Returns | Role | | -------------------------------- | ------------------------ | -------------------------------------------------------------------- | @@ -56,16 +55,9 @@ author-facing surface for RFC 032 execution observations. | `session.collect_observed(data)` | `ObservedDataFrame[T]` | Collect and return `data`, `observation`, and `error` fields | | `session.write_observed(data, target)` | `ObservedWrite` | Write and return `observation` plus an optional `error` | -The ordinary `execute`, `collect`, and `write` methods use the same execution path internally and keep returning -`Result[...]` values for compact application code. Use the observed variants when an audit, governance, debugging, or -verification flow needs a durable execution attempt record. +The ordinary `execute`, `collect`, and `write` methods use the same execution path internally and keep returning `Result[...]` values for compact application code. Use the observed variants when an audit, governance, debugging, or verification flow needs a durable execution attempt record. 
-An `ExecutionObservation` records the operation, status, backend name, optional adapter version, requested and observed -semantic profile IDs, plan target, execution-attempt target, client-session context target, Unix nanosecond wall-clock -start/end values from `std.datetime.runtime.SystemTime`, monotonic duration nanoseconds from -`std.datetime.runtime.Instant`, row count or byte count when materialization supplies them, optional trace IDs, -diagnostics, and linked coverage records when present. Observation records do not contain row payloads or backend logs -by default. +An `ExecutionObservation` records the operation, status, backend name, optional adapter version, requested and observed semantic profile IDs, plan target, execution-attempt target, client-session context target, Unix nanosecond wall-clock start/end values from `std.datetime.runtime.SystemTime`, monotonic duration nanoseconds from `std.datetime.runtime.Instant`, row count or byte count when materialization supplies them, optional trace IDs, diagnostics, and linked coverage records when present. Observation records do not contain row payloads or backend logs by default. ```incan observed = session.collect_observed(summary) @@ -90,9 +82,7 @@ These writes are Session-owned. They do not bypass the execution context even wh ## Adapter coverage -`session.check_coverage(requirements)` accepts explicit `AdapterRequirement` records and returns one -`AdapterCoverageRecord` per requirement. This is the current RFC 033 coverage surface. It does not infer requirements -from every plan shape yet; callers must pass the requirements they want evaluated. +`session.check_coverage(requirements)` accepts explicit `AdapterRequirement` records and returns one `AdapterCoverageRecord` per requirement. This is the current RFC 033 coverage surface. It does not infer requirements from every plan shape yet; callers must pass the requirements they want evaluated. 
Coverage states are conservative: diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index 3df56fd..92f42ce 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -9,11 +9,7 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Language:** Foundational InQL syntax and semantics (naming, query schema, layer boundaries). - **Carriers:** `DataSet[T]` hierarchy including bounded vs unbounded traits and concrete frame/stream types. - **Plans:** Apache Substrait as the logical interchange contract. -- **Authoring:** `LazyFrame` method chains are Prism-backed, and RFC 003 `query {}` blocks desugar into the same - carrier calls before lowering through the current carrier planning paths and Substrait boundary. Query blocks support - the brace spelling and expression-position `query:` spelling, including SELECT aliases, lateral alias reuse, grouped - aggregates, `SELECT DISTINCT`, post-SELECT filters, ordering, limits, inner and left joins, generator clauses, and - named window expressions. +- **Authoring:** `LazyFrame` method chains are Prism-backed, and RFC 003 `query {}` blocks desugar into the same carrier calls before lowering through the current carrier planning paths and Substrait boundary. Query blocks support the brace spelling and expression-position `query:` spelling, including SELECT aliases, lateral alias reuse, grouped aggregates, `SELECT DISTINCT`, post-SELECT filters, ordering, limits, inner and left joins, generator clauses, and named window expressions. - **Aggregates:** builder-based `col`, `sum`, `count`, `count_expr`, `count_distinct`, `count_if`, `avg`, `min`, and `max` helpers now lower grouped and global aggregates through Prism, Substrait, and Session execution. 
`count()` counts rows, `count(expr)` counts non-null expression values, `count_expr(expr)` remains a compatibility spelling, and the first aggregate modifier slice supports `DISTINCT` plus aggregate-local `FILTER` where valid. - **Scalar expressions:** RFC 012 unifies filter predicates, computed projection values, grouping keys, and aggregate inputs around one `ColumnExpr` surface with canonical `lit(...)` and typed literal helpers. - **Core scalar functions:** RFC 015 adds registry-backed scalar function applications and the first core helper slice for casts, comparisons, boolean logic, null/NaN predicates, arithmetic, conditionals, membership/range predicates, and ordering expressions. Primitive cast targets can use source-level type tokens such as `cast(col("amount_text"), float)`, while explicit string target spellings remain available for compatibility aliases such as `int64` and `float64`. Implemented helpers lower to Substrait IR through registry metadata, built-in Rex shapes, or structural sort-field lowering; DataFusion remains the first execution adapter rather than the semantic boundary. @@ -22,29 +18,16 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Generator functions:** RFC 021 adds registry-backed generator applications for `explode(...)`, `explode_outer(...)`, `posexplode(...)`, `posexplode_outer(...)`, `inline(...)`, `inline_outer(...)`, portable `flatten(...)`, and `stack(...)`. Generators remain relation-shaping operations applied with `generate(...)`; they preserve input columns, require explicit output aliases, lower through the current Substrait extension-relation gap encoding, and execute through the DataFusion Session adapter with concrete output-column materialization. 
- **Window functions:** RFC 019 adds `window()` specs, explicit row/range frame bounds, ranking and distribution helpers (`row_number`, `rank`, `dense_rank`, `percent_rank`, `cume_dist`, `ntile`), offset and value helpers (`lag`, `lead`, `first_value`, `last_value`, `nth_value`), and aggregate-over-window placement through `with_window_column(...)`. Portable window helpers require explicit ordering where appropriate, lower through Substrait `ConsistentPartitionWindowRel`, and execute through the DataFusion session adapter. - **Format functions:** RFC 022 adds scalar payload helpers for deterministic hashes (`md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `sha2`, `crc32`, and `xxhash64`), URL parsing/encoding/decoding, JSON validation/path/schema helpers, and CSV row/schema helpers. Format helpers lower through registry-owned Substrait metadata; the DataFusion adapter executes the full helper set with native functions where available and Incan-authored adapter callbacks for non-native helpers. -- **Approximate functions:** RFC 023 adds explicit approximate aggregate helpers for `approx_count_distinct(...)` and - `approx_percentile(...)`. They carry approximation policy in registry metadata, lower through InQL-owned Substrait - extension names, and keep DataFusion implementation-name rewrites inside the backend adapter. -- **Typed sketches:** RFC 025 adds typed HyperLogLog sketch logical values with `SketchLogicalType`, `SketchExpr`, - `hll_type(...)`, `sketch_col(...)`, `hll_sketch(...)`, `hll_merge(...)`, `hll_estimate(...)`, - `hll_serialize(...)`, and `hll_deserialize(...)`. Sketch metadata remains InQL-owned through registry and Substrait - options; DataFusion reports a backend planning diagnostic because it has no sketch runtime implementation. 
-- **Typed variants:** RFC 026 adds typed semi-structured variant logical values with `VariantLogicalType`, `VariantExpr`, - `variant_type(...)`, `variant_col(...)`, `variant_value(...)`, `parse_variant_json(...)`, - `try_parse_variant_json(...)`, `variant_get(...)`, `typeof(...)`, and kind predicates for null, boolean, integer, - float, string, timestamp, array, and object values. Variant parse helpers accept string value-or-column inputs, - variant metadata remains InQL-owned through registry and Substrait options, and DataFusion reports a backend planning - diagnostic because it has no variant runtime implementation. +- **Approximate functions:** RFC 023 adds explicit approximate aggregate helpers for `approx_count_distinct(...)` and `approx_percentile(...)`. They carry approximation policy in registry metadata, lower through InQL-owned Substrait extension names, and keep DataFusion implementation-name rewrites inside the backend adapter. +- **Typed sketches:** RFC 025 adds typed HyperLogLog sketch logical values with `SketchLogicalType`, `SketchExpr`, `hll_type(...)`, `sketch_col(...)`, `hll_sketch(...)`, `hll_merge(...)`, `hll_estimate(...)`, `hll_serialize(...)`, and `hll_deserialize(...)`. Sketch metadata remains InQL-owned through registry and Substrait options; DataFusion reports a backend planning diagnostic because it has no sketch runtime implementation. +- **Typed variants:** RFC 026 adds typed semi-structured variant logical values with `VariantLogicalType`, `VariantExpr`, `variant_type(...)`, `variant_col(...)`, `variant_value(...)`, `parse_variant_json(...)`, `try_parse_variant_json(...)`, `variant_get(...)`, `typeof(...)`, and kind predicates for null, boolean, integer, float, string, timestamp, array, and object values. 
Variant parse helpers accept string value-or-column inputs, variant metadata remains InQL-owned through registry and Substrait options, and DataFusion reports a backend planning diagnostic because it has no variant runtime implementation. - **Function registry:** RFC 014 adds declaration-site registry decorators for the current public helper surface, including stable function references, checked signature projection, lifecycle metadata, behavior categories, alias policy, Substrait mapping categories, and checked API metadata drift validation. - **Function extension policy:** InQL RFC 024 policy metadata now distinguishes portable core functions, namespaced extension-only functions, opt-in compatibility aliases, engine-specific functions, and rejected compatibility requests without adding an extension plugin system or backend-owned semantics. - **Projection:** builder-based `with_column`, `add`, `mul`, and literal expression helpers now lower derived columns through Prism, Substrait, and Session execution. - **Substrait internals:** RFC 002 helpers are now split into focused owner modules for relation building, plan assembly, inspection, schema registry, extension bookkeeping, and expression lowering instead of one `substrait.plan` godmodule. - **Prism:** `LazyFrame` lowering applies safe canonical rewrites (`Filter(true)` elimination and adjacent `Limit`/`Project`/`OrderBy` collapse) before RFC 002 plan emission. - **Inspection:** RFCs 028–031 now have a first local evidence spine for Prism-backed `LazyFrame` plans. `inspect_plan(...)` and `inspect_lineage(...)` expose semantic targets, output schema, authored and rewritten Prism node records, lineage edges, artifact-family summaries, metadata attachment records, diagnostics shape, and explicit unsupported-evidence markers without executing or backend-binding the plan. 
-- **Execution observations and adapter coverage:** RFC 032 adds observed `Session` variants for `execute`, `collect`, - and `write` so callers can capture structured runtime evidence for success and failure attempts. RFC 033 adds adapter - requirement and coverage records plus `Session.check_coverage(requirements)` for explicit capability checks with - covered, partially covered, uncovered, and unknown states. +- **Execution observations and adapter coverage:** RFC 032 adds observed `Session` variants for `execute`, `collect`, and `write` so callers can capture structured runtime evidence for success and failure attempts. RFC 033 adds adapter requirement and coverage records plus `Session.check_coverage(requirements)` for explicit capability checks with covered, partially covered, uncovered, and unknown states. - **Execution:** Session-oriented read, execute, and write (reference backend per RFC 004), with `collect(...)` now producing structured `DataFrame` materialization metadata plus preview text instead of treating rendered text as the canonical contract. Session execution dispatch now routes through a backend adapter boundary over Substrait plans; DataFusion remains the first adapter rather than being encoded directly into Session state. - **Session API:** `Session.write(data, target)` now accepts typed sink descriptors such as `csv_sink(uri)` and `parquet_sink(uri)`, while the file-specific `write_csv(...)` and `write_parquet(...)` helpers remain as convenience methods. - **Documentation:** Current package behavior is documented under `docs/language/`, while RFCs remain design records rather than implementation diaries. 
diff --git a/docs/rfcs/016_core_aggregate_functions.md b/docs/rfcs/016_core_aggregate_functions.md index b8a031c..0935bda 100644 --- a/docs/rfcs/016_core_aggregate_functions.md +++ b/docs/rfcs/016_core_aggregate_functions.md @@ -109,8 +109,7 @@ Existing `sum` and `count` helpers should be treated as compatibility-compatible - Null and empty-input behavior can surprise authors coming from APIs that default missing sums to zero. - Result type policy for numeric aggregates is a cross-cutting dependency on scalar numeric types. -- Supporting both `count()` and `count(expr)` makes one helper carry row-count and expression-count semantics, so - tests must keep both call shapes covered. +- Supporting both `count()` and `count(expr)` makes one helper carry row-count and expression-count semantics, so tests must keep both call shapes covered. ## Layers affected diff --git a/docs/rfcs/017_aggregate_modifiers.md b/docs/rfcs/017_aggregate_modifiers.md index 352b64c..d10947d 100644 --- a/docs/rfcs/017_aggregate_modifiers.md +++ b/docs/rfcs/017_aggregate_modifiers.md @@ -115,7 +115,5 @@ Existing aggregate helpers remain valid. New compatibility helpers such as `coun ### Resolved -- `count_if(predicate)` follows aggregate `FILTER` semantics: rows where the predicate is false or null do not - contribute to the aggregate. -- The initial modifier contract records ordered aggregate input but no current core aggregate allows it. Ordered input - is rejected explicitly until an order-sensitive aggregate such as `listagg` or ordered percentile functions lands. +- `count_if(predicate)` follows aggregate `FILTER` semantics: rows where the predicate is false or null do not contribute to the aggregate. +- The initial modifier contract records ordered aggregate input but no current core aggregate allows it. Ordered input is rejected explicitly until an order-sensitive aggregate such as `listagg` or ordered percentile functions lands. 
diff --git a/docs/rfcs/023_approximate_sketch_functions.md b/docs/rfcs/023_approximate_sketch_functions.md index 47a6c17..b42104b 100644 --- a/docs/rfcs/023_approximate_sketch_functions.md +++ b/docs/rfcs/023_approximate_sketch_functions.md @@ -96,10 +96,8 @@ This RFC is additive. Existing exact aggregates must not change semantics when a ## Alternatives considered - **Treat sketches as binary values.** Rejected because it loses type safety and merge compatibility. -- **Expose Spark sketch names directly as core functions.** Rejected because many sketch families are specialist - extensions and require explicit state contracts. -- **Let backends choose approximate execution for exact aggregates.** Rejected because approximate results must be an - author-visible choice. +- **Expose Spark sketch names directly as core functions.** Rejected because many sketch families are specialist extensions and require explicit state contracts. +- **Let backends choose approximate execution for exact aggregates.** Rejected because approximate results must be an author-visible choice. ## Drawbacks @@ -110,51 +108,35 @@ This RFC is additive. Existing exact aggregates must not change semantics when a ## Layers affected - **InQL specification** — approximate and sketch functions must be separate from exact aggregate semantics. -- **InQL library package** — public helpers should expose approximate aggregate and sketch-state types only when - contracts are explicit. +- **InQL library package** — public helpers should expose approximate aggregate and sketch-state types only when contracts are explicit. - **Incan compiler** — typechecking must validate sketch family compatibility and aggregate positions. -- **Execution / interchange** — Prism and Substrait lowering must preserve approximate parameters, sketch state types, - and merge semantics or reject unsupported functions. 
+- **Execution / interchange** — Prism and Substrait lowering must preserve approximate parameters, sketch state types, and merge semantics or reject unsupported functions. - **Documentation** — docs must label approximate functions clearly and explain accuracy parameters. ## Design Decisions ### Resolved -- `approx_count_distinct(expr)` is an aggregate measure, not a scalar expression, and its helper name makes approximate - execution an explicit author choice. -- `approx_count_distinct` is registered as approximate metadata with HyperLogLog-family semantics, mergeability, and an - approximate cardinality-result interpretation. -- `approx_count_distinct` follows InQL's registered unary Substrait extension mapping. It does not expose a - user-tunable relative-error parameter because the portable mapping does not carry one. -- `approx_percentile(expr, percentile, accuracy=10000)` is an aggregate measure with t-digest-family approximation - metadata. The helper validates literal percentile and accuracy arguments before building the measure. -- `approx_percentile` output names include both percentile and accuracy parameters, so multiple percentile estimates over - the same input expression remain distinct through Prism and Substrait inspection. -- DataFusion's implementation is named `approx_distinct`; InQL keeps the InQL Substrait function name in emitted - function metadata and rewrites only the DataFusion consumer declaration at the backend adapter boundary. -- DataFusion's approximate percentile implementation is named `approx_percentile_cont`; InQL uses the same adapter-only - declaration rewrite and keeps `approx_percentile` as the portable Substrait extension name. -- `approx_count_distinct` allows aggregate-local filters and rejects an extra `distinct()` modifier because distinct - estimation is already the helper's semantics. 
-- `approx_percentile` allows aggregate-local filters and rejects `distinct()` and ordered input because those modifiers - are not part of the portable percentile aggregate contract. -- Sketch-state construction, merge, estimate, serialization, and deserialization helpers are delegated to InQL RFC 025. - They are not exposed as lowerable RFC 023 functions because exposing those helpers as ordinary strings or binary values - would violate the compatibility rules this RFC is meant to protect. +- `approx_count_distinct(expr)` is an aggregate measure, not a scalar expression, and its helper name makes approximate execution an explicit author choice. +- `approx_count_distinct` is registered as approximate metadata with HyperLogLog-family semantics, mergeability, and an approximate cardinality-result interpretation. +- `approx_count_distinct` follows InQL's registered unary Substrait extension mapping. It does not expose a user-tunable relative-error parameter because the portable mapping does not carry one. +- `approx_percentile(expr, percentile, accuracy=10000)` is an aggregate measure with t-digest-family approximation metadata. The helper validates literal percentile and accuracy arguments before building the measure. +- `approx_percentile` output names include both percentile and accuracy parameters, so multiple percentile estimates over the same input expression remain distinct through Prism and Substrait inspection. +- DataFusion's implementation is named `approx_distinct`; InQL keeps the InQL Substrait function name in emitted function metadata and rewrites only the DataFusion consumer declaration at the backend adapter boundary. +- DataFusion's approximate percentile implementation is named `approx_percentile_cont`; InQL uses the same adapter-only declaration rewrite and keeps `approx_percentile` as the portable Substrait extension name. 
+- `approx_count_distinct` allows aggregate-local filters and rejects an extra `distinct()` modifier because distinct estimation is already the helper's semantics. +- `approx_percentile` allows aggregate-local filters and rejects `distinct()` and ordered input because those modifiers are not part of the portable percentile aggregate contract. +- Sketch-state construction, merge, estimate, serialization, and deserialization helpers are delegated to InQL RFC 025. They are not exposed as lowerable RFC 023 functions because exposing those helpers as ordinary strings or binary values would violate the compatibility rules this RFC is meant to protect. ### Remaining -- InQL RFC 025 defines the follow-up design space for typed sketch state, portable serialization formats, and named - merge/estimate helpers. That work must not retrofit RFC 023 by treating untyped binary payloads as sketch values. -- A future backend-capability layer may expose backend-specific approximation knobs as engine-specific functions or - options when they cannot be represented by the portable helper signatures. +- InQL RFC 025 defines the follow-up design space for typed sketch state, portable serialization formats, and named merge/estimate helpers. That work must not retrofit RFC 023 by treating untyped binary payloads as sketch values. +- A future backend-capability layer may expose backend-specific approximation knobs as engine-specific functions or options when they cannot be represented by the portable helper signatures. ## Implementation Plan 1. Add registry approximation metadata with exact-helper defaults. -2. Add `approx_count_distinct(expr)` and `approx_percentile(expr, percentile, accuracy=10000)` under a logical approximate - function family. +2. Add `approx_count_distinct(expr)` and `approx_percentile(expr, percentile, accuracy=10000)` under a logical approximate function family. 3. Add stable Substrait anchors and keep emitted function metadata on InQL extension names. 4. 
Add DataFusion adapter-local declaration rewrites to the first backend's implementation names. 5. Add focused helper, registry, Substrait lowering, Prism, and DataFusion-backed session tests with materialized output. @@ -171,5 +153,4 @@ This RFC is additive. Existing exact aggregates must not change semantics when a - [x] DataFusion adapter-local approximate aggregate mappings added. - [x] Focused helper, registry, Substrait lowering, Prism, and DataFusion-backed session tests added. - [x] User-facing approximate-function docs, aggregate-builder docs, and release notes added. -- [x] Sketch-state logical types and sketch merge/estimate/serialization helpers delegated to InQL RFC 025 rather than - exposed as untyped lowerable functions. +- [x] Sketch-state logical types and sketch merge/estimate/serialization helpers delegated to InQL RFC 025 rather than exposed as untyped lowerable functions. diff --git a/docs/rfcs/025_typed_sketch_logical_values.md b/docs/rfcs/025_typed_sketch_logical_values.md index 81b2ffc..6ac3930 100644 --- a/docs/rfcs/025_typed_sketch_logical_values.md +++ b/docs/rfcs/025_typed_sketch_logical_values.md @@ -23,10 +23,8 @@ This RFC defines typed sketch logical values for InQL. Sketch helpers must not b 1. A sketch value has a logical type, even if its runtime representation is opaque to InQL. 2. Sketch construction is aggregate-shaped when it summarizes many input rows into one sketch state. 3. Sketch merge is valid only for compatible sketch values. -4. Sketch estimate, quantile, serialization, and deserialization helpers must preserve sketch-family semantics instead of - treating sketch payloads as generic bytes. -5. Backend adapters may implement, emulate, or reject sketch operations, but they must not redefine ordinary binary or - string expressions as sketch states. +4. Sketch estimate, quantile, serialization, and deserialization helpers must preserve sketch-family semantics instead of treating sketch payloads as generic bytes. +5. 
Backend adapters may implement, emulate, or reject sketch operations, but they must not redefine ordinary binary or string expressions as sketch states. ## Motivation @@ -38,11 +36,9 @@ If InQL accepts untyped sketch blobs, it cannot reject invalid operations such a - Define sketch logical values as first-class typed values. - Define the metadata required to compare sketch compatibility before execution. -- Define how sketch construction, merge, estimate, serialization, and deserialization helpers interact with the function - registry. +- Define how sketch construction, merge, estimate, serialization, and deserialization helpers interact with the function registry. - Keep sketch state backend-neutral in InQL semantics while allowing backend-specific execution support. -- Provide a design home for HyperLogLog, KLL, theta, count-min, and bitmap-style sketch families without forcing all - families into the first implementation. +- Provide a design home for HyperLogLog, KLL, theta, count-min, and bitmap-style sketch families without forcing all families into the first implementation. ## Non-Goals @@ -82,8 +78,7 @@ A sketch logical value must carry at least: - sketch family identity, such as HyperLogLog, KLL, theta, count-min, or bitmap; - input value domain, such as string identifiers, integer identifiers, numeric values, or categorical values; -- family parameters that affect merge compatibility, such as precision, accuracy, nominal entries, width/depth, seed, or - ordering policy; +- family parameters that affect merge compatibility, such as precision, accuracy, nominal entries, width/depth, seed, or ordering policy; - format identity and version when the value can be serialized; - nullability and ordinary column-position metadata needed by existing InQL expression and relation surfaces. @@ -127,15 +122,12 @@ The implemented first family is HyperLogLog: - `sketch_value(...)` accepts the standard scalar value-or-column input surface before attaching sketch metadata. 
- `hll_sketch(...)` is an aggregate measure that produces typed HyperLogLog state from scalar values or expressions. - `hll_merge(...)` is an aggregate measure over existing typed HyperLogLog state. -- `hll_estimate(...)`, `hll_serialize(...)`, and `hll_deserialize(...)` are scalar helpers over typed sketch state or - explicit serialized payloads. +- `hll_estimate(...)`, `hll_serialize(...)`, and `hll_deserialize(...)` are scalar helpers over typed sketch state or explicit serialized payloads. - `hll_deserialize(...)` accepts the standard string value-or-column input surface for explicit payloads. -- The public `SketchFamily` API exposes HyperLogLog in this implementation; additional families should add their own - family-specific type builders, serialization formats, registry policies, and tests rather than sharing HLL metadata. +- The public `SketchFamily` API exposes HyperLogLog in this implementation; additional families should add their own family-specific type builders, serialization formats, registry policies, and tests rather than sharing HLL metadata. - Function registry entries expose typed sketch policy metadata and Substrait extension mappings. - Substrait lowering carries sketch family, value domain, precision, and format in function options. -- The DataFusion adapter rejects typed sketch execution with a backend planning diagnostic. This is an adapter - capability boundary, not an InQL semantic limitation. +- The DataFusion adapter rejects typed sketch execution with a backend planning diagnostic. This is an adapter capability boundary, not an InQL semantic limitation. ### Compatibility / migration @@ -143,48 +135,33 @@ This RFC is additive. RFC 023 approximate scalar-result aggregates remain valid. ## Alternatives considered -- **Treat sketches as bytes.** Rejected because it prevents typechecking merge compatibility and moves semantic errors - into backend runtime failures. 
-- **Expose only scalar approximate aggregates.** Rejected as a complete long-term answer because stored and mergeable - sketches are a legitimate analytics need, especially for pre-aggregated data. -- **Copy one backend's sketch catalog directly.** Rejected because InQL needs backend-neutral semantics and capability - reporting. -- **Make sketch values ordinary structs.** Rejected unless the struct carries a distinct logical type; ordinary structs - do not by themselves encode family-specific compatibility rules. +- **Treat sketches as bytes.** Rejected because it prevents typechecking merge compatibility and moves semantic errors into backend runtime failures. +- **Expose only scalar approximate aggregates.** Rejected as a complete long-term answer because stored and mergeable sketches are a legitimate analytics need, especially for pre-aggregated data. +- **Copy one backend's sketch catalog directly.** Rejected because InQL needs backend-neutral semantics and capability reporting. +- **Make sketch values ordinary structs.** Rejected unless the struct carries a distinct logical type; ordinary structs do not by themselves encode family-specific compatibility rules. ## Drawbacks - Sketch logical values add a new kind of type metadata to expression and relation planning. - Cross-backend support will be uneven because sketch algorithms and serialized formats vary. -- Documentation must be careful not to overpromise statistical guarantees that depend on algorithm parameters and backend - implementations. +- Documentation must be careful not to overpromise statistical guarantees that depend on algorithm parameters and backend implementations. - Serialization compatibility can become a long-term maintenance burden if exposed too early. ## Layers affected -- **InQL specification** — sketch values must be distinguished from ordinary scalar, binary, string, map, and struct - values. 
-- **InQL library package** — public sketch helpers must register family, domain, parameter, merge, estimate, and - serialization metadata. -- **Incan compiler** — typechecking and diagnostics may need enough type information to represent sketch-valued - expressions, reject invalid operations, and preserve metadata through public helper signatures. -- **Execution / interchange** — Substrait lowering and backend adapters must preserve sketch logical type identity or - reject unsupported sketch operations before execution. -- **Documentation** — function references and RFCs must present sketch helpers as typed approximate state, not as - backend-specific blobs. +- **InQL specification** — sketch values must be distinguished from ordinary scalar, binary, string, map, and struct values. +- **InQL library package** — public sketch helpers must register family, domain, parameter, merge, estimate, and serialization metadata. +- **Incan compiler** — typechecking and diagnostics may need enough type information to represent sketch-valued expressions, reject invalid operations, and preserve metadata through public helper signatures. +- **Execution / interchange** — Substrait lowering and backend adapters must preserve sketch logical type identity or reject unsupported sketch operations before execution. +- **Documentation** — function references and RFCs must present sketch helpers as typed approximate state, not as backend-specific blobs. ## Design decisions ### Resolved -- The first public type spelling is explicit library metadata: `SketchLogicalType`, `SketchExpr`, `hll_type(...)`, - `sketch_value(...)`, and `sketch_col(...)`. -- Public sketch helpers use the same typed value-or-column input conventions as the post-RFC018 scalar catalog: source - values are accepted as primitive values or scalar expressions, while serialized sketch payloads use the string - value-or-column surface. 
+- The first public type spelling is explicit library metadata: `SketchLogicalType`, `SketchExpr`, `hll_type(...)`, `sketch_value(...)`, and `sketch_col(...)`. +- Public sketch helpers use the same typed value-or-column input conventions as the post-RFC018 scalar catalog: source values are accepted as primitive values or scalar expressions, while serialized sketch payloads use the string value-or-column surface. - HyperLogLog is the first implemented sketch family because it cleanly extends the distinct-count approximation surface. - HyperLogLog merge compatibility is defined by family, value domain, precision, and serialization format. -- Serialized sketch format identity is explicit and portable at the InQL logical layer. RFC 025 defines - `inql_hll_v1` as the first format identity without promising bit-for-bit compatibility with every backend runtime. -- Sketch values may be represented in authoring expressions today through `SketchExpr`. Broader table-schema logical - typing is left to RFC 026 and later schema work rather than hiding sketch state as strings or bytes. +- Serialized sketch format identity is explicit and portable at the InQL logical layer. RFC 025 defines `inql_hll_v1` as the first format identity without promising bit-for-bit compatibility with every backend runtime. +- Sketch values may be represented in authoring expressions today through `SketchExpr`. Broader table-schema logical typing is left to RFC 026 and later schema work rather than hiding sketch state as strings or bytes. diff --git a/docs/rfcs/026_semi_structured_variant_values.md b/docs/rfcs/026_semi_structured_variant_values.md index 7488916..b095214 100644 --- a/docs/rfcs/026_semi_structured_variant_values.md +++ b/docs/rfcs/026_semi_structured_variant_values.md @@ -119,9 +119,7 @@ The implemented public model is: - `VariantKind`, `VariantEncoding`, `VariantParseMode`, `VariantLogicalType`, and `VariantExpr`. 
- Metadata helpers: `variant_type(...)`, `variant_col(...)`, `variant_value(...)`, and `variant_types_compatible(...)`. - Parse/access helpers: `parse_variant_json(...)`, `try_parse_variant_json(...)`, and `variant_get(...)`. -- Inspection helpers: `typeof(...)` returns `StringColumnExpr`; predicates such as `is_null_value(...)`, - `is_boolean(...)`, `is_integer(...)`, `is_float(...)`, `is_string(...)`, `is_timestamp(...)`, `is_array(...)`, and - `is_object(...)` return `BoolColumnExpr`. +- Inspection helpers: `typeof(...)` returns `StringColumnExpr`; predicates such as `is_null_value(...)`, `is_boolean(...)`, `is_integer(...)`, `is_float(...)`, `is_string(...)`, `is_timestamp(...)`, `is_array(...)`, and `is_object(...)` return `BoolColumnExpr`. Each public helper is registry-backed with explicit variant policy metadata. Variant helpers lower through InQL-owned Substrait extension mappings and carry variant kind, encoding, and parse mode as scalar function options where needed. @@ -150,10 +148,6 @@ Each public helper is registry-backed with explicit variant policy metadata. Var ## Design Decisions - The public type spellings are `VariantLogicalType` and `VariantExpr`. -- Variant-returning JSON parsing uses new helper names, `parse_variant_json(...)` and `try_parse_variant_json(...)`, so - RFC 022 string-backed JSON helpers remain stable. -- The shipped kind set is the JSON-compatible family plus timestamp: null, boolean, integer, float, string, timestamp, - array, and object. Decimal, binary, date, and interval are not part of this RFC's public variant-kind contract. -- Variant path access uses `$`-rooted literal paths or string-producing dynamic path expressions through - `variant_get(...)`; missing-path runtime behavior is an execution contract for adapters and must not collapse SQL null - and variant null. 
+- Variant-returning JSON parsing uses new helper names, `parse_variant_json(...)` and `try_parse_variant_json(...)`, so RFC 022 string-backed JSON helpers remain stable. +- The shipped kind set is the JSON-compatible family plus timestamp: null, boolean, integer, float, string, timestamp, array, and object. Decimal, binary, date, and interval are not part of this RFC's public variant-kind contract. +- Variant path access uses `$`-rooted literal paths or string-producing dynamic path expressions through `variant_get(...)`; missing-path runtime behavior is an execution contract for adapters and must not collapse SQL null and variant null. diff --git a/docs/rfcs/032_execution_observations.md b/docs/rfcs/032_execution_observations.md index db23591..608841f 100644 --- a/docs/rfcs/032_execution_observations.md +++ b/docs/rfcs/032_execution_observations.md @@ -81,12 +81,7 @@ Quality observations, adapter coverage records, semantic profile records, and ev Existing session execution remains valid. Implementations may initially emit partial observations, but unsupported fields must be explicit rather than silently omitted when consumers request them. -The first implementation adds observed variants for `execute`, `collect`, and `write` while preserving the ordinary -`Result[...]`-returning session APIs. Observed variants return success and failure observations. Wall-clock fields are -Unix nanoseconds from Incan's `std.datetime.runtime.SystemTime`, and duration uses monotonic elapsed nanoseconds from -`std.datetime.runtime.Instant`. The model exposes adapter version, requested and observed semantic profile IDs, byte -count, and trace IDs explicitly; the initial DataFusion path reports `None` or empty values for those fields rather than -fabricating unavailable evidence. +The first implementation adds observed variants for `execute`, `collect`, and `write` while preserving the ordinary `Result[...]`-returning session APIs. 
Observed variants return success and failure observations. Wall-clock fields are Unix nanoseconds from Incan's `std.datetime.runtime.SystemTime`, and duration uses monotonic elapsed nanoseconds from `std.datetime.runtime.Instant`. The model exposes adapter version, requested and observed semantic profile IDs, byte count, and trace IDs explicitly; the initial DataFusion path reports `None` or empty values for those fields rather than fabricating unavailable evidence. ## Alternatives considered diff --git a/docs/rfcs/033_adapter_requirements_coverage.md b/docs/rfcs/033_adapter_requirements_coverage.md index f78e8fc..9401e05 100644 --- a/docs/rfcs/033_adapter_requirements_coverage.md +++ b/docs/rfcs/033_adapter_requirements_coverage.md @@ -104,9 +104,7 @@ Function registry entries, semi-structured functions, extensions, quality assert Existing adapters may initially report unknown coverage for capabilities they do not declare. Consumers must distinguish unknown from covered. -The first implementation provides the adapter requirement and coverage record vocabulary plus -`Session.check_coverage(requirements)`. Requirement inference from arbitrary inspection records remains part of the -remaining RFC 033 work; the current API evaluates requirements that callers pass explicitly. +The first implementation provides the adapter requirement and coverage record vocabulary plus `Session.check_coverage(requirements)`. Requirement inference from arbitrary inspection records remains part of the remaining RFC 033 work; the current API evaluates requirements that callers pass explicitly. ## Alternatives considered diff --git a/docs/whitepapers/inql_db.md b/docs/whitepapers/inql_db.md index 1880feb..70f92a1 100644 --- a/docs/whitepapers/inql_db.md +++ b/docs/whitepapers/inql_db.md @@ -363,11 +363,9 @@ This should lower through InQL planning into an InQL-DB physical plan that can c Vector search is not enough for agentic retrieval. 
-InQL-DB should support governed RAG stores as a first-class data pattern: retrieval tables where every returned item -carries provenance, approval state, corpus version, retrieval evidence, and policy compatibility metadata. +InQL-DB should support governed RAG stores as a first-class data pattern: retrieval tables where every returned item carries provenance, approval state, corpus version, retrieval evidence, and policy compatibility metadata. -This matters for advisory systems such as Hees.ai, where the retrieval layer is part of the safety model. A retrieved -entry is not merely text. It is an approved evidence unit with constraints. +This matters for advisory systems such as Hees.ai, where the retrieval layer is part of the safety model. A retrieved entry is not merely text. It is an approved evidence unit with constraints. A vector index answers: @@ -455,8 +453,7 @@ This makes RAG auditable rather than merely semantic. ## HyperQuant evidence-provider ledger -HyperQuant should be treated as an evidence-provider implementation behind InQL-DB and Hees.ai storage contracts, not as -the semantic owner of retrieval behavior. +HyperQuant should be treated as an evidence-provider implementation behind InQL-DB and Hees.ai storage contracts, not as the semantic owner of retrieval behavior. The storage problem is not only vector search. HyperQuant needs a durable audit ledger for evidence-provider runs: @@ -469,9 +466,7 @@ query + package/policy/corpus/index context -> provider run id and fingerprint ``` -This distinction matters because governed systems must explain more than the nearest neighbors. They must explain what -was considered, what was eligible, what was rejected, and which package, policy, corpus, index, and provider versions -controlled the run. +This distinction matters because governed systems must explain more than the nearest neighbors. 
They must explain what was considered, what was eligible, what was rejected, and which package, policy, corpus, index, and provider versions controlled the run. InQL-DB should support these logical records: @@ -521,9 +516,7 @@ model EvidenceProviderFingerprint: result_fingerprint: str ``` -Eligible evidence and rejected evidence may be represented as filtered views over `EvidenceCandidate`, or as separate -physical tables if the storage engine needs different retention or indexing behavior. The important contract is that -rejected evidence is first-class data, not a log message. +Eligible evidence and rejected evidence may be represented as filtered views over `EvidenceCandidate`, or as separate physical tables if the storage engine needs different retention or indexing behavior. The important contract is that rejected evidence is first-class data, not a log message. For federated domain runtimes, InQL-DB should also support a run-level grouping record: @@ -545,9 +538,7 @@ The governing rule is: Vectors nominate evidence. They do not authorize evidence. ``` -Vector similarity, quantized indexes, and approximate-nearest-neighbor search can propose candidates. Package, policy, -corpus, authority, and admissibility rules decide whether those candidates may become evidence. InQL-DB must preserve -that boundary in storage so later inspection can distinguish retrieval mechanics from governance decisions. +Vector similarity, quantized indexes, and approximate-nearest-neighbor search can propose candidates. Package, policy, corpus, authority, and admissibility rules decide whether those candidates may become evidence. InQL-DB must preserve that boundary in storage so later inspection can distinguish retrieval mechanics from governance decisions. ## CLI @@ -630,8 +621,7 @@ Recommended RFC sequence: Define vector physical type, distance functions, index files, transactionally visible indexes, and query lowering. 6. 
**HyperQuant evidence-provider ledger RFC** - Define evidence-provider runs, candidate evidence, rejected evidence, provider fingerprints, index provenance, - federated evidence-run grouping, and replay/debug contracts. + Define evidence-provider runs, candidate evidence, rejected evidence, provider fingerprints, index provenance, federated evidence-run grouping, and replay/debug contracts. 7. **CLI RFC** Define `inql db` commands and diagnostics. From d5a90fd58a8260e615b62ea43158cd4626b36dbb Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 19:31:32 +0200 Subject: [PATCH 04/12] docs - complete execution context reference --- docs/language/reference/execution_context.md | 130 ++++++++++++++++--- 1 file changed, 113 insertions(+), 17 deletions(-) diff --git a/docs/language/reference/execution_context.md b/docs/language/reference/execution_context.md index b9d2f5a..a753bf1 100644 --- a/docs/language/reference/execution_context.md +++ b/docs/language/reference/execution_context.md @@ -47,17 +47,64 @@ All read APIs return `LazyFrame[T]`. They create deferred logical work; they do ## Execution observations -Observed execution methods preserve the ordinary session contracts while also returning runtime evidence. They are the author-facing surface for RFC 032 execution observations. 
- -| API | Returns | Role | -| -------------------------------- | ------------------------ | -------------------------------------------------------------------- | -| `session.execute_observed(data)` | `ObservedLazyFrame[T]` | Execute and return `data`, `observation`, and `error` fields | -| `session.collect_observed(data)` | `ObservedDataFrame[T]` | Collect and return `data`, `observation`, and `error` fields | -| `session.write_observed(data, target)` | `ObservedWrite` | Write and return `observation` plus an optional `error` | - -The ordinary `execute`, `collect`, and `write` methods use the same execution path internally and keep returning `Result[...]` values for compact application code. Use the observed variants when an audit, governance, debugging, or verification flow needs a durable execution attempt record. - -An `ExecutionObservation` records the operation, status, backend name, optional adapter version, requested and observed semantic profile IDs, plan target, execution-attempt target, client-session context target, Unix nanosecond wall-clock start/end values from `std.datetime.runtime.SystemTime`, monotonic duration nanoseconds from `std.datetime.runtime.Instant`, row count or byte count when materialization supplies them, optional trace IDs, diagnostics, and linked coverage records when present. Observation records do not contain row payloads or backend logs by default. +Observed execution methods preserve the ordinary session contracts while also returning runtime evidence. The ordinary `execute`, `collect`, and `write` methods use the same execution path internally and keep returning `Result[...]` values for compact application code. 
+ +| API | Input | Returns | Success data | Failure data | +| ---------------------------------------- | ------------------- | ---------------------- | ---------------------------------- | --------------------------------- | +| `session.execute_observed(data)` | `LazyFrame[T]` | `ObservedLazyFrame[T]` | `data=Some(LazyFrame[T])` | `data=None`, `error=Some(...)` | +| `session.collect_observed(data)` | `LazyFrame[T]` | `ObservedDataFrame[T]` | `data=Some(DataFrame[T])` | `data=None`, `error=Some(...)` | +| `session.write_observed(data, target)` | `BoundedDataSet[T]` | `ObservedWrite` | `error=None` | `error=Some(...)` | + +### Observed result records + +| Record | Fields | +| ---------------------- | ------------------------------------------- | +| `ObservedLazyFrame[T]` | `data: Option[LazyFrame[T]]`, `observation: ExecutionObservation`, `error: Option[SessionError]` | +| `ObservedDataFrame[T]` | `data: Option[DataFrame[T]]`, `observation: ExecutionObservation`, `error: Option[SessionError]` | +| `ObservedWrite` | `observation: ExecutionObservation`, `error: Option[SessionError]` | + +### `ExecutionObservation` + +| Field | Type | Meaning | +| --------------------------------------- | ----------------------------- | -------------------------------------------------------------- | +| `observation_id` | `str` | Stable local identifier for this observation attempt | +| `attempt_target` | `SemanticTarget` | Semantic target for the concrete execution attempt | +| `plan_target` | `SemanticTarget` | Semantic target for the plan being attempted | +| `context_targets` | `list[SemanticTarget]` | Session or binding context targets attached to the attempt | +| `operation` | `ExecutionOperationKind` | Operation family: `execute`, `collect`, or `write` | +| `status` | `ExecutionObservationStatus` | Terminal status | +| `backend_name` | `str` | Selected backend name, currently `datafusion` by default | +| `adapter_version` | `Option[str]` | Adapter version when reported by the 
backend | +| `requested_semantic_profile_id` | `Option[str]` | Requested semantic profile identity when one is bound | +| `observed_semantic_profile_id` | `Option[str]` | Observed semantic profile identity when the adapter reports one | +| `started_at_unix_nanoseconds` | `int` | Wall-clock start timestamp from `std.datetime.runtime.SystemTime` | +| `ended_at_unix_nanoseconds` | `int` | Wall-clock end timestamp from `std.datetime.runtime.SystemTime` | +| `duration_nanoseconds` | `int` | Monotonic elapsed duration from `std.datetime.runtime.Instant` | +| `row_count` | `Option[int]` | Materialized row count when the operation supplies one | +| `byte_count` | `Option[int]` | Byte count when the operation supplies one | +| `trace_ids` | `list[str]` | Optional external trace or telemetry correlation IDs | +| `diagnostics` | `list[ExecutionDiagnostic]` | Structured diagnostics attached to the attempt | +| `coverage_records` | `list[AdapterCoverageRecord]` | Adapter coverage records linked to the attempt | +| `evidence_refs` | `list[str]` | Additional evidence artifact references | + +Observation records do not contain row payloads or backend logs by default. The first DataFusion-backed implementation reports unavailable adapter-version, semantic-profile, byte-count, and trace evidence as `None` or `[]` rather than fabricating values. 
+ +### Execution enums + +| Enum | Values | +| ----------------------------- | ------------------------------------------------ | +| `ExecutionOperationKind` | `Execute`, `Collect`, `Write` | +| `ExecutionObservationStatus` | `Success`, `Failure`, `Cancelled`, `Skipped`, `Unsupported` | +| `ExecutionDiagnosticSeverity` | `Info`, `Warning`, `Error` | + +### `ExecutionDiagnostic` + +| Field | Type | Meaning | +| ---------- | ----------------------------- | -------------------------------------------- | +| `severity` | `ExecutionDiagnosticSeverity` | Diagnostic severity | +| `code` | `str` | Stable diagnostic code | +| `message` | `str` | Human-readable diagnostic message | +| `target` | `Option[SemanticTarget]` | Semantic target associated with the diagnostic | ```incan observed = session.collect_observed(summary) @@ -82,14 +129,59 @@ These writes are Session-owned. They do not bypass the execution context even wh ## Adapter coverage -`session.check_coverage(requirements)` accepts explicit `AdapterRequirement` records and returns one `AdapterCoverageRecord` per requirement. This is the current RFC 033 coverage surface. It does not infer requirements from every plan shape yet; callers must pass the requirements they want evaluated. +`session.check_coverage(requirements)` accepts explicit `AdapterRequirement` records and returns one `AdapterCoverageRecord` per requirement. It does not infer requirements from every plan shape yet; callers must pass the requirements they want evaluated. 
+ +| API | Input | Returns | +| -------------------------------------- | -------------------------- | ----------------------------- | +| `session.check_coverage(requirements)` | `list[AdapterRequirement]` | `list[AdapterCoverageRecord]` | + +### `AdapterRequirement` + +| Field | Type | Meaning | +| ---------------- | ------------------------------ | -------------------------------------------------- | +| `requirement_id` | `str` | Stable local requirement identifier | +| `target` | `SemanticTarget` | Semantic target that requires the capability | +| `capability` | `AdapterRequirementCapability` | Required adapter capability family | +| `guarantee` | `AdapterRequirementGuarantee` | Requirement strength: required, preferred, optional | +| `reason` | `str` | Human-readable reason for the requirement | +| `evidence_refs` | `list[str]` | Evidence artifacts that justify the requirement | + +### `AdapterCoverageRecord` + +| Field | Type | Meaning | +| --------------------- | ---------------------------- | ---------------------------------------------------- | +| `coverage_id` | `str` | Stable local coverage-record identifier | +| `requirement` | `AdapterRequirement` | Requirement that was evaluated | +| `adapter_name` | `str` | Adapter that was evaluated | +| `adapter_version` | `Option[str]` | Adapter version when reported | +| `semantic_profile_id` | `Option[str]` | Semantic profile identity when relevant | +| `state` | `AdapterCoverageState` | Coverage result | +| `diagnostics` | `list[ExecutionDiagnostic]` | Diagnostics explaining partial, uncovered, or unknown coverage | +| `evidence_refs` | `list[str]` | Evidence artifacts that support the coverage answer | + +### Adapter requirement enums + +| Enum | Values | +| ----------------------------- | ------ | +| `AdapterRequirementGuarantee` | `Required`, `Preferred`, `Optional` | +| `AdapterCoverageState` | `Covered`, `PartiallyCovered`, `Uncovered`, `Unknown` | +| `AdapterRequirementCapability` | `ExtensionFunction`, 
`VariantSemantics`, `DecimalSemantics`, `NullSemantics`, `LineagePreservation`, `AuditEmission`, `RowFilter`, `ColumnMask`, `AggregateThreshold`, `RegionBinding`, `OrderedExecution`, `SnapshotCapture`, `CanonicalDigest`, `CrossRelationReconciliation`, `IncrementalWatermark`, `VerificationEventStream`, `WaiverRecording`, `CryptographicQueryProof` | + +Coverage states are conservative. `Covered` means the selected adapter is known to cover that requirement family. `PartiallyCovered` means support depends on the concrete function, plan shape, or restriction. `Uncovered` means the selected adapter is known not to provide that guarantee. `Unknown` means InQL has not classified coverage; consumers must not treat it as enforced behavior. + +### Current DataFusion coverage classification -Coverage states are conservative: +| Capability | State | +| --------------------------------------- | ------------------ | +| `RowFilter` | `Covered` | +| `OrderedExecution` | `Covered` | +| `NullSemantics` | `Covered` | +| `ExtensionFunction` | `PartiallyCovered` | +| `LineagePreservation` | `Uncovered` | +| `AuditEmission` | `Uncovered` | +| Any other `AdapterRequirementCapability` | `Unknown` | -- `covered` means the selected adapter is known to cover that requirement family. -- `partially_covered` means support depends on the concrete function, plan shape, or restriction. -- `uncovered` means the selected adapter is known not to provide that guarantee. -- `unknown` means InQL has not classified coverage; consumers must not treat it as enforced behavior. +For non-DataFusion backends, the current implementation returns `Unknown` for every capability until that adapter declares coverage metadata. ## Active-session convenience @@ -120,5 +212,9 @@ DataFusion is the implemented execution backend. 
`Session` stores a backend kind - For the conceptual model behind this surface, see [Execution context (Explanation)](../explanation/execution_context.md) - For carrier semantics, see [Dataset carriers (Reference)](dataset_carriers.md) +- For execution observation design, see [RFC 032][rfc-032] +- For adapter requirement and coverage design, see [RFC 033][rfc-033] [rfc-004]: ../../rfcs/004_inql_execution_context.md +[rfc-032]: ../../rfcs/032_execution_observations.md +[rfc-033]: ../../rfcs/033_adapter_requirements_coverage.md From a69c5df7daca382520a8e6ff8b874c890a1de0c8 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 19:34:26 +0200 Subject: [PATCH 05/12] docs - add execution observation how-to --- docs/README.md | 12 ++- docs/language/README.md | 4 + .../language/explanation/execution_context.md | 22 ++++ docs/language/how-to/README.md | 8 ++ .../language/how-to/execution_observations.md | 102 ++++++++++++++++++ docs/language/reference/execution_context.md | 1 + 6 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 docs/language/how-to/README.md create mode 100644 docs/language/how-to/execution_observations.md diff --git a/docs/README.md b/docs/README.md index 259e314..0f4c3d6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,6 +5,7 @@ This directory holds the public documentation for the InQL project. Use the docs tree like this: - **Language reference:** current package/API contracts under [language/reference/][language-reference] +- **Language how-to guides:** task-oriented workflows under [language/how-to/][language-how-to] - **Language explanation:** conceptual guidance and usage framing under [language/explanation/][language-explanation] - **Architecture:** repository and system boundaries in [architecture.md][architecture] - **RFCs:** design records and normative proposals in [rfcs/][rfcs] @@ -19,9 +20,10 @@ Use the docs tree like this: 1. [Language overview][language-overview] 2. 
[Dataset carriers (Explanation)][dataset-explanation] 3. [Execution context (Explanation)][execution-explanation] -4. [Dataset carriers (Reference)][dataset-reference] -5. [Execution context (Reference)][execution-reference] -6. [Local inspection (Reference)][inspection-reference] +4. [Capture execution observations and adapter coverage (How-to)][execution-observations-how-to] +5. [Dataset carriers (Reference)][dataset-reference] +6. [Execution context (Reference)][execution-reference] +7. [Local inspection (Reference)][inspection-reference] ### Understand the system design @@ -34,10 +36,11 @@ Use the docs tree like this: 1. [RFC index][rfcs-index] 2. [How to write an RFC][writing-rfcs] -> Note: When a standalone docs site is added, `docs/` remains the content root. The structure here should already follow the same content model used in Incan: reference, explanation, architecture/contributing, RFCs, and release notes. +> Note: When a standalone docs site is added, `docs/` remains the content root. The structure here should already follow the same content model used in Incan: reference, how-to guides, explanation, architecture/contributing, RFCs, and release notes. 
[language-reference]: language/reference/ +[language-how-to]: language/how-to/ [language-explanation]: language/explanation/ [architecture]: architecture.md [rfcs]: rfcs/README.md @@ -49,6 +52,7 @@ Use the docs tree like this: [execution-explanation]: language/explanation/execution_context.md [dataset-reference]: language/reference/dataset_carriers.md [execution-reference]: language/reference/execution_context.md +[execution-observations-how-to]: language/how-to/execution_observations.md [inspection-reference]: language/reference/inspection.md [rfcs-index]: rfcs/README.md [writing-rfcs]: contributing/writing_rfcs.md diff --git a/docs/language/README.md b/docs/language/README.md index e33121c..87beb2f 100644 --- a/docs/language/README.md +++ b/docs/language/README.md @@ -3,6 +3,7 @@ This section documents the current InQL package surface. - Use [reference/][reference] for API shape, signatures, and current behavior contracts. +- Use [how-to/][how-to] for concrete task workflows. - Use [explanation/][explanation] for mental models, usage framing, and tradeoffs. ## Current entry points @@ -15,6 +16,7 @@ This section documents the current InQL package surface. ### Execution and materialization +- [Capture execution observations and adapter coverage (How-to)][execution-observations-how-to] - [Execution context (Reference)][execution-reference] - [Execution context (Explanation)][execution-explanation] @@ -31,6 +33,7 @@ This section documents the current InQL package surface. [reference]: reference/ +[how-to]: how-to/ [explanation]: explanation/ [dataset-reference]: reference/dataset_carriers.md [dataset-explanation]: explanation/dataset_carriers.md @@ -38,6 +41,7 @@ This section documents the current InQL package surface. 
[inspection-reference]: reference/inspection.md [execution-reference]: reference/execution_context.md [execution-explanation]: explanation/execution_context.md +[execution-observations-how-to]: how-to/execution_observations.md [substrait-read-root]: reference/substrait/read_root_binding_contract.md [substrait-conformance]: reference/substrait/conformance.md [substrait-operator-catalog]: reference/substrait/operator_catalog.md diff --git a/docs/language/explanation/execution_context.md b/docs/language/explanation/execution_context.md index 9be630b..1ba687a 100644 --- a/docs/language/explanation/execution_context.md +++ b/docs/language/explanation/execution_context.md @@ -70,6 +70,26 @@ The ergonomic split is: This keeps materialization convenient while leaving sink ownership explicit at the session boundary. +## Runtime evidence is separate from plan evidence + +Plan inspection explains the relational work InQL has authored. Execution observations explain a concrete runtime attempt to run that work through a Session and backend adapter. + +That split matters because the same plan can be attempted more than once, with different backends, bindings, diagnostics, timings, or trace IDs. The plan target remains the semantic anchor. The execution attempt target records what happened in one runtime lifecycle event. + +Observed Session methods keep this separation explicit: + +- `execute_observed(...)` records an execution checkpoint without local materialization. +- `collect_observed(...)` records a materialization attempt and can include row count evidence. +- `write_observed(...)` records a sink-write attempt. + +The compact `execute(...)`, `collect(...)`, and `write(...)` methods still return `Result[...]` values for application code that does not need an evidence record. + +## Adapter coverage is explicit evidence + +Adapter coverage answers a different question from execution success. Execution success says the selected backend accepted and ran a plan attempt. 
Coverage says whether the selected adapter is known to provide a named capability or guarantee. + +The current coverage API is deliberately explicit: callers pass `AdapterRequirement` records to `session.check_coverage(...)`. InQL does not yet infer all requirements from arbitrary plan shapes. Unknown coverage is therefore not a soft success; it means InQL does not have evidence that the adapter enforces that capability. + ## Typical flow ```incan @@ -112,3 +132,5 @@ The materialized carrier exposes structured collection metadata: - preview text For exact API shape, see [Execution context (Reference)](../reference/execution_context.md). + +For a task-oriented workflow, see [Capture execution observations and adapter coverage](../how-to/execution_observations.md). diff --git a/docs/language/how-to/README.md b/docs/language/how-to/README.md new file mode 100644 index 0000000..3cf118e --- /dev/null +++ b/docs/language/how-to/README.md @@ -0,0 +1,8 @@ +# InQL language how-to guides + +How-to guides show concrete task workflows for the current InQL package surface. They complement the reference docs, which define API shape and behavior contracts. + +- [Capture execution observations and adapter coverage][execution-observations] + + +[execution-observations]: execution_observations.md diff --git a/docs/language/how-to/execution_observations.md b/docs/language/how-to/execution_observations.md new file mode 100644 index 0000000..4977ee9 --- /dev/null +++ b/docs/language/how-to/execution_observations.md @@ -0,0 +1,102 @@ +# Capture execution observations and adapter coverage + +This how-to shows how to collect runtime evidence for a Session operation and how to ask the selected adapter whether it covers explicit requirements. + +Use the observed Session methods when you need an auditable execution attempt record. Use `check_coverage(...)` when a tool, policy, or review step already knows which adapter capability needs to be checked. 
+ +## Collect with an observation + +Use `collect_observed(...)` when you need materialized data and execution evidence from the same attempt. + +```incan +from pub::inql import ExecutionObservationStatus, LazyFrame, Session +from models import Order + +session = Session.default() +orders: LazyFrame[Order] = session.read_csv("orders", "orders.csv")? + +observed = session.collect_observed(orders) + +match observed.data: + Some(df) => + println(df.preview_text()) + println(f"rows={df.row_count()}") + None => + println(observed.observation.diagnostics[0].message) + +assert observed.observation.status == ExecutionObservationStatus.Success +``` + +The observed result always includes `observation`. On success, `data` contains the materialized `DataFrame[T]`. On failure, `data` is `None` and `error` contains the `SessionError`. + +## Validate execution without materializing + +Use `execute_observed(...)` when you want the same execution checkpoint as `execute(...)` but still need an observation record. + +```incan +observed = session.execute_observed(orders) + +match observed.error: + Some(err) => println(err.error_message()) + None => println(observed.observation.observation_id) +``` + +`execute_observed(...)` returns the deferred `LazyFrame[T]` on success. It does not invent a row count because it does not materialize local rows. + +## Write with an observation + +Use `write_observed(...)` when the write itself is the operation you want to audit. + +```incan +from pub::inql import csv_sink + +write_attempt = session.write_observed(orders, csv_sink("target/orders.csv")) + +match write_attempt.error: + Some(err) => println(err.error_message()) + None => println(write_attempt.observation.observation_id) +``` + +The write result has no `data` field. The output artifact is the sink side effect; the returned value carries the observation and optional error. + +## Check explicit adapter requirements + +`check_coverage(...)` does not infer requirements from a plan yet. 
Build the requirements that matter to the policy or workflow, then ask the selected adapter for coverage records. + +```incan +from pub::inql import ( + AdapterCoverageState, + AdapterRequirement, + AdapterRequirementCapability, + AdapterRequirementGuarantee, +) + +observed = session.collect_observed(orders) +requirement = AdapterRequirement( + requirement_id="orders-row-filter", + target=observed.observation.plan_target, + capability=AdapterRequirementCapability.RowFilter, + guarantee=AdapterRequirementGuarantee.Required, + reason="filtered order review requires adapter-side row filtering", + evidence_refs=[], +) + +coverage = session.check_coverage([requirement]) + +match coverage[0].state: + AdapterCoverageState.Covered => println("covered") + AdapterCoverageState.PartiallyCovered => println(coverage[0].diagnostics[0].message) + AdapterCoverageState.Uncovered => println(coverage[0].diagnostics[0].message) + AdapterCoverageState.Unknown => println(coverage[0].diagnostics[0].message) +``` + +Treat `Unknown` as non-enforcing. It means InQL has not classified that adapter capability; it does not mean the adapter has proven support. + +## Choose the right observed method + +- Use `execute_observed(...)` for a validation/checkpoint boundary without local materialization. +- Use `collect_observed(...)` when a local `DataFrame[T]` and row count are part of the evidence you need. +- Use `write_observed(...)` when the sink write is the operation being audited. +- Use `check_coverage(...)` for explicit adapter requirements; do not use it as a plan-requirement discovery API. + +For the complete field and enum reference, see [Execution context (Reference)](../reference/execution_context.md). 
diff --git a/docs/language/reference/execution_context.md b/docs/language/reference/execution_context.md index a753bf1..84c3270 100644 --- a/docs/language/reference/execution_context.md +++ b/docs/language/reference/execution_context.md @@ -211,6 +211,7 @@ DataFusion is the implemented execution backend. `Session` stores a backend kind ## Related docs - For the conceptual model behind this surface, see [Execution context (Explanation)](../explanation/execution_context.md) +- For task-oriented examples, see [Capture execution observations and adapter coverage](../how-to/execution_observations.md) - For carrier semantics, see [Dataset carriers (Reference)](dataset_carriers.md) - For execution observation design, see [RFC 032][rfc-032] - For adapter requirement and coverage design, see [RFC 033][rfc-033] From f614cf3039c1825a4f9c1bc003e756ecb9679d50 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 19:47:07 +0200 Subject: [PATCH 06/12] docs - split reference workflows into how-tos --- docs/README.md | 22 +++- docs/language/README.md | 23 ++++ docs/language/how-to/README.md | 18 +++ docs/language/how-to/approximate_metrics.md | 24 ++++ .../how-to/dataset_transformations.md | 58 +++++++++ docs/language/how-to/generator_rows.md | 37 ++++++ docs/language/how-to/inspect_plan_lineage.md | 48 +++++++ docs/language/how-to/nested_row_values.md | 23 ++++ .../how-to/normalize_semistructured_fields.md | 32 +++++ docs/language/how-to/typed_hll_sketches.md | 40 ++++++ docs/language/how-to/variant_payloads.md | 26 ++++ docs/language/how-to/window_columns.md | 29 +++++ .../language/reference/builders/aggregates.md | 22 +--- docs/language/reference/builders/filters.md | 13 +- .../reference/builders/projections.md | 13 +- docs/language/reference/dataset_methods.md | 120 +++++++----------- docs/language/reference/execution_context.md | 9 -- .../reference/functions/approximate.md | 15 +-- docs/language/reference/functions/format.md | 23 +--- 
.../reference/functions/generators.md | 18 +-- docs/language/reference/functions/nested.md | 15 +-- docs/language/reference/functions/sketches.md | 24 +--- docs/language/reference/functions/variants.md | 17 +-- docs/language/reference/functions/windows.md | 20 +-- docs/language/reference/inspection.md | 20 +-- 25 files changed, 443 insertions(+), 266 deletions(-) create mode 100644 docs/language/how-to/approximate_metrics.md create mode 100644 docs/language/how-to/dataset_transformations.md create mode 100644 docs/language/how-to/generator_rows.md create mode 100644 docs/language/how-to/inspect_plan_lineage.md create mode 100644 docs/language/how-to/nested_row_values.md create mode 100644 docs/language/how-to/normalize_semistructured_fields.md create mode 100644 docs/language/how-to/typed_hll_sketches.md create mode 100644 docs/language/how-to/variant_payloads.md create mode 100644 docs/language/how-to/window_columns.md diff --git a/docs/README.md b/docs/README.md index 0f4c3d6..48f1adf 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,10 +20,17 @@ Use the docs tree like this: 1. [Language overview][language-overview] 2. [Dataset carriers (Explanation)][dataset-explanation] 3. [Execution context (Explanation)][execution-explanation] -4. [Capture execution observations and adapter coverage (How-to)][execution-observations-how-to] -5. [Dataset carriers (Reference)][dataset-reference] -6. [Execution context (Reference)][execution-reference] -7. [Local inspection (Reference)][inspection-reference] +4. [Build deferred dataset transformations (How-to)][dataset-transformations-how-to] +5. [Normalize semi-structured fields (How-to)][normalize-semistructured-fields-how-to] +6. [Work with nested row values (How-to)][nested-row-values-how-to] +7. [Expand rows with generators (How-to)][generator-rows-how-to] +8. [Add window columns (How-to)][window-columns-how-to] +9. [Capture execution observations and adapter coverage (How-to)][execution-observations-how-to] +10. 
[Dataset carriers (Reference)][dataset-reference] +11. [Dataset methods (Reference)][dataset-methods-reference] +12. [Execution context (Reference)][execution-reference] +13. [Inspect a plan and lineage graph (How-to)][inspect-plan-lineage-how-to] +14. [Local inspection (Reference)][inspection-reference] ### Understand the system design @@ -48,11 +55,18 @@ Use the docs tree like this: [release-notes]: release_notes/ [contributing]: contributing/ [language-overview]: language/README.md +[window-columns-how-to]: language/how-to/window_columns.md [dataset-explanation]: language/explanation/dataset_carriers.md [execution-explanation]: language/explanation/execution_context.md [dataset-reference]: language/reference/dataset_carriers.md +[dataset-methods-reference]: language/reference/dataset_methods.md +[dataset-transformations-how-to]: language/how-to/dataset_transformations.md +[generator-rows-how-to]: language/how-to/generator_rows.md +[nested-row-values-how-to]: language/how-to/nested_row_values.md +[normalize-semistructured-fields-how-to]: language/how-to/normalize_semistructured_fields.md [execution-reference]: language/reference/execution_context.md [execution-observations-how-to]: language/how-to/execution_observations.md +[inspect-plan-lineage-how-to]: language/how-to/inspect_plan_lineage.md [inspection-reference]: language/reference/inspection.md [rfcs-index]: rfcs/README.md [writing-rfcs]: contributing/writing_rfcs.md diff --git a/docs/language/README.md b/docs/language/README.md index 87beb2f..552b492 100644 --- a/docs/language/README.md +++ b/docs/language/README.md @@ -10,8 +10,13 @@ This section documents the current InQL package surface. 
### Core carriers +- [Build deferred dataset transformations (How-to)][dataset-transformations-how-to] +- [Expand rows with generators (How-to)][generator-rows-how-to] +- [Normalize semi-structured fields (How-to)][normalize-semistructured-fields-how-to] +- [Work with nested row values (How-to)][nested-row-values-how-to] - [Dataset carriers (Reference)][dataset-reference] - [Dataset carriers (Explanation)][dataset-explanation] +- [Dataset methods (Reference)][dataset-methods-reference] - [Query blocks (Reference)][query-blocks-reference] ### Execution and materialization @@ -20,6 +25,13 @@ This section documents the current InQL package surface. - [Execution context (Reference)][execution-reference] - [Execution context (Explanation)][execution-explanation] +### Analytical functions + +- [Add window columns (How-to)][window-columns-how-to] +- [Estimate approximate metrics (How-to)][approximate-metrics-how-to] +- [Build typed HyperLogLog sketches (How-to)][typed-hll-sketches-how-to] +- [Inspect typed variant payloads (How-to)][variant-payloads-how-to] + ### Substrait boundary - [Substrait read-root and binding contract][substrait-read-root] @@ -29,19 +41,30 @@ This section documents the current InQL package surface. 
### Local evidence +- [Inspect a plan and lineage graph (How-to)][inspect-plan-lineage-how-to] - [Local inspection][inspection-reference] [reference]: reference/ [how-to]: how-to/ [explanation]: explanation/ +[approximate-metrics-how-to]: how-to/approximate_metrics.md [dataset-reference]: reference/dataset_carriers.md [dataset-explanation]: explanation/dataset_carriers.md +[dataset-methods-reference]: reference/dataset_methods.md +[dataset-transformations-how-to]: how-to/dataset_transformations.md +[generator-rows-how-to]: how-to/generator_rows.md +[nested-row-values-how-to]: how-to/nested_row_values.md +[normalize-semistructured-fields-how-to]: how-to/normalize_semistructured_fields.md [query-blocks-reference]: reference/query_blocks.md +[typed-hll-sketches-how-to]: how-to/typed_hll_sketches.md +[variant-payloads-how-to]: how-to/variant_payloads.md +[window-columns-how-to]: how-to/window_columns.md [inspection-reference]: reference/inspection.md [execution-reference]: reference/execution_context.md [execution-explanation]: explanation/execution_context.md [execution-observations-how-to]: how-to/execution_observations.md +[inspect-plan-lineage-how-to]: how-to/inspect_plan_lineage.md [substrait-read-root]: reference/substrait/read_root_binding_contract.md [substrait-conformance]: reference/substrait/conformance.md [substrait-operator-catalog]: reference/substrait/operator_catalog.md diff --git a/docs/language/how-to/README.md b/docs/language/how-to/README.md index 3cf118e..128c022 100644 --- a/docs/language/how-to/README.md +++ b/docs/language/how-to/README.md @@ -2,7 +2,25 @@ How-to guides show concrete task workflows for the current InQL package surface. They complement the reference docs, which define API shape and behavior contracts. 
+- [Add window columns][window-columns] +- [Build typed HyperLogLog sketches][typed-hll-sketches] - [Capture execution observations and adapter coverage][execution-observations] +- [Build deferred dataset transformations][dataset-transformations] +- [Estimate approximate metrics][approximate-metrics] +- [Expand rows with generators][generator-rows] +- [Inspect a plan and lineage graph][inspect-plan-lineage] +- [Normalize semi-structured fields][normalize-semistructured-fields] +- [Inspect typed variant payloads][variant-payloads] +- [Work with nested row values][nested-row-values] +[approximate-metrics]: approximate_metrics.md +[dataset-transformations]: dataset_transformations.md [execution-observations]: execution_observations.md +[generator-rows]: generator_rows.md +[inspect-plan-lineage]: inspect_plan_lineage.md +[nested-row-values]: nested_row_values.md +[normalize-semistructured-fields]: normalize_semistructured_fields.md +[typed-hll-sketches]: typed_hll_sketches.md +[variant-payloads]: variant_payloads.md +[window-columns]: window_columns.md diff --git a/docs/language/how-to/approximate_metrics.md b/docs/language/how-to/approximate_metrics.md new file mode 100644 index 0000000..740c97d --- /dev/null +++ b/docs/language/how-to/approximate_metrics.md @@ -0,0 +1,24 @@ +# Estimate approximate metrics + +This how-to shows how to opt in to approximate aggregate helpers when exact results are not required. + +Use approximate helpers explicitly. InQL never silently replaces exact aggregates with approximate implementations merely because a backend supports them. + +## Estimate distinct counts and percentiles + +Group the relation normally, then use approximate aggregate measures inside `agg(...)`. 
+ +```incan +from pub::inql.functions import approx_count_distinct, approx_percentile, col + +summary = ( + events + .group_by([col("campaign_id")]) + .agg([ + approx_count_distinct(col("user_id")), + approx_percentile(col("latency_ms"), 0.95), + ]) +) +``` + +`approx_percentile(...)` accepts a percentile from `0.0` through `1.0` and an optional positive accuracy value. For exact helper contracts, see [Approximate functions](../reference/functions/approximate.md). diff --git a/docs/language/how-to/dataset_transformations.md b/docs/language/how-to/dataset_transformations.md new file mode 100644 index 0000000..99b2b59 --- /dev/null +++ b/docs/language/how-to/dataset_transformations.md @@ -0,0 +1,58 @@ +# Build deferred dataset transformations + +This how-to shows how to combine common carrier methods while keeping work deferred until a Session executes it. + +## Add computed columns + +Use `with_column(...)` to append a new computed column or replace an existing column by name. + +```incan +from pub::inql import LazyFrame +from pub::inql.functions import add, col, mul +from models import Order + +def enrich(orders: LazyFrame[Order]) -> LazyFrame[Order]: + return ( + orders + .with_column("amount_x2", mul(col("amount"), 2)) + .with_column("amount_plus_one", add(col("amount"), 1)) + ) +``` + +## Filter, group, and aggregate + +Use scalar helpers for row predicates and aggregate helpers for grouped measures. + +```incan +from pub::inql import LazyFrame +from pub::inql.functions import avg, col, count, eq, sum +from models import Order + +def paid_spend_by_customer(orders: LazyFrame[Order]) -> LazyFrame[Order]: + return ( + orders + .filter(eq(col("status"), "paid")) + .group_by([col("customer_id")]) + .agg([ + sum(col("amount")), + avg(col("amount")), + count(), + ]) + ) +``` + +## Sort and limit + +Use ordering helpers inside `order_by(...)`, then cap rows with `limit(...)`. 
+ +```incan +from pub::inql.functions import col, desc + +top_orders = ( + orders + .order_by([desc(col("amount"))]) + .limit(10) +) +``` + +These transforms stay deferred for `LazyFrame[T]`. Use a `Session` to execute, collect, or write the result. For exact method signatures and schema behavior, see [Dataset methods (Reference)](../reference/dataset_methods.md). diff --git a/docs/language/how-to/generator_rows.md b/docs/language/how-to/generator_rows.md new file mode 100644 index 0000000..d41bb07 --- /dev/null +++ b/docs/language/how-to/generator_rows.md @@ -0,0 +1,37 @@ +# Expand rows with generators + +This how-to shows how to use generator helpers when nested values should reshape a relation. + +Generators return `GeneratorApplication` values. Apply them through `generate(...)` so the relation keeps its input columns and appends the generated output aliases. + +## Explode array values + +Use `explode(...)` when each array element should become a generated row. + +```incan +from pub::inql import LazyFrame +from pub::inql.functions import col, explode +from models import Order + +def order_lines(orders: LazyFrame[Order]) -> LazyFrame[Order]: + return orders.generate(explode(col("line_items"), "line_item")) +``` + +## Inline struct arrays + +Use `inline(...)` when the generated rows should expose one output column per struct field. + +```incan +from pub::inql import LazyFrame +from pub::inql.functions import array, inline, lit, named_struct +from models import Order + +def fixed_items(orders: LazyFrame[Order]) -> LazyFrame[Order]: + rows = array([ + named_struct(["sku", "quantity"], [lit("A"), lit(1)]), + named_struct(["sku", "quantity"], [lit("B"), lit(2)]), + ]) + return orders.generate(inline(rows, ["sku", "quantity"])) +``` + +For the full generator catalog and alias rules, see [Generator and table-valued functions](../reference/functions/generators.md). 
diff --git a/docs/language/how-to/inspect_plan_lineage.md b/docs/language/how-to/inspect_plan_lineage.md new file mode 100644 index 0000000..4037e09 --- /dev/null +++ b/docs/language/how-to/inspect_plan_lineage.md @@ -0,0 +1,48 @@ +# Inspect a plan and lineage graph + +This how-to shows how to inspect a Prism-backed lazy plan without executing it. + +Use `inspect_plan(...)` when you need the full inspection record. Use `inspect_lineage(...)` when you only need the lineage graph. + +## Build a lazy plan + +```incan +from pub::inql import LazyFrame +from pub::inql.functions import col, eq, str_lit, sum +from models import Order + +def paid_spend_summary(orders: LazyFrame[Order]) -> LazyFrame[Order]: + return ( + orders + .filter(eq(col("status"), str_lit("paid"))) + .group_by([col("customer_id")]) + .agg([sum(col("amount"))]) + ) +``` + +## Inspect the plan + +```incan +from pub::inql import inspect_plan + +summary = paid_spend_summary(orders) +inspection = inspect_plan(summary) + +println(inspection.plan_id) +println(inspection.output_fields[0].name) +``` + +`inspect_plan(...)` does not execute the plan. It reads the local Prism state behind the lazy carrier and returns plan targets, output fields, Prism nodes, lineage, artifacts, diagnostics, and unsupported-evidence markers. + +## Read lineage directly + +```incan +from pub::inql import inspect_lineage + +lineage = inspect_lineage(summary) + +for edge in lineage.edges: + println(edge.relationship.value()) +``` + +Lineage is plan-local evidence. It explains how the authored plan relates fields and relations before backend binding or execution. For exact record fields and current limits, see [Local inspection (Reference)](../reference/inspection.md). 
diff --git a/docs/language/how-to/nested_row_values.md b/docs/language/how-to/nested_row_values.md new file mode 100644 index 0000000..94e49d2 --- /dev/null +++ b/docs/language/how-to/nested_row_values.md @@ -0,0 +1,23 @@ +# Work with nested row values + +This how-to shows how to create and inspect nested scalar values without changing relation cardinality. + +Use nested scalar helpers when each input row should remain one output row. Use generator helpers such as `explode(...)` only when an array or struct should reshape the relation. + +## Add array-derived columns + +Build arrays with `array(...)`, then inspect them with row-level helpers such as `cardinality(...)`, `array_contains(...)`, and `element_at(...)`. + +```incan +from pub::inql.functions import array, array_contains, cardinality, col, element_at, lit + +projected = ( + events + .with_column("tags", array([lit("paid"), col("source")])) + .with_column("tag_count", cardinality(col("tags"))) + .with_column("has_paid_tag", array_contains(col("tags"), "paid")) + .with_column("first_tag", element_at(col("tags"), 1)) +) +``` + +`element_at(...)`, `array_position(...)`, and `array_slice(...)` use one-based array positions. For exact helper contracts, see [Nested data functions](../reference/functions/nested.md). diff --git a/docs/language/how-to/normalize_semistructured_fields.md b/docs/language/how-to/normalize_semistructured_fields.md new file mode 100644 index 0000000..0d8596a --- /dev/null +++ b/docs/language/how-to/normalize_semistructured_fields.md @@ -0,0 +1,32 @@ +# Normalize semi-structured fields + +This how-to shows how to derive stable string, JSON, CSV, and URL fields from scalar payload columns. + +Use format helpers when the payload should stay a scalar expression in the current row. Use typed variant helpers when the plan needs kind-aware semi-structured inspection rather than normalized text. 
+ +## Derive normalized fields + +Hash identifiers, extract URL and JSON fields, and validate schema-bearing payloads with model type parameters. + +```incan +from pub::inql.functions import col, from_csv, from_json, get_json_object, parse_url, sha2, to_json + +model EventPayload: + type_ as "type": str + +model CsvRow: + id: int + status: str + +projected = ( + events + .with_column("user_hash", sha2(col("user_id"), 256)) + .with_column("campaign", parse_url(col("landing_page"), "utm_campaign")) + .with_column("event_type", get_json_object(col("payload"), "$.type")) + .with_column("payload_obj", from_json[EventPayload](col("payload"))) + .with_column("row_fields", from_csv[CsvRow](col("csv_line"))) + .with_column("payload_out", to_json(col("event_type"))) +) +``` + +`from_json[Model](...)` and `from_csv[Model](...)` derive their validation schema from the Incan model type argument. For the complete helper catalog, see [Format functions](../reference/functions/format.md). diff --git a/docs/language/how-to/typed_hll_sketches.md b/docs/language/how-to/typed_hll_sketches.md new file mode 100644 index 0000000..8508e92 --- /dev/null +++ b/docs/language/how-to/typed_hll_sketches.md @@ -0,0 +1,40 @@ +# Build typed HyperLogLog sketches + +This how-to shows how to create, merge, and estimate typed HyperLogLog sketch state. + +Use sketch helpers when approximate state itself needs to flow through a plan. Use `approx_count_distinct(...)` when the plan only needs one aggregate estimate. + +## Build daily sketches + +Aggregate source values into typed sketch state with `hll_sketch(...)`. 
+ +```incan +from pub::inql.functions import col, hll_sketch + +daily = events.group_by([col("event_date")]).agg([ + hll_sketch(col("user_id"), precision=14), +]) + +literal_seed = events.group_by([col("event_date")]).agg([ + hll_sketch("anonymous-user", precision=14), +]) +``` + +## Merge and estimate sketches + +Reference sketch columns with matching logical type metadata, then merge and estimate them. + +```incan +from pub::inql.sketches import hll_estimate, hll_merge, hll_type, sketch_col + +monthly = daily.group_by([col("month")]).agg([ + hll_merge(sketch_col("hll_sketch_user_id", hll_type(precision=14))), +]) + +reported = monthly.with_column( + "estimated_users", + hll_estimate(sketch_col("hll_merge_hll_sketch_user_id", hll_type(precision=14))), +) +``` + +Sketches can merge only when family, value domain, precision, and serialization format match. For exact helper contracts, see [Sketch functions](../reference/functions/sketches.md). diff --git a/docs/language/how-to/variant_payloads.md b/docs/language/how-to/variant_payloads.md new file mode 100644 index 0000000..c1ed2c3 --- /dev/null +++ b/docs/language/how-to/variant_payloads.md @@ -0,0 +1,26 @@ +# Inspect typed variant payloads + +This how-to shows how to parse JSON text into typed variant values and inspect their shape. + +Use variant helpers when the plan needs kind-aware semi-structured inspection. Use RFC 022 JSON helpers when normalized JSON text is enough. + +## Parse and inspect a payload + +Parse once, then apply `typeof(...)`, `variant_get(...)`, and variant predicates to the typed value. 
+ +```incan +from pub::inql.functions import col, is_array, is_null_value, parse_variant_json, typeof, variant_get + +payload = parse_variant_json(col("payload")) +literal_payload = parse_variant_json("{\"status\":\"paid\"}") + +projected = ( + events + .with_column("payload_kind", typeof(payload)) + .with_column("items_are_array", is_array(variant_get(payload, "$.items"))) + .with_column("dynamic_value", variant_get(literal_payload, col("json_path"))) + .with_column("deleted_was_variant_null", is_null_value(variant_get(payload, "$.deleted_at"))) +) +``` + +Variant predicates accept `VariantExpr` values. They do not parse strings directly. For exact helper contracts, see [Variant functions](../reference/functions/variants.md). diff --git a/docs/language/how-to/window_columns.md b/docs/language/how-to/window_columns.md new file mode 100644 index 0000000..8a82983 --- /dev/null +++ b/docs/language/how-to/window_columns.md @@ -0,0 +1,29 @@ +# Add window columns + +This how-to shows how to add relation-aware window outputs to a deferred carrier. + +Window helpers produce one output value per input row while reading related rows from a partition. Place them with `with_window_column(...)`. + +## Rank and compare rows inside a partition + +Build a window spec, call `.over(spec)` on each window helper, and attach the resulting applications as named columns. 
+ +```incan +from pub::inql import LazyFrame +from pub::inql.functions import col, current_row, desc, lag, rank, sum, unbounded_preceding, window +from models import Order + +def ranked_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]: + spec = window().partition_by([col("customer_id")]).order_by([desc(col("amount"))]) + return ( + orders + .with_window_column("customer_rank", rank().over(spec)) + .with_window_column("previous_amount", lag(col("amount")).over(spec)) + .with_window_column( + "running_amount", + sum(col("amount")).over(spec.rows_between(unbounded_preceding(), current_row())), + ) + ) +``` + +Ranking, distribution, offset, and value helpers require explicit ordering. For exact helper contracts, see [Window functions](../reference/functions/windows.md). diff --git a/docs/language/reference/builders/aggregates.md b/docs/language/reference/builders/aggregates.md index 8a73cac..3763706 100644 --- a/docs/language/reference/builders/aggregates.md +++ b/docs/language/reference/builders/aggregates.md @@ -31,27 +31,6 @@ Aggregate measures support method-style modifiers: | `filter` | `measure.filter(predicate: ColumnExpr) -> AggregateMeasure` | Apply an aggregate-local boolean predicate before aggregation. | | `order_by` | `measure.order_by(ordering: list[ColumnExpr]) -> AggregateMeasure` | Record ordered aggregate input. Core aggregates reject ordered input until an order-sensitive aggregate lands. 
| -## Example - -```incan -from pub::inql.functions import add, approx_count_distinct, approx_percentile, avg, col, count, count_distinct, count_if, eq, hll_sketch, lit, max, min, str_lit, sum - -grouped = orders.group_by([col("customer_id")]).agg([ - sum(add(col("amount"), 5)), - count(), - count(col("discount_code")), - count_distinct(col("product_id")), - count_if(eq(col("status"), str_lit("paid"))), - sum(col("amount")).filter(eq(col("status"), str_lit("paid"))), - avg(col("amount")), - min(col("created_at")), - max(col("created_at")), - approx_count_distinct(col("user_id")), - approx_percentile(col("latency_ms"), 0.95), - hll_sketch(col("user_id"), precision=14), -]) -``` - ## Notes - Aggregate inputs use the same scalar-expression model as filters, projections, and grouping keys. @@ -66,3 +45,4 @@ grouped = orders.group_by([col("customer_id")]).agg([ - `hll_sketch` and `hll_merge` are aggregate-shaped typed sketch helpers. They produce typed sketch state and preserve sketch family, value domain, precision, and format metadata through the registry and Substrait boundary. - Unsupported aggregate modifiers fail at lowering or backend planning; they are not ignored. - Future `.column` sugar and scoped aggregate symbols should lower to this same surface rather than replacing its semantics. +- For task-oriented usage, see [Build deferred dataset transformations](../../how-to/dataset_transformations.md). diff --git a/docs/language/reference/builders/filters.md b/docs/language/reference/builders/filters.md index e32e5ba..c9497a9 100644 --- a/docs/language/reference/builders/filters.md +++ b/docs/language/reference/builders/filters.md @@ -15,20 +15,9 @@ Current filter authoring uses the shared scalar-expression builder model. | `str_lit` | `def str_lit(value: str) -> StringLiteralExpr` | Typed string literal helper. | | `bool_lit` | `def bool_lit(value: bool) -> BoolLiteralExpr` | Typed boolean literal helper. 
| -## Example - -```incan -from pub::inql.functions import col, eq, gt - -filtered = ( - orders - .filter(gt(col("amount"), 100)) - .filter(eq(col("status"), "open")) -) -``` - ## Notes - Filter predicates are scalar expressions, not a separate predicate-only builder hierarchy. - Primitive values are accepted where predicate helper signatures use value-or-column aliases. Use `lit(...)` or typed literal helpers when a broad `ColumnExpr` is required explicitly. - Boolean composition belongs to the broader scalar-function surface. +- For task-oriented usage, see [Build deferred dataset transformations](../../how-to/dataset_transformations.md). diff --git a/docs/language/reference/builders/projections.md b/docs/language/reference/builders/projections.md index 5c6cdc2..c975ba7 100644 --- a/docs/language/reference/builders/projections.md +++ b/docs/language/reference/builders/projections.md @@ -26,20 +26,9 @@ def with_column(self, name: str, expr: ColumnExpr) -> Self - missing name: append at end - existing name: replace in place -## Example - -```incan -from pub::inql.functions import add, col, mul - -projected = ( - orders - .with_column("amount_x2", mul(col("amount"), 2)) - .with_column("amount_plus_one", add(col("amount"), 1)) -) -``` - ## Capability notes - `with_column(...)` is the explicit computed-column entrypoint. - Projection-list selection, query-block projection sugar, and alias-free symbolic surfaces lower to this scalar-expression model when exposed. - Numeric, string, and boolean helpers accept primitive values where their public signatures use value-or-column aliases. Use `lit(...)` for broad scalar-expression positions that specifically require a `ColumnExpr`. +- For task-oriented usage, see [Build deferred dataset transformations](../../how-to/dataset_transformations.md). 
diff --git a/docs/language/reference/dataset_methods.md b/docs/language/reference/dataset_methods.md index b51b334..c71c93d 100644 --- a/docs/language/reference/dataset_methods.md +++ b/docs/language/reference/dataset_methods.md @@ -2,82 +2,60 @@ This page documents the current carrier method surface. Builder-function details live under `reference/builders/`. -The Substrait helper surface behind these methods is split by semantic role: - -- `src/substrait/relations.incn` builds concrete `Rel` nodes -- `src/substrait/plans.incn` assembles `Plan` envelopes -- `src/substrait/inspect.incn` owns relation/plan inspection and output-column inference -- `src/schema_registry.incn` owns logical named-table schema binding - ## Carrier method surface -| Method | Signature | Meaning | -| ------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------------------------- | -| `filter` | `def filter(self, predicate: ColumnExpr) -> Self` | Restrict rows by a boolean scalar expression. | -| `join` | `def join(self, other: Self, on: ColumnExpr) -> Self` | Combine with another same-carrier relation using the package's scalar predicate surface. | -| `select` | `def select[U](self, assignments: list[ProjectionAssignment] = []) -> SameCarrier[U]` | Project an output row shape while preserving the carrier kind. | -| `with_column` | `def with_column(self, name: str, expr: ColumnExpr) -> Self` | Add or replace one projected column using a scalar expression. | -| `group_by` | `def group_by(self, columns: list[ColumnExpr]) -> Self` | Define grouping keys using scalar expressions. | -| `agg` | `def agg(self, measures: list[AggregateMeasure]) -> Self` | Apply aggregate measures over the current relation or current grouping. | -| `generate` | `def generate(self, generator: GeneratorApplication) -> Self` | Apply a relation-shaping generator such as `explode(...)` with explicit output aliases. 
| -| `with_window_column` | `def with_window_column(self, name: str, application: WindowFunctionApplication) -> Self` | Add or replace one projected column using a named window function. | -| `order_by` | `def order_by(self, columns: list[ColumnExpr]) -> Self` | Sort rows by scalar expressions or ordering helpers such as `asc(...)` and `desc(...)`. | -| `limit` | `def limit(self, n: int) -> Self` | Cap row count. | +| Method | Signature | Returns | Contract | +| -------------------- | ------------------------------------------------------------------------------- | ---------------- | -------- | +| `filter` | `def filter(self, predicate: ColumnExpr) -> Self` | Same carrier | Restrict rows by one boolean scalar expression. | +| `join` | `def join(self, other: Self, on: ColumnExpr, relation_name: str = "") -> Self` | Same carrier | Inner join with another same-carrier relation using a scalar predicate. | +| `left_join` | `def left_join(self, other: Self, on: ColumnExpr, relation_name: str = "") -> Self` | Same carrier | Left join with another same-carrier relation using a scalar predicate. | +| `select` | `def select[U](self, assignments: list[ProjectionAssignment] = []) -> SameCarrier[U]` | Same carrier kind with row type `U` | Project an output row shape while preserving carrier kind. | +| `with_column` | `def with_column(self, name: str, expr: ColumnExpr) -> Self` | Same carrier | Add or replace one projected column. | +| `group_by` | `def group_by(self, columns: list[ColumnExpr]) -> Self` | Same carrier | Define grouping keys for a following aggregate. | +| `agg` | `def agg(self, measures: list[AggregateMeasure]) -> Self` | Same carrier | Apply aggregate measures over the current relation or current grouping. | +| `generate` | `def generate(self, generator: GeneratorApplication) -> Self` | Same carrier | Apply a relation-shaping generator with explicit output aliases. 
| +| `with_window_column` | `def with_window_column(self, name: str, application: WindowFunctionApplication) -> Self` | Same carrier | Add or replace one projected column using a placed window function. | +| `order_by` | `def order_by(self, columns: list[ColumnExpr]) -> Self` | Same carrier | Sort rows by scalar expressions or ordering helpers. | +| `limit` | `def limit(self, n: int) -> Self` | Same carrier | Cap row count. | +| `to_substrait_plan` | `def to_substrait_plan(self) -> Plan` | `Plan` | Lower the carrier to a Substrait plan or raise on invalid lowering. | +| `try_to_substrait_plan` | `def try_to_substrait_plan(self) -> Result[Plan, SubstraitLoweringError]` | `Result[Plan, SubstraitLoweringError]` | Lower the carrier to a Substrait plan through a structured error envelope. | `SameCarrier[U]` means `DataFrame[U]` for `DataFrame[T]`, `LazyFrame[U]` for `LazyFrame[T]`, and `DataStream[U]` for `DataStream[T]`. The root `DataSet[T]` trait remains the common plan/schema contract; schema-changing projection is expressed on concrete carriers until Incan grows native trait type-family support. -## `with_column` - -### Signature - -```incan -def with_column(self, name: str, expr: ColumnExpr) -> Self -``` - -### Semantics - -- If `name` does not already exist, the new projected column is appended at the end. -- If `name` already exists, that slot is replaced in place. -- Replacement preserves ordinal position. 
-- The scalar-expression surface is: - - `col(name)` - - `lit(value)` - - `int_expr(...)` - - `float_expr(...)` - - `str_expr(...)` - - `bool_expr(...)` - - `add(left, right)` - - `mul(left, right)` - - `eq(left, right)` - - `gt(left, right)` - -### Example - -```incan -from pub::inql import LazyFrame -from pub::inql.functions import add, col, mul -from models import Order - -def enrich(orders: LazyFrame[Order]) -> LazyFrame[Order]: - return ( - orders - .with_column("amount_x2", mul(col("amount"), 2)) - .with_column("amount_plus_one", add(col("amount"), 1)) - ) -``` +## Method semantics + +| Method | Schema behavior | +| -------------------- | --------------- | +| `filter` | Preserves input columns. | +| `join` | Combines left and right output columns using the current join output-column contract. | +| `left_join` | Combines left and right output columns using the current left-join output-column contract. | +| `select` | Identity `select()` preserves the current planned columns; explicit assignments replace the output schema with assignment names. | +| `with_column` | Appends a missing name at the end; replaces an existing name in place while preserving ordinal position. | +| `group_by` | Produces grouped relation state; grouped output columns are finalized by `agg(...)`. | +| `agg` | Emits grouping keys plus aggregate measure outputs for grouped input, or aggregate measure outputs for global input. | +| `generate` | Preserves all input columns and appends generated output aliases. Alias collisions are rejected during planning or lowering. | +| `with_window_column` | Appends or replaces the named output column using the same add-or-replace projection semantics as `with_column(...)`. | +| `order_by` | Preserves input columns. | +| `limit` | Preserves input columns. | + +## Carrier-specific notes + +| Carrier | Notes | +| -------------- | ----- | +| `LazyFrame[T]` | Prism-backed deferred carrier. Transform methods append Prism nodes and preserve immutable branching. 
| +| `DataFrame[T]` | Materialized local carrier. Transform methods invalidate stale materialization and rebuild from the stored relation tree. | +| `DataStream[T]` | Streaming carrier surface is present in the type hierarchy; streaming-specific execution semantics remain future work. | + +## Expression inputs + +- Row-level methods that accept `ColumnExpr` use the shared scalar-expression model documented under [Filter builders](builders/filters.md), [Projection builders](builders/projections.md), and [Functions](functions/index.md). +- Aggregate methods use `AggregateMeasure` values from [Aggregate builders](builders/aggregates.md). +- `order_by(...)` accepts scalar expressions and ordering helpers such as `asc(...)`, `desc(...)`, `asc_nulls_first(...)`, and `desc_nulls_last(...)`. +- `generate(...)` accepts generator applications from [Generator and table-valued functions](functions/generators.md). +- `with_window_column(...)` accepts placed window function applications from [Window functions](functions/windows.md). ## Capability notes -- `join(...)` is constrained to same-carrier inputs and the `ColumnExpr` predicate surface shown in the signature. -- `select(...)` is the schema-changing projection boundary used by query blocks. Identity `select()` preserves the current row model through its surrounding expected type, while explicit assignments can retarget to a new row model. -- `generate(...)` preserves all input columns and appends generated output aliases for `explode`, `explode_outer`, `posexplode`, `posexplode_outer`, `inline`, `inline_outer`, `flatten`, and `stack` generator applications. Alias collisions are rejected during planning/lowering. -- `with_window_column(...)` supports placed ranking, distribution, offset, value, and aggregate-over-window helpers over explicit window specs. Portable helpers lower through Substrait window relations and execute through the DataFusion session adapter. 
-- `DataFrame[T]` exposes materialized metadata and preview text; row-level accessors belong to the materialized DataFrame API surface. -- Query-block and scoped DSL surfaces lower into these builder APIs rather than defining separate method semantics. - -## Related builder references - -- [Filter builders](builders/filters.md) -- [Aggregate builders](builders/aggregates.md) -- [Projection builders](builders/projections.md) -- [Window functions](functions/windows.md) +- `join(...)` and `left_join(...)` are constrained to same-carrier inputs and the `ColumnExpr` predicate surface shown in the signature. +- Query-block and scoped DSL surfaces lower into these carrier methods rather than defining separate method semantics. +- For task-oriented examples, see [Build deferred dataset transformations](../how-to/dataset_transformations.md). diff --git a/docs/language/reference/execution_context.md b/docs/language/reference/execution_context.md index 84c3270..9626283 100644 --- a/docs/language/reference/execution_context.md +++ b/docs/language/reference/execution_context.md @@ -106,15 +106,6 @@ Observation records do not contain row payloads or backend logs by default. 
The | `message` | `str` | Human-readable diagnostic message | | `target` | `Option[SemanticTarget]` | Semantic target associated with the diagnostic | -```incan -observed = session.collect_observed(summary) - -assert observed.observation.status == ExecutionObservationStatus.Success -match observed.data: - Some(df) => println(df.preview_text()) - None => println(observed.observation.diagnostics[0].message) -``` - ## Write surface | API | Returns | Notes | diff --git a/docs/language/reference/functions/approximate.md b/docs/language/reference/functions/approximate.md index bb95d02..3a33bb8 100644 --- a/docs/language/reference/functions/approximate.md +++ b/docs/language/reference/functions/approximate.md @@ -9,19 +9,6 @@ The portable RFC 023 aggregate surface is: | `approx_count_distinct(expr)` | Estimate the number of distinct non-null values produced by one expression. | | `approx_percentile(expr, percentile, accuracy=10000)` | Estimate one percentile over numeric non-null values. | -```incan -from pub::inql.functions import approx_count_distinct, approx_percentile, col - -summary = ( - events - .group_by([col("campaign_id")]) - .agg([ - approx_count_distinct(col("user_id")), - approx_percentile(col("latency_ms"), 0.95), - ]) -) -``` - `approx_count_distinct` is registered as an approximate aggregate with HyperLogLog-family metadata. The portable author contract is an approximate non-null distinct-count estimate. It does not expose a user-tunable relative-error parameter because the registered InQL Substrait extension mapping for this function is unary. Backend adapters must keep this approximation visible in capability/error handling rather than redefining exact `count_distinct` semantics. `approx_percentile` is registered as an approximate aggregate with t-digest-family metadata. `percentile` must be between `0.0` and `1.0` inclusive. 
`accuracy` must be positive and is carried as an explicit aggregate argument so backend capability handling can accept, emulate, or reject the requested approximation instead of silently changing semantics. Generated aggregate output names include the percentile and accuracy arguments. @@ -29,3 +16,5 @@ summary = ( Both helpers lower through registered InQL Substrait aggregate extension names. The DataFusion adapter maps `approx_count_distinct` to DataFusion's `approx_distinct` implementation and maps `approx_percentile` to `approx_percentile_cont` at the backend boundary. Sketch-state construction, merge, estimate, serialization, and deserialization are implemented by [Sketch functions](sketches.md). Those helpers use typed sketch logical values with sketch family, value domain, merge compatibility, and serialized format identity. Exposing sketch state as strings or binary payloads would violate the RFC 023 type-safety requirement. + +For task-oriented usage, see [Estimate approximate metrics](../../how-to/approximate_metrics.md). diff --git a/docs/language/reference/functions/format.md b/docs/language/reference/functions/format.md index 7f737ab..e6bd9fb 100644 --- a/docs/language/reference/functions/format.md +++ b/docs/language/reference/functions/format.md @@ -33,29 +33,10 @@ The format catalog includes deterministic hashes, URL helpers, JSON helpers, and | `from_csv[Model](expr)` | Parse a CSV row string into a logical map keyed by fields from an Incan model type. | | `to_csv(expr)` | Serialize a scalar or JSON array/object payload as a CSV row string. 
| -```incan -from pub::inql.functions import col, from_csv, from_json, get_json_object, parse_url, sha2, to_json - -model EventPayload: - type_ as "type": str - -model CsvRow: - id: int - status: str - -projected = ( - events - .with_column("user_hash", sha2(col("user_id"), 256)) - .with_column("campaign", parse_url(col("landing_page"), "utm_campaign")) - .with_column("event_type", get_json_object(col("payload"), "$.type")) - .with_column("payload_obj", from_json[EventPayload](col("payload"))) - .with_column("row_fields", from_csv[CsvRow](col("csv_line"))) - .with_column("payload_out", to_json(col("event_type"))) -) -``` - Hash helpers operate on UTF-8 string bytes and return lowercase hexadecimal strings. `sha2(...)` accepts `224`, `256`, `384`, and `512`; other digest lengths are rejected during expression construction. JSON helpers validate, normalize, and project payload text. CSV parsing returns logical map values instead of JSON text. Explicit-schema JSON and CSV helpers derive their schema from Incan model type parameters. These helpers do not read external files or return typed variant values. Use [Variant functions](variants.md) when a plan needs semi-structured kind inspection. The DataFusion adapter executes the full RFC 022 catalog with native DataFusion functions where available and Incan-authored adapter callbacks for helpers that DataFusion does not expose natively. + +For task-oriented usage, see [Normalize semi-structured fields](../../how-to/normalize_semistructured_fields.md). diff --git a/docs/language/reference/functions/generators.md b/docs/language/reference/functions/generators.md index b543c47..3bea878 100644 --- a/docs/language/reference/functions/generators.md +++ b/docs/language/reference/functions/generators.md @@ -2,22 +2,6 @@ Generators are relation-shaping operations. 
They are registry-backed like scalar and aggregate helpers, but they return `GeneratorApplication` values and must be applied through a relation method such as `generate(...)`. -```incan -from pub::inql import LazyFrame -from pub::inql.functions import array, col, explode, inline, lit, named_struct -from models import Order - -def order_lines(orders: LazyFrame[Order]) -> LazyFrame[Order]: - return orders.generate(explode(col("line_items"), "line_item")) - -def fixed_items(orders: LazyFrame[Order]) -> LazyFrame[Order]: - rows = array([ - named_struct(["sku", "quantity"], [lit("A"), lit(1)]), - named_struct(["sku", "quantity"], [lit("B"), lit(2)]), - ]) - return orders.generate(inline(rows, ["sku", "quantity"])) -``` - The explicit generator surface currently includes: | Function | Output aliases | Relation effect | @@ -34,3 +18,5 @@ The explicit generator surface currently includes: Generator applications preserve input columns and append generated columns in declaration order. Generated aliases are required, must be non-empty, and must not collide with existing input columns. Nested scalar helpers such as `array_flatten(...)` remain scalar expressions. They do not expand rows and are documented on the [nested data functions](nested.md) page. The relation-shaping `flatten(...)` helper is intentionally separate. + +For task-oriented usage, see [Expand rows with generators](../../how-to/generator_rows.md). diff --git a/docs/language/reference/functions/nested.md b/docs/language/reference/functions/nested.md index f525b9a..d9394ff 100644 --- a/docs/language/reference/functions/nested.md +++ b/docs/language/reference/functions/nested.md @@ -37,23 +37,10 @@ Generator or table-valued operations such as row-expanding `explode(...)` are se | `map_entries(map_expr)` | Return map entries. | | `named_struct(field_names, values)` | Build a struct expression with explicit field names. 
| -## Example - -```incan -from pub::inql.functions import array, array_contains, cardinality, col, element_at, lit - -projected = ( - events - .with_column("tags", array([lit("paid"), col("source")])) - .with_column("tag_count", cardinality(col("tags"))) - .with_column("has_paid_tag", array_contains(col("tags"), "paid")) - .with_column("first_tag", element_at(col("tags"), 1)) -) -``` - ## Semantics - Array indexing is one-based for `element_at(...)`, `array_position(...)`, and `array_slice(...)`. - `element_at(...)` currently maps to the portable array-element adapter path. Out-of-range behavior follows the current backend adapter's recoverable result until InQL has a richer static/runtime error-policy split for strict versus try-style element access. - `array_flatten(...)` is intentionally named to stay distinct from the relation-shaping generator `flatten(...)`. - Grouping or ordering by nested values is not documented as portable until equality and ordering semantics for arrays, maps, and structs are specified. +- For task-oriented usage, see [Work with nested row values](../../how-to/nested_row_values.md). diff --git a/docs/language/reference/functions/sketches.md b/docs/language/reference/functions/sketches.md index efc5ccd..55d2c08 100644 --- a/docs/language/reference/functions/sketches.md +++ b/docs/language/reference/functions/sketches.md @@ -13,30 +13,10 @@ Sketch helpers model approximate state as typed logical values, not as ordinary | `hll_serialize(sketch)` | Serialize typed HyperLogLog state explicitly. | | `hll_deserialize(payload, value_domain=SketchValueDomain.StringIdentifier, precision=14)` | Decode an explicit string payload value or scalar expression into typed HyperLogLog state. 
| -```incan -from pub::inql.functions import col, hll_sketch -from pub::inql.sketches import hll_estimate, hll_merge, hll_type, sketch_col - -daily = events.group_by([col("event_date")]).agg([ - hll_sketch(col("user_id"), precision=14), -]) - -literal_seed = events.group_by([col("event_date")]).agg([ - hll_sketch("anonymous-user", precision=14), -]) - -monthly = daily.group_by([col("month")]).agg([ - hll_merge(sketch_col("hll_sketch_user_id", hll_type(precision=14))), -]) - -reported = monthly.with_column( - "estimated_users", - hll_estimate(sketch_col("hll_merge_hll_sketch_user_id", hll_type(precision=14))), -) -``` - Sketch compatibility is structural. HyperLogLog sketches can merge only when family, value domain, precision, and serialization format match. `hll_deserialize(...)` requires those facts because they cannot be inferred from a payload alone. The public helper surface follows the typed value-or-column conventions used by the rest of the function catalog: `hll_sketch(...)` accepts primitive values or scalar expressions, while `hll_deserialize(...)` accepts string payload values or scalar expressions. RFC 025 helpers lower through InQL-owned Substrait extension mappings and carry sketch metadata in function options. The DataFusion adapter reports a backend planning diagnostic for typed sketch execution because it has no sketch runtime implementation. That rejection is an adapter capability boundary; the InQL plan remains typed and backend-neutral. + +For task-oriented usage, see [Build typed HyperLogLog sketches](../../how-to/typed_hll_sketches.md). diff --git a/docs/language/reference/functions/variants.md b/docs/language/reference/functions/variants.md index a729481..d88525b 100644 --- a/docs/language/reference/functions/variants.md +++ b/docs/language/reference/functions/variants.md @@ -20,21 +20,8 @@ Variant helpers model semi-structured payloads as typed logical values, not as o | `is_array(variant)` | Return whether the value is a present array. 
| | `is_object(variant)` | Return whether the value is a present object. | -```incan -from pub::inql.functions import col, is_array, is_null_value, parse_variant_json, typeof, variant_get - -payload = parse_variant_json(col("payload")) -literal_payload = parse_variant_json("{\"status\":\"paid\"}") - -projected = ( - events - .with_column("payload_kind", typeof(payload)) - .with_column("items_are_array", is_array(variant_get(payload, "$.items"))) - .with_column("dynamic_value", variant_get(literal_payload, col("json_path"))) - .with_column("deleted_was_variant_null", is_null_value(variant_get(payload, "$.deleted_at"))) -) -``` - `typeof(...)` accepts a `VariantExpr` value and returns a `StringColumnExpr`. Variant predicates accept `VariantExpr` values and return `BoolColumnExpr` values. They do not parse strings directly. Parse helpers accept `StrValueOrColumn` inputs; that keeps parsing, variant inspection, and RFC 022 JSON text helpers separate without forcing authors to wrap literal payloads in `lit(...)`. RFC 026 helpers lower through InQL-owned Substrait extension mappings and carry variant metadata in function options. The DataFusion adapter currently reports a backend planning diagnostic for typed variant execution because it has no variant runtime implementation. That rejection is an adapter capability boundary; the InQL plan remains typed and backend-neutral. + +For task-oriented usage, see [Inspect typed variant payloads](../../how-to/variant_payloads.md). diff --git a/docs/language/reference/functions/windows.md b/docs/language/reference/functions/windows.md index 600c673..e9f7d0e 100644 --- a/docs/language/reference/functions/windows.md +++ b/docs/language/reference/functions/windows.md @@ -2,24 +2,6 @@ Window helpers are relation-aware. A window function application produces one output value per input row while reading a partition of related rows. It is not an ordinary scalar expression and must be placed through a projection-like dataset method. 
-```incan -from pub::inql import LazyFrame -from pub::inql.functions import col, current_row, desc, lag, rank, sum, unbounded_preceding, window -from models import Order - -def ranked_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]: - spec = window().partition_by([col("customer_id")]).order_by([desc(col("amount"))]) - return ( - orders - .with_window_column("customer_rank", rank().over(spec)) - .with_window_column("previous_amount", lag(col("amount")).over(spec)) - .with_window_column( - "running_amount", - sum(col("amount")).over(spec.rows_between(unbounded_preceding(), current_row())), - ) - ) -``` - The window helper surface includes: | Function | Meaning | Placement | @@ -40,3 +22,5 @@ The window helper surface includes: `WindowSpec.partition_by(...)` replaces the partition expressions. `WindowSpec.order_by(...)` replaces the ordering expressions. `WindowSpec.rows_between(...)` and `WindowSpec.range_between(...)` replace the frame. Ranking, distribution, offset, and value helpers require explicit ordering; missing ordering is rejected during logical lowering. `with_window_column(name, application)` preserves input columns and adds or replaces `name` using add-or-replace projection semantics. Compatible adjacent window projections lower through Substrait `ConsistentPartitionWindowRel` with registry-backed function anchors, frame bounds, invocation metadata, null-treatment options, and output aliases. The DataFusion session backend executes the portable window helpers through the Substrait adapter boundary. + +For task-oriented usage, see [Add window columns](../../how-to/window_columns.md). 
diff --git a/docs/language/reference/inspection.md b/docs/language/reference/inspection.md index 6a2652d..1a1ad36 100644 --- a/docs/language/reference/inspection.md +++ b/docs/language/reference/inspection.md @@ -50,26 +50,10 @@ The first Prism lineage extractor records: Lineage confidence is `Exact` when the extractor can resolve a dependency to exactly one known input field, and `Conservative` when the dependency name cannot be matched or is ambiguous in the current schema. Scalar function calls preserve argument dependencies, but their transformation kind is `Unknown` unless the node kind supplies a more specific relationship such as filter, join, aggregate, generator, window, or sort. Unsupported lineage is not represented as an empty graph; unsupported evidence families are listed separately. -## Example - -```incan -from pub::inql import LazyFrame, aggregate_as, col, eq, inspect_plan, str_lit, sum -from models import Order - -def inspect_paid_spend(orders: LazyFrame[Order]) -> None: - summary = orders - .filter(eq(col("status"), str_lit("paid"))) - .group_by([col("customer_id")]) - .agg([aggregate_as(sum(col("amount")), "total_amount")]) - - inspection = inspect_plan(summary) - assert inspection.output_fields[0].name == "customer_id" - assert inspection.output_fields[1].name == "total_amount" - assert len(inspection.lineage.edges) > 0 -``` - ## Current limits Inspection is read-only and plan-local. It does not execute the plan, inspect DataFusion physical plans, read catalog metadata, emit files, or make governance decisions. The first implementation computes local Prism plan graph, schema flow, lineage graph, public version/schema metadata attachments, diagnostics shape, and unsupported-evidence markers. Session execution observations and explicit adapter coverage checks are exposed through the execution context rather than through plan inspection. 
Semantic profiles, ingress mappings, client-session context, frontend coverage, quality observations, policy checkpoints, governed bundles, and external exchange bridges remain owned by their RFCs and are not silently inferred by this API. + +For a task-oriented workflow, see [Inspect a plan and lineage graph](../how-to/inspect_plan_lineage.md). From 8af44d3b579b0f59f66ecb6baa4e2dddf95424a3 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 23:30:17 +0200 Subject: [PATCH 07/12] fix - flatten observed session result flow --- src/session/types.incn | 166 ++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 93 deletions(-) diff --git a/src/session/types.incn b/src/session/types.incn index 3aefd5a..37ec793 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -44,6 +44,7 @@ from session.active import ( ) from session.backend_dispatch import backend_collect_plan, backend_execute_plan, backend_write_plan from session.backend_types import BackendError, BackendErrorKind, BackendRegistration +from dataset.materialization import DataFrameMaterialization from prism import prism_plan_id from substrait.errors import SubstraitLoweringError, SubstraitLoweringErrorKind from substrait.schema_registry import register_named_table_schema @@ -223,7 +224,22 @@ pub class Session: plan_target = _lazy_frame_plan_target(data.clone()) # Keep lowering and validation failures anchored to the authored Prism plan target. Backend errors still attach # to the same target, but they are produced after the adapter attempt has started. 
- match _plan_from_lazy_frame(data.clone()): + execute_result = _plan_from_lazy_frame(data.clone()).and_then( + (plan) => _execute_validated_plan(self._backend, self._registrations, plan), + ) + match execute_result: + Ok(_) => + return ObservedLazyFrame( + data=Some(data), + observation=_success_observation( + plan_target, + ExecutionOperationKind.Execute, + backend_name, + clock, + None, + ), + error=None, + ) Err(err) => return ObservedLazyFrame( data=None, @@ -236,47 +252,6 @@ pub class Session: ), error=Some(err), ) - Ok(plan) => - match _validate_named_table_binding(self._registrations, plan.clone()): - Err(err) => - return ObservedLazyFrame( - data=None, - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Execute, - backend_name, - clock, - err.clone(), - ), - error=Some(err), - ) - Ok(_) => - match backend_execute_plan(self._backend, _to_backend_registrations(self._registrations), plan): - Ok(_) => - return ObservedLazyFrame( - data=Some(data), - observation=_success_observation( - plan_target, - ExecutionOperationKind.Execute, - backend_name, - clock, - None, - ), - error=None, - ) - Err(err) => - session_err = _session_error_from_backend_error(err) - return ObservedLazyFrame( - data=None, - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Execute, - backend_name, - clock, - session_err.clone(), - ), - error=Some(session_err), - ) def collect[T with Clone](self, data: LazyFrame[T]) -> Result[DataFrame[T], SessionError]: """Validate and execute one lazy plan, returning a structured materialized DataFrame.""" @@ -291,7 +266,23 @@ pub class Session: backend_name = self.backend_name() plan_target = _lazy_frame_plan_target(data.clone()) # Materialization is the first point where row counts are available, so only successful collects record them. 
- match _plan_from_lazy_frame(data): + collect_result = _plan_from_lazy_frame(data).and_then( + (plan) => _collect_validated_plan[T](self._backend, self._registrations, plan), + ) + match collect_result: + Ok(frame) => + row_count = frame.row_count() + return ObservedDataFrame( + data=Some(frame), + observation=_success_observation( + plan_target, + ExecutionOperationKind.Collect, + backend_name, + clock, + Some(row_count), + ), + error=None, + ) Err(err) => return ObservedDataFrame( data=None, @@ -304,56 +295,6 @@ pub class Session: ), error=Some(err), ) - Ok(plan) => - rel = root_rel(plan.clone()) - match _validate_named_table_binding(self._registrations, plan.clone()): - Err(err) => - return ObservedDataFrame( - data=None, - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Collect, - backend_name, - clock, - err.clone(), - ), - error=Some(err), - ) - Ok(_) => - match backend_collect_plan(self._backend, _to_backend_registrations(self._registrations), plan): - Ok(materialization) => - row_count = materialization.row_count - return ObservedDataFrame( - data=Some( - DataFrame( - _type_witness=_empty_type_witness(), - _materialization=materialization, - _substrait_rel=rel, - _planned_columns=relation_output_columns(rel.clone()), - ), - ), - observation=_success_observation( - plan_target, - ExecutionOperationKind.Collect, - backend_name, - clock, - Some(row_count), - ), - error=None, - ) - Err(err) => - session_err = _session_error_from_backend_error(err) - return ObservedDataFrame( - data=None, - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Collect, - backend_name, - clock, - session_err.clone(), - ), - error=Some(session_err), - ) def check_coverage(self, requirements: list[AdapterRequirement]) -> list[AdapterCoverageRecord]: """Return adapter coverage records for explicit semantic requirements without inventing requirements.""" @@ -822,6 +763,45 @@ def _plan_from_bounded_dataset[T with Clone](data: 
BoundedDataSet[T]) -> Result[ return data.try_to_substrait_plan().map_err(_session_error_from_lowering_error) +def _execute_validated_plan( + backend: BackendSelection, + registrations: list[SessionRegistration], + plan: Plan, +) -> Result[None, SessionError]: + """Validate bindings for one plan, then execute it through the selected backend.""" + _validate_named_table_binding(registrations, plan.clone())? + return backend_execute_plan(backend, _to_backend_registrations(registrations), plan).map_err( + _session_error_from_backend_error, + ) + + +def _collect_validated_plan[T with Clone]( + backend: BackendSelection, + registrations: list[SessionRegistration], + plan: Plan, +) -> Result[DataFrame[T], SessionError]: + """Validate bindings for one plan, then collect it into a materialized DataFrame.""" + _validate_named_table_binding(registrations, plan.clone())? + materialization = backend_collect_plan(backend, _to_backend_registrations(registrations), plan.clone()).map_err( + _session_error_from_backend_error, + )? 
+ return Ok(_data_frame_from_materialization[T](plan, materialization)) + + +def _data_frame_from_materialization[T with Clone]( + plan: Plan, + materialization: DataFrameMaterialization, +) -> DataFrame[T]: + """Build one collected DataFrame from backend materialization plus the executed logical plan.""" + rel = root_rel(plan) + return DataFrame( + _type_witness=_empty_type_witness(), + _materialization=materialization, + _substrait_rel=rel, + _planned_columns=relation_output_columns(rel.clone()), + ) + + def _invalid_registration_from_validation_error(err: ValidationError) -> SessionError: """Translate a validated input failure into a Session registration error.""" return invalid_registration(err.to_string()) From d9594ee2cb0809e2bada830ca409a080bc65f564 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 23:37:05 +0200 Subject: [PATCH 08/12] fix - avoid lazy frame clones for plan targets --- src/session/types.incn | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/session/types.incn b/src/session/types.incn index 37ec793..10983c3 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -46,6 +46,7 @@ from session.backend_dispatch import backend_collect_plan, backend_execute_plan, from session.backend_types import BackendError, BackendErrorKind, BackendRegistration from dataset.materialization import DataFrameMaterialization from prism import prism_plan_id +from prism.types import PrismStoreId from substrait.errors import SubstraitLoweringError, SubstraitLoweringErrorKind from substrait.schema_registry import register_named_table_schema from substrait.inspect import relation_output_columns, root_rel, read_kind_name, read_named_table_name @@ -221,7 +222,7 @@ pub class Session: """Execute one lazy plan and return structured observation evidence for success or failure.""" clock = _start_execution_clock() backend_name = self.backend_name() - plan_target = _lazy_frame_plan_target(data.clone()) + plan_target = 
_lazy_frame_plan_target(data._cursor.store_id, data._cursor.tip_id) # Keep lowering and validation failures anchored to the authored Prism plan target. Backend errors still attach # to the same target, but they are produced after the adapter attempt has started. execute_result = _plan_from_lazy_frame(data.clone()).and_then( @@ -264,7 +265,7 @@ pub class Session: """Collect one lazy plan and return structured observation evidence for success or failure.""" clock = _start_execution_clock() backend_name = self.backend_name() - plan_target = _lazy_frame_plan_target(data.clone()) + plan_target = _lazy_frame_plan_target(data._cursor.store_id, data._cursor.tip_id) # Materialization is the first point where row counts are available, so only successful collects record them. collect_result = _plan_from_lazy_frame(data).and_then( (plan) => _collect_validated_plan[T](self._backend, self._registrations, plan), @@ -429,9 +430,9 @@ def _start_execution_clock() -> _ExecutionClockStart: ) -def _lazy_frame_plan_target[T with Clone](data: LazyFrame[T]) -> SemanticTarget: +def _lazy_frame_plan_target(store_id: PrismStoreId, tip_id: int) -> SemanticTarget: """Build the same Prism plan target used by local inspection for a lazy frame.""" - plan_id = prism_plan_id(data._cursor.store_id, data._cursor.tip_id) + plan_id = prism_plan_id(store_id, tip_id) return SemanticTarget( kind=SemanticTargetKind.Plan, target_id=plan_id, From dcecf1f32f9f38f4a9236345484fdc33261225ab Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 23:41:56 +0200 Subject: [PATCH 09/12] fix - rely on borrow planning for observed execute --- src/session/types.incn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/session/types.incn b/src/session/types.incn index 10983c3..df1f464 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -225,7 +225,7 @@ pub class Session: plan_target = _lazy_frame_plan_target(data._cursor.store_id, data._cursor.tip_id) # Keep lowering and 
validation failures anchored to the authored Prism plan target. Backend errors still attach # to the same target, but they are produced after the adapter attempt has started. - execute_result = _plan_from_lazy_frame(data.clone()).and_then( + execute_result = _plan_from_lazy_frame(data).and_then( (plan) => _execute_validated_plan(self._backend, self._registrations, plan), ) match execute_result: From f98ac1aeec9998fd0bcf1682bb6f219e39d18928 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 3 Jul 2026 23:46:49 +0200 Subject: [PATCH 10/12] fix - flatten observed write result flow --- src/session/types.incn | 67 +++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/src/session/types.incn b/src/session/types.incn index df1f464..73e127c 100644 --- a/src/session/types.incn +++ b/src/session/types.incn @@ -351,7 +351,19 @@ pub class Session: """Run one validated write and preserve execution evidence for the write path.""" backend_name = self.backend_name() plan_target = _substrait_plan_target(plan.clone()) - match _sink_uri_from_text(target.uri): + write_result = _write_validated_plan(self._backend, self._registrations, plan, target) + match write_result: + Ok(_) => + return ObservedWrite( + observation=_success_observation( + plan_target, + ExecutionOperationKind.Write, + backend_name, + clock, + None, + ), + error=None, + ) Err(err) => return ObservedWrite( observation=_failure_observation( @@ -363,45 +375,6 @@ pub class Session: ), error=Some(err), ) - Ok(sink_uri) => - match _validate_named_table_binding(self._registrations, plan.clone()): - Err(err) => - return ObservedWrite( - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Write, - backend_name, - clock, - err.clone(), - ), - error=Some(err), - ) - Ok(_) => - registrations = _to_backend_registrations(self._registrations) - match backend_write_plan(self._backend, registrations, plan, sink_uri.0, target.sink_kind): - Ok(_) => - return 
ObservedWrite( - observation=_success_observation( - plan_target, - ExecutionOperationKind.Write, - backend_name, - clock, - None, - ), - error=None, - ) - Err(err) => - session_err = _session_error_from_backend_error(err) - return ObservedWrite( - observation=_failure_observation( - plan_target, - ExecutionOperationKind.Write, - backend_name, - clock, - session_err.clone(), - ), - error=Some(session_err), - ) pub class SessionBuilder: @@ -789,6 +762,20 @@ def _collect_validated_plan[T with Clone]( return Ok(_data_frame_from_materialization[T](plan, materialization)) +def _write_validated_plan( + backend: BackendSelection, + registrations: list[SessionRegistration], + plan: Plan, + target: SinkTarget, +) -> Result[None, SessionError]: + """Validate sink and table bindings, then write one plan through the selected backend.""" + sink_uri = _sink_uri_from_text(target.uri)? + _validate_named_table_binding(registrations, plan.clone())? + return backend_write_plan(backend, _to_backend_registrations(registrations), plan, sink_uri.0, target.sink_kind).map_err( + _session_error_from_backend_error, + ) + + def _data_frame_from_materialization[T with Clone]( plan: Plan, materialization: DataFrameMaterialization, From 80689836ffa2062080007880b5ca6126ac9fe2c0 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Sat, 4 Jul 2026 00:16:07 +0200 Subject: [PATCH 11/12] fix - flatten datafusion backend result flow --- src/session/datafusion_backend.incn | 169 +++++++++++++++------------- 1 file changed, 93 insertions(+), 76 deletions(-) diff --git a/src/session/datafusion_backend.incn b/src/session/datafusion_backend.incn index 49159fe..f380b7c 100644 --- a/src/session/datafusion_backend.incn +++ b/src/session/datafusion_backend.incn @@ -43,6 +43,7 @@ from rust::datafusion::execution::options import ArrowReadOptions from rust::datafusion::logical_expr import ( Cast as DataFusionCast, Expr as DataFusionExpr, + LogicalPlan, LogicalPlanBuilder, Operator as DataFusionOperator, WindowFrame 
as DataFusionWindowFrame, @@ -176,9 +177,8 @@ pub async def datafusion_execute_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? - match await df.collect(): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) + await _collect_dataframe_batches(df)? + return Ok(None) pub async def datafusion_collect_materialization_async( @@ -193,22 +193,14 @@ pub async def datafusion_collect_materialization_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? - match await df.clone().collect(): - Ok(batches) => match await df.to_string(): - Ok(rendered) => - mut row_count = 0 - for batch in batches: - row_count += _rust_usize_to_int(batch.num_rows())? - return Ok( - DataFrameMaterialization( - resolved_columns=resolved_columns, - row_count=row_count, - preview_text=rendered, - ), - ) - Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) - - Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) + batches = await _collect_dataframe_batches(df.clone())? + rendered = (await df.to_string()).map_err( + (err) => backend_error(BackendErrorKind.BackendExecutionError, err.to_string()), + )? + mut row_count = 0 + for batch in batches: + row_count += _rust_usize_to_int(batch.num_rows())? + return Ok(DataFrameMaterialization(resolved_columns=resolved_columns, row_count=row_count, preview_text=rendered)) pub async def datafusion_write_csv_async( @@ -223,9 +215,10 @@ pub async def datafusion_write_csv_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? 
- match await df.write_csv(uri, DataFrameWriteOptions.new(), None): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendSinkError, err.to_string())) + (await df.write_csv(uri, DataFrameWriteOptions.new(), None)).map_err( + (err) => backend_error(BackendErrorKind.BackendSinkError, err.to_string()), + )? + return Ok(None) pub async def datafusion_write_parquet_async( @@ -240,9 +233,10 @@ pub async def datafusion_write_parquet_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? - match await df.write_parquet(uri, DataFrameWriteOptions.new(), None): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendSinkError, err.to_string())) + (await df.write_parquet(uri, DataFrameWriteOptions.new(), None)).map_err( + (err) => backend_error(BackendErrorKind.BackendSinkError, err.to_string()), + )? + return Ok(None) async def _dataframe_from_plan(ctx: SessionContext, plan: Plan) -> Result[RustDataFrame, BackendError]: @@ -354,12 +348,12 @@ async def _dataframe_from_standard_plan(ctx: SessionContext, plan: Plan) -> Resu consumer_plan = _consumer_plan_from_current_plan(plan)? state = ctx.state() - match await from_substrait_plan(state, consumer_plan): - Ok(logical_plan) => match await ctx.execute_logical_plan(logical_plan): - Ok(df) => return Ok(df) - Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) - - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + logical_plan = (await from_substrait_plan(state, consumer_plan)).map_err( + (err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()), + )? 
+ return (await ctx.execute_logical_plan(logical_plan)).map_err( + (err) => backend_error(BackendErrorKind.BackendExecutionError, err.to_string()), + ) async def _dataframe_from_window_rel( @@ -1054,24 +1048,24 @@ async def _register_materialized_dataframe( df: RustDataFrame, ) -> Result[None, BackendError]: """Collect one bridged DataFrame into a MemTable so Substrait reads see concrete table scans.""" - match await df.clone().collect(): - Ok(batches) => - if len(batches) == 0: - # Empty generator output still needs a registered relation for the rewritten temp ReadRel to resolve. - match ctx.register_table(f"{table_name}", df.into_view()): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) - _register_materialized_schema_from_batch(f"{table_name}", batches[0]) - schema = batches[0].schema() - # Non-empty outputs are frozen into a MemTable so the stock consumer reads a concrete table instead of an - # InQL-specific generator extension relation. - match MemTable.try_new(schema, [batches]): - Ok(table) => - match ctx.register_table(f"{table_name}", Arc.new(table)): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) - Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) + batches = await _collect_dataframe_batches(df.clone())? + if len(batches) == 0: + # Empty generator output still needs a registered relation for the rewritten temp ReadRel to resolve. + ctx + .register_table(f"{table_name}", df.into_view()) + .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? 
+ return Ok(None) + _register_materialized_schema_from_batch(f"{table_name}", batches[0]) + schema = batches[0].schema() + # Non-empty outputs are frozen into a MemTable so the stock consumer reads a concrete table instead of an + # InQL-specific generator extension relation. + table = MemTable + .try_new(schema, [batches]) + .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? + ctx + .register_table(f"{table_name}", Arc.new(table)) + .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? + return Ok(None) def _register_materialized_schema_from_batch(table_name: str, batch: RecordBatch) -> None: @@ -1592,9 +1586,9 @@ def _decode_generator_payload(extension: ExtensionSingleRel) -> Result[Generator """Decode one InQL generator extension payload.""" match extension.detail: Some(detail) => - match decode_generator_extension_payload(detail.value): - Ok(payload) => return Ok(payload) - Err(message) => return Err(backend_error(BackendErrorKind.BackendPlanningError, message)) + return decode_generator_extension_payload(detail.value).map_err( + (message) => backend_error(BackendErrorKind.BackendPlanningError, message), + ) None => return Err(backend_error(BackendErrorKind.BackendPlanningError, "generator extension is missing detail")) @@ -1631,11 +1625,33 @@ def _unnest_columns(df: RustDataFrame, columns: list[str], preserve_nulls: bool) parts = df.into_parts() state = parts.0 logical_plan = parts.1 + builder = _unnest_plan_builder(logical_plan, datafusion_columns, options)? + next_plan = _build_logical_plan(builder)? 
+ return Ok(RustDataFrame.new(state, next_plan)) + + +async def _collect_dataframe_batches(df: RustDataFrame) -> Result[list[RecordBatch], BackendError]: + """Collect one DataFusion frame and preserve InQL's backend execution error envelope.""" + match await df.collect(): + Ok(batches) => return Ok(batches) + Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) + + +def _unnest_plan_builder( + logical_plan: LogicalPlan, + datafusion_columns: list[Column], + options: UnnestOptions, +) -> Result[LogicalPlanBuilder, BackendError]: + """Create a DataFusion unnest builder while preserving InQL's planning error envelope.""" match LogicalPlanBuilder.from(logical_plan).unnest_columns_with_options(datafusion_columns, options): - Ok(builder) => - match builder.build(): - Ok(next_plan) => return Ok(RustDataFrame.new(state, next_plan)) - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + Ok(builder) => return Ok(builder) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + + +def _build_logical_plan(builder: LogicalPlanBuilder) -> Result[LogicalPlan, BackendError]: + """Build a DataFusion logical plan while preserving InQL's planning error envelope.""" + match builder.build(): + Ok(plan) => return Ok(plan) Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) @@ -1649,10 +1665,9 @@ def _rename_flat_generator_outputs( return Err(backend_error(BackendErrorKind.BackendPlanningError, "generator payload/output arity mismatch")) mut current = df for idx, output_column in enumerate(output_columns): - match current.with_column_renamed(temp_columns[idx], output_column): - Ok(next_df) => - current = next_df - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + current = current + .with_column_renamed(temp_columns[idx], output_column) + .map_err((err) => 
backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? return Ok(current) @@ -1667,10 +1682,9 @@ def _rename_struct_generator_outputs( # generator output aliases. for output_column in output_columns: field_name = f"{temp_column}.{output_column}" - match current.with_column_renamed(field_name, output_column): - Ok(next_df) => - current = next_df - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + current = current + .with_column_renamed(field_name, output_column) + .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? return Ok(current) @@ -1712,19 +1726,22 @@ async def _register_one(ctx: SessionContext, logical_name: str, source: TableSou match datafusion_registration_for_source(source): DataFusionSourceRegistration.Csv => csv_opts = CsvReadOptions.new().has_header(true) - match await ctx.register_csv(logical_name, source.uri, csv_opts): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) + (await ctx.register_csv(logical_name, source.uri, csv_opts)).map_err( + (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), + )? + return Ok(None) DataFusionSourceRegistration.Parquet => parquet_opts = ParquetReadOptions.default() - match await ctx.register_parquet(logical_name, source.uri, parquet_opts): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) + (await ctx.register_parquet(logical_name, source.uri, parquet_opts)).map_err( + (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), + )? 
+ return Ok(None) DataFusionSourceRegistration.Arrow => arrow_opts = ArrowReadOptions.default() - match await ctx.register_arrow(logical_name, source.uri, arrow_opts): - Ok(_) => return Ok(None) - Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) + (await ctx.register_arrow(logical_name, source.uri, arrow_opts)).map_err( + (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), + )? + return Ok(None) def datafusion_registration_for_source(source: TableSource) -> DataFusionSourceRegistration: @@ -1738,9 +1755,9 @@ def datafusion_registration_for_source(source: TableSource) -> DataFusionSourceR def _consumer_plan_from_current_plan(plan: Plan) -> Result[ConsumerPlan, BackendError]: """Decode the producer-side plan bytes into DataFusion's consumer Plan type.""" encoded = _datafusion_producer_plan(plan).encode_to_vec() - match ConsumerPlan.decode(encoded.as_slice()): - Ok(decoded) => return Ok(decoded) - Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + return ConsumerPlan + .decode(encoded.as_slice()) + .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) def _datafusion_producer_plan(plan: Plan) -> Plan: From 83dd8ac0dcb48fc4ff7b51444d846dbcff8a33ae Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Sat, 4 Jul 2026 02:17:14 +0200 Subject: [PATCH 12/12] fix - make datafusion result boundaries metadata safe --- src/session/datafusion_backend.incn | 145 ++++++++++++++++++---------- 1 file changed, 92 insertions(+), 53 deletions(-) diff --git a/src/session/datafusion_backend.incn b/src/session/datafusion_backend.incn index f380b7c..768db03 100644 --- a/src/session/datafusion_backend.incn +++ b/src/session/datafusion_backend.incn @@ -194,9 +194,7 @@ pub async def datafusion_collect_materialization_async( df = await _dataframe_from_plan(ctx, plan)? batches = await _collect_dataframe_batches(df.clone())? 
- rendered = (await df.to_string()).map_err( - (err) => backend_error(BackendErrorKind.BackendExecutionError, err.to_string()), - )? + rendered = await _render_dataframe_preview(df)? mut row_count = 0 for batch in batches: row_count += _rust_usize_to_int(batch.num_rows())? @@ -215,10 +213,7 @@ pub async def datafusion_write_csv_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? - (await df.write_csv(uri, DataFrameWriteOptions.new(), None)).map_err( - (err) => backend_error(BackendErrorKind.BackendSinkError, err.to_string()), - )? - return Ok(None) + return await _write_dataframe_csv(df, uri) pub async def datafusion_write_parquet_async( @@ -233,10 +228,7 @@ pub async def datafusion_write_parquet_async( await _register_sources(ctx, registrations)? df = await _dataframe_from_plan(ctx, plan)? - (await df.write_parquet(uri, DataFrameWriteOptions.new(), None)).map_err( - (err) => backend_error(BackendErrorKind.BackendSinkError, err.to_string()), - )? - return Ok(None) + return await _write_dataframe_parquet(df, uri) async def _dataframe_from_plan(ctx: SessionContext, plan: Plan) -> Result[RustDataFrame, BackendError]: @@ -347,13 +339,24 @@ async def _dataframe_from_standard_plan(ctx: SessionContext, plan: Plan) -> Resu None => pass consumer_plan = _consumer_plan_from_current_plan(plan)? - state = ctx.state() - logical_plan = (await from_substrait_plan(state, consumer_plan)).map_err( - (err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()), - )? 
- return (await ctx.execute_logical_plan(logical_plan)).map_err( - (err) => backend_error(BackendErrorKind.BackendExecutionError, err.to_string()), - ) + return await _dataframe_from_consumer_plan(ctx, consumer_plan) + + +async def _dataframe_from_consumer_plan( + ctx: SessionContext, + consumer_plan: ConsumerPlan, +) -> Result[RustDataFrame, BackendError]: + """Build a DataFusion frame from a consumer plan without leaking adapter errors into callers.""" + match await from_substrait_plan(ctx.state(), consumer_plan): + Ok(logical_plan) => return await _execute_logical_plan(ctx, logical_plan) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + + +async def _execute_logical_plan(ctx: SessionContext, logical_plan: LogicalPlan) -> Result[RustDataFrame, BackendError]: + """Execute one DataFusion logical plan while preserving InQL's backend error envelope.""" + match await ctx.execute_logical_plan(logical_plan): + Ok(df) => return Ok(df) + Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) async def _dataframe_from_window_rel( @@ -1051,21 +1054,32 @@ async def _register_materialized_dataframe( batches = await _collect_dataframe_batches(df.clone())? if len(batches) == 0: # Empty generator output still needs a registered relation for the rewritten temp ReadRel to resolve. - ctx - .register_table(f"{table_name}", df.into_view()) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? - return Ok(None) + return _register_materialized_view(ctx, table_name, df) _register_materialized_schema_from_batch(f"{table_name}", batches[0]) schema = batches[0].schema() # Non-empty outputs are frozen into a MemTable so the stock consumer reads a concrete table instead of an # InQL-specific generator extension relation. - table = MemTable - .try_new(schema, [batches]) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? 
- ctx - .register_table(f"{table_name}", Arc.new(table)) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? - return Ok(None) + match MemTable.try_new(schema, [batches]): + Ok(table) => return _register_materialized_memtable(ctx, table_name, table) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + + +def _register_materialized_view(ctx: SessionContext, table_name: str, df: RustDataFrame) -> Result[None, BackendError]: + """Register an empty materialized relation through DataFusion's view provider.""" + match ctx.register_table(f"{table_name}", df.into_view()): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + + +def _register_materialized_memtable( + ctx: SessionContext, + table_name: str, + table: MemTable, +) -> Result[None, BackendError]: + """Register a non-empty materialized relation through a concrete DataFusion MemTable.""" + match ctx.register_table(f"{table_name}", Arc.new(table)): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) def _register_materialized_schema_from_batch(table_name: str, batch: RecordBatch) -> None: @@ -1586,9 +1600,9 @@ def _decode_generator_payload(extension: ExtensionSingleRel) -> Result[Generator """Decode one InQL generator extension payload.""" match extension.detail: Some(detail) => - return decode_generator_extension_payload(detail.value).map_err( - (message) => backend_error(BackendErrorKind.BackendPlanningError, message), - ) + match decode_generator_extension_payload(detail.value): + Ok(payload) => return Ok(payload) + Err(message) => return Err(backend_error(BackendErrorKind.BackendPlanningError, message)) None => return Err(backend_error(BackendErrorKind.BackendPlanningError, "generator extension is missing detail")) @@ -1637,6 +1651,27 @@ async def _collect_dataframe_batches(df: 
RustDataFrame) -> Result[list[RecordBat Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) +async def _render_dataframe_preview(df: RustDataFrame) -> Result[str, BackendError]: + """Render one DataFusion preview table while preserving InQL's backend error envelope.""" + match await df.to_string(): + Ok(rendered) => return Ok(rendered) + Err(err) => return Err(backend_error(BackendErrorKind.BackendExecutionError, err.to_string())) + + +async def _write_dataframe_csv(df: RustDataFrame, uri: str) -> Result[None, BackendError]: + """Write one DataFusion frame to CSV while preserving InQL's sink error envelope.""" + match await df.write_csv(uri, DataFrameWriteOptions.new(), None): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendSinkError, err.to_string())) + + +async def _write_dataframe_parquet(df: RustDataFrame, uri: str) -> Result[None, BackendError]: + """Write one DataFusion frame to Parquet while preserving InQL's sink error envelope.""" + match await df.write_parquet(uri, DataFrameWriteOptions.new(), None): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendSinkError, err.to_string())) + + def _unnest_plan_builder( logical_plan: LogicalPlan, datafusion_columns: list[Column], @@ -1655,6 +1690,17 @@ def _build_logical_plan(builder: LogicalPlanBuilder) -> Result[LogicalPlan, Back Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) +def _rename_generator_output_column( + df: RustDataFrame, + source_name: str, + output_name: str, +) -> Result[RustDataFrame, BackendError]: + """Rename one generated DataFusion output column while preserving InQL's planning error envelope.""" + match df.with_column_renamed(source_name, output_name): + Ok(next_df) => return Ok(next_df) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + + def _rename_flat_generator_outputs( df: 
RustDataFrame, temp_columns: list[str], @@ -1665,9 +1711,7 @@ def _rename_flat_generator_outputs( return Err(backend_error(BackendErrorKind.BackendPlanningError, "generator payload/output arity mismatch")) mut current = df for idx, output_column in enumerate(output_columns): - current = current - .with_column_renamed(temp_columns[idx], output_column) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? + current = _rename_generator_output_column(current, temp_columns[idx], output_column)? return Ok(current) @@ -1682,9 +1726,7 @@ def _rename_struct_generator_outputs( # generator output aliases. for output_column in output_columns: field_name = f"{temp_column}.{output_column}" - current = current - .with_column_renamed(field_name, output_column) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string()))? + current = _rename_generator_output_column(current, field_name, output_column)? return Ok(current) @@ -1726,22 +1768,19 @@ async def _register_one(ctx: SessionContext, logical_name: str, source: TableSou match datafusion_registration_for_source(source): DataFusionSourceRegistration.Csv => csv_opts = CsvReadOptions.new().has_header(true) - (await ctx.register_csv(logical_name, source.uri, csv_opts)).map_err( - (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), - )? - return Ok(None) + match await ctx.register_csv(logical_name, source.uri, csv_opts): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) DataFusionSourceRegistration.Parquet => parquet_opts = ParquetReadOptions.default() - (await ctx.register_parquet(logical_name, source.uri, parquet_opts)).map_err( - (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), - )? 
- return Ok(None) + match await ctx.register_parquet(logical_name, source.uri, parquet_opts): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) DataFusionSourceRegistration.Arrow => arrow_opts = ArrowReadOptions.default() - (await ctx.register_arrow(logical_name, source.uri, arrow_opts)).map_err( - (err) => backend_error(BackendErrorKind.BackendRegistrationError, err.to_string()), - )? - return Ok(None) + match await ctx.register_arrow(logical_name, source.uri, arrow_opts): + Ok(_) => return Ok(None) + Err(err) => return Err(backend_error(BackendErrorKind.BackendRegistrationError, err.to_string())) def datafusion_registration_for_source(source: TableSource) -> DataFusionSourceRegistration: @@ -1755,9 +1794,9 @@ def datafusion_registration_for_source(source: TableSource) -> DataFusionSourceR def _consumer_plan_from_current_plan(plan: Plan) -> Result[ConsumerPlan, BackendError]: """Decode the producer-side plan bytes into DataFusion's consumer Plan type.""" encoded = _datafusion_producer_plan(plan).encode_to_vec() - return ConsumerPlan - .decode(encoded.as_slice()) - .map_err((err) => backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) + match ConsumerPlan.decode(encoded.as_slice()): + Ok(decoded) => return Ok(decoded) + Err(err) => return Err(backend_error(BackendErrorKind.BackendPlanningError, err.to_string())) def _datafusion_producer_plan(plan: Plan) -> Plan: