vectorize-io · cdbartholomew · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -3442,6 +3442,8 @@ async def retain_batch_async(
                 llm_input_tokens=total_usage.input_tokens,
                 llm_output_tokens=total_usage.output_tokens,
                 llm_total_tokens=total_usage.total_tokens,
+                llm_cached_input_tokens=getattr(total_usage, "cached_tokens", 0) or 0,
+                llm_thoughts_tokens=getattr(total_usage, "thoughts_tokens", 0) or 0,
                 processed_content_tokens=total_processed_content_tokens,
             )
             try:

diff --git a/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py
@@ -424,6 +424,7 @@ def _build_generation_config(use_cache: bool) -> "genai_types.GenerateContentCon
                         output_tokens=output_tokens,
                         total_tokens=input_tokens + output_tokens,
                         cached_tokens=cached_tokens,
+                        thoughts_tokens=thoughts_tokens,
                     )
                     return result, token_usage
                 return result
@@ -762,6 +763,8 @@ def _build_tools_config(use_cache: bool) -> "genai_types.GenerateContentConfig":
                     finish_reason=finish_reason,
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
+                    cached_tokens=cached_input_tokens,
+                    thoughts_tokens=thoughts_tokens,
                 )
 
             except genai_errors.APIError as e:

diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -141,7 +141,7 @@ async def _generate_structured_output(
     response_schema: dict,
     llm_config: "LLMProvider",
     reflect_id: str,
-) -> tuple[dict[str, Any] | None, int, int]:
+) -> tuple[dict[str, Any] | None, int, int, int, int]:
     """Generate structured output from an answer using the provided JSON schema.
 
     Args:
@@ -151,7 +151,7 @@ async def _generate_structured_output(
         reflect_id: Reflect ID for logging
 
     Returns:
-        Tuple of (structured_output, input_tokens, output_tokens).
+        Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens).
         structured_output is None if generation fails.
     """
     try:
@@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
 
         if not fields:
             logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output")
-            return None, 0, 0
+            return None, 0, 0, 0, 0
 
         DynamicModel = create_model("StructuredResponse", **fields)
 
@@ -259,7 +259,13 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
                 logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output")
 
         logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields")
-        return structured_output, usage.input_tokens, usage.output_tokens
+        return (
+            structured_output,
+            usage.input_tokens,
+            usage.output_tokens,
+            getattr(usage, "cached_tokens", 0) or 0,
+            getattr(usage, "thoughts_tokens", 0) or 0,
+        )
 
     except Exception as e:
         logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}")
@@ -435,9 +441,14 @@ async def run_reflect_agent(
     llm_trace: list[dict[str, Any]] = []
     context_history: list[dict[str, Any]] = []  # For final prompt fallback
 
-    # Token usage tracking - accumulate across all LLM calls
+    # Token usage tracking - accumulate across all LLM calls.
+    # cached_tokens and thoughts_tokens are surfaced for cost attribution
+    # and prompt-cache tuning. cached_tokens is a subset of input_tokens;
+    # thoughts_tokens is tracked separately and is NOT added to total_tokens.
     total_input_tokens = 0
     total_output_tokens = 0
+    total_cached_tokens = 0
+    total_thoughts_tokens = 0
 
     # Track available IDs for validation (prevents hallucinated citations)
     available_memory_ids: set[str] = set()
@@ -460,6 +471,8 @@ def _get_usage() -> TokenUsageSummary:
             input_tokens=total_input_tokens,
             output_tokens=total_output_tokens,
             total_tokens=total_input_tokens + total_output_tokens,
+            cached_tokens=total_cached_tokens,
+            thoughts_tokens=total_thoughts_tokens,
         )
 
     def _log_completion(answer: str, iterations: int, forced: bool = False):
@@ -526,6 +539,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -539,11 +554,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -588,6 +609,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -600,11 +623,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
 
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -661,6 +690,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             consecutive_errors = 0
             total_input_tokens += result.input_tokens
             total_output_tokens += result.output_tokens
+            total_cached_tokens += getattr(result, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(result, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": f"agent_{iteration + 1}",
@@ -709,6 +740,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -722,11 +755,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -783,6 +822,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                     )
                     total_input_tokens += rewrite_usage.input_tokens
                     total_output_tokens += rewrite_usage.output_tokens
+                    total_cached_tokens += getattr(rewrite_usage, "cached_tokens", 0) or 0
+                    total_thoughts_tokens += getattr(rewrite_usage, "thoughts_tokens", 0) or 0
                     llm_trace.append(
                         {
                             "scope": "final_rewrite",
@@ -796,11 +837,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                 # Generate structured output if schema provided
                 structured_output = None
                 if response_schema and answer:
-                    structured_output, struct_in, struct_out = await _generate_structured_output(
-                        answer, response_schema, llm_config, reflect_id
-                    )
+                    (
+                        structured_output,
+                        struct_in,
+                        struct_out,
+                        struct_cached,
+                        struct_thoughts,
+                    ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                     total_input_tokens += struct_in
                     total_output_tokens += struct_out
+                    total_cached_tokens += struct_cached
+                    total_thoughts_tokens += struct_thoughts
 
                 _log_completion(answer, iteration + 1)
                 return ReflectAgentResult(
@@ -835,6 +882,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -848,11 +897,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -1147,14 +1202,16 @@ async def _process_done_tool(
     structured_output = None
     final_usage = usage
     if response_schema and llm_config and answer:
-        structured_output, struct_in, struct_out = await _generate_structured_output(
+        structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
             answer, response_schema, llm_config, reflect_id
         )
         # Add structured output tokens to usage
         final_usage = TokenUsageSummary(
             input_tokens=usage.input_tokens + struct_in,
             output_tokens=usage.output_tokens + struct_out,
             total_tokens=usage.total_tokens + struct_in + struct_out,
+            cached_tokens=usage.cached_tokens + struct_cached,
+            thoughts_tokens=usage.thoughts_tokens + struct_thoughts,
         )
 
     log_completion(answer, iterations)

diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
@@ -78,9 +78,20 @@ class DirectiveInfo(BaseModel):
 class TokenUsageSummary(BaseModel):
     """Total token usage across all LLM calls."""
 
-    input_tokens: int = Field(default=0, description="Total input tokens used")
-    output_tokens: int = Field(default=0, description="Total output tokens used")
-    total_tokens: int = Field(default=0, description="Total tokens (input + output)")
+    input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)")
+    output_tokens: int = Field(default=0, description="Total visible output tokens used (excludes reasoning/thoughts)")
+    total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
+    cached_tokens: int = Field(
+        default=0,
+        description="Cached/cache-read prompt tokens summed across calls. Subset of input_tokens.",
+    )
+    thoughts_tokens: int = Field(
+        default=0,
+        description=(
+            "Reasoning/thinking tokens summed across calls. Billed at the output rate by some providers "
+            "but not part of visible output."
+        ),
+    )
 
 
 class ReflectAgentResult(BaseModel):

diff --git a/hindsight-api-slim/hindsight_api/engine/response_models.py b/hindsight-api-slim/hindsight_api/engine/response_models.py
@@ -31,8 +31,20 @@ class LLMToolCallResult(BaseModel):
     content: str | None = Field(default=None, description="Text content if any")
     tool_calls: list[LLMToolCall] = Field(default_factory=list, description="Tool calls requested by the LLM")
     finish_reason: str | None = Field(default=None, description="Reason the LLM stopped: 'stop', 'tool_calls', etc.")
-    input_tokens: int = Field(default=0, description="Input tokens used in this call")
-    output_tokens: int = Field(default=0, description="Output tokens used in this call")
+    input_tokens: int = Field(
+        default=0,
+        description="Input tokens used in this call (includes any cached prefix tokens reported by the provider)",
+    )
+    output_tokens: int = Field(
+        default=0, description="Visible output tokens used in this call (excludes reasoning/thoughts)"
+    )
+    cached_tokens: int = Field(
+        default=0, description="Cached prefix tokens, when reported by the provider. Subset of input_tokens."
+    )
+    thoughts_tokens: int = Field(
+        default=0,
+        description="Reasoning/thinking tokens. Billed at the output rate by some providers but not part of visible output.",
+    )
 
 
 class ToolCallTrace(BaseModel):
@@ -91,9 +103,18 @@ class TokenUsage(BaseModel):
     )
 
     input_tokens: int = Field(default=0, description="Number of input/prompt tokens consumed")
-    output_tokens: int = Field(default=0, description="Number of output/completion tokens generated")
-    total_tokens: int = Field(default=0, description="Total tokens (input + output)")
+    output_tokens: int = Field(
+        default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)"
+    )
+    total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
     cached_tokens: int = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider")
+    thoughts_tokens: int = Field(
+        default=0,
+        description=(
+            "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers "
+            "(e.g. Gemini 2.5+ family) but not surfaced in the visible response."
+        ),
+    )
 
     def __add__(self, other: "TokenUsage") -> "TokenUsage":
         """Allow aggregating token usage from multiple calls."""
@@ -102,6 +123,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage":
             output_tokens=self.output_tokens + other.output_tokens,
             total_tokens=self.total_tokens + other.total_tokens,
             cached_tokens=self.cached_tokens + other.cached_tokens,
+            thoughts_tokens=self.thoughts_tokens + other.thoughts_tokens,
         )
 
 

diff --git a/hindsight-api-slim/hindsight_api/extensions/operation_validator.py b/hindsight-api-slim/hindsight_api/extensions/operation_validator.py
@@ -208,6 +208,16 @@ class RetainResult:
     llm_input_tokens: int | None = None
     llm_output_tokens: int | None = None
     llm_total_tokens: int | None = None
+    # Diagnostic token splits surfaced for cost attribution and prompt-cache
+    # tuning. ``llm_cached_input_tokens`` is the subset of llm_input_tokens
+    # served from the provider's prompt cache (e.g. Gemini's
+    # cached_content_token_count). ``llm_thoughts_tokens`` counts reasoning
+    # tokens that some providers (Gemini 2.5+) bill at the output rate but that
+    # are not part of the visible response. Both default to None when the
+    # engine/provider didn't report them; downstream metering extensions
+    # should treat None as 0.
+    llm_cached_input_tokens: int | None = None
+    llm_thoughts_tokens: int | None = None
     # Content tokens the retain pipeline actually processed, after
     # chunk-level content-hash deduplication. Semantics:
     #   None — no dedup signal available (e.g. a first-time retain or a