Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions hindsight-api-slim/hindsight_api/engine/memory_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3442,6 +3442,8 @@ async def retain_batch_async(
llm_input_tokens=total_usage.input_tokens,
llm_output_tokens=total_usage.output_tokens,
llm_total_tokens=total_usage.total_tokens,
llm_cached_input_tokens=getattr(total_usage, "cached_tokens", 0) or 0,
llm_thoughts_tokens=getattr(total_usage, "thoughts_tokens", 0) or 0,
processed_content_tokens=total_processed_content_tokens,
)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ def _build_generation_config(use_cache: bool) -> "genai_types.GenerateContentCon
output_tokens=output_tokens,
total_tokens=input_tokens + output_tokens,
cached_tokens=cached_tokens,
thoughts_tokens=thoughts_tokens,
)
return result, token_usage
return result
Expand Down Expand Up @@ -762,6 +763,8 @@ def _build_tools_config(use_cache: bool) -> "genai_types.GenerateContentConfig":
finish_reason=finish_reason,
input_tokens=input_tokens,
output_tokens=output_tokens,
cached_tokens=cached_input_tokens,
thoughts_tokens=thoughts_tokens,
)

except genai_errors.APIError as e:
Expand Down
99 changes: 78 additions & 21 deletions hindsight-api-slim/hindsight_api/engine/reflect/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ async def _generate_structured_output(
response_schema: dict,
llm_config: "LLMProvider",
reflect_id: str,
) -> tuple[dict[str, Any] | None, int, int]:
) -> tuple[dict[str, Any] | None, int, int, int, int]:
"""Generate structured output from an answer using the provided JSON schema.

Args:
Expand All @@ -151,7 +151,7 @@ async def _generate_structured_output(
reflect_id: Reflect ID for logging

Returns:
Tuple of (structured_output, input_tokens, output_tokens).
Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens).
structured_output is None if generation fails.
"""
try:
Expand Down Expand Up @@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type:

if not fields:
logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output")
return None, 0, 0
return None, 0, 0, 0, 0

DynamicModel = create_model("StructuredResponse", **fields)

Expand Down Expand Up @@ -259,7 +259,13 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output")

logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields")
return structured_output, usage.input_tokens, usage.output_tokens
return (
structured_output,
usage.input_tokens,
usage.output_tokens,
getattr(usage, "cached_tokens", 0) or 0,
getattr(usage, "thoughts_tokens", 0) or 0,
)

except Exception as e:
logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}")
Expand Down Expand Up @@ -435,9 +441,14 @@ async def run_reflect_agent(
llm_trace: list[dict[str, Any]] = []
context_history: list[dict[str, Any]] = [] # For final prompt fallback

# Token usage tracking - accumulate across all LLM calls
# Token usage tracking - accumulate across all LLM calls.
# cached_tokens and thoughts_tokens are surfaced for cost attribution
# and prompt-cache tuning. cached_tokens is a subset of input_tokens;
# thoughts_tokens is tracked separately from the visible output_tokens.
# Neither is added into total_tokens, which stays input + output.
total_input_tokens = 0
total_output_tokens = 0
total_cached_tokens = 0
total_thoughts_tokens = 0

# Track available IDs for validation (prevents hallucinated citations)
available_memory_ids: set[str] = set()
Expand All @@ -460,6 +471,8 @@ def _get_usage() -> TokenUsageSummary:
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
total_tokens=total_input_tokens + total_output_tokens,
cached_tokens=total_cached_tokens,
thoughts_tokens=total_thoughts_tokens,
)

def _log_completion(answer: str, iterations: int, forced: bool = False):
Expand Down Expand Up @@ -526,6 +539,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
llm_duration = int((time.time() - llm_start) * 1000)
total_input_tokens += usage.input_tokens
total_output_tokens += usage.output_tokens
total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": "final",
Expand All @@ -539,11 +554,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
# Generate structured output if schema provided
structured_output = None
if response_schema and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
(
structured_output,
struct_in,
struct_out,
struct_cached,
struct_thoughts,
) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
total_input_tokens += struct_in
total_output_tokens += struct_out
total_cached_tokens += struct_cached
total_thoughts_tokens += struct_thoughts

_log_completion(answer, iteration + 1, forced=True)
return ReflectAgentResult(
Expand Down Expand Up @@ -588,6 +609,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
llm_duration = int((time.time() - llm_start) * 1000)
total_input_tokens += usage.input_tokens
total_output_tokens += usage.output_tokens
total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": "final",
Expand All @@ -600,11 +623,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):

structured_output = None
if response_schema and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
(
structured_output,
struct_in,
struct_out,
struct_cached,
struct_thoughts,
) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
total_input_tokens += struct_in
total_output_tokens += struct_out
total_cached_tokens += struct_cached
total_thoughts_tokens += struct_thoughts

_log_completion(answer, iteration + 1, forced=True)
return ReflectAgentResult(
Expand Down Expand Up @@ -661,6 +690,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
consecutive_errors = 0
total_input_tokens += result.input_tokens
total_output_tokens += result.output_tokens
total_cached_tokens += getattr(result, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(result, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": f"agent_{iteration + 1}",
Expand Down Expand Up @@ -709,6 +740,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
llm_duration = int((time.time() - llm_start) * 1000)
total_input_tokens += usage.input_tokens
total_output_tokens += usage.output_tokens
total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": "final",
Expand All @@ -722,11 +755,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
# Generate structured output if schema provided
structured_output = None
if response_schema and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
(
structured_output,
struct_in,
struct_out,
struct_cached,
struct_thoughts,
) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
total_input_tokens += struct_in
total_output_tokens += struct_out
total_cached_tokens += struct_cached
total_thoughts_tokens += struct_thoughts

_log_completion(answer, iteration + 1, forced=True)
return ReflectAgentResult(
Expand Down Expand Up @@ -783,6 +822,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
)
total_input_tokens += rewrite_usage.input_tokens
total_output_tokens += rewrite_usage.output_tokens
total_cached_tokens += getattr(rewrite_usage, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(rewrite_usage, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": "final_rewrite",
Expand All @@ -796,11 +837,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
# Generate structured output if schema provided
structured_output = None
if response_schema and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
(
structured_output,
struct_in,
struct_out,
struct_cached,
struct_thoughts,
) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
total_input_tokens += struct_in
total_output_tokens += struct_out
total_cached_tokens += struct_cached
total_thoughts_tokens += struct_thoughts

_log_completion(answer, iteration + 1)
return ReflectAgentResult(
Expand Down Expand Up @@ -835,6 +882,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
llm_duration = int((time.time() - llm_start) * 1000)
total_input_tokens += usage.input_tokens
total_output_tokens += usage.output_tokens
total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
llm_trace.append(
{
"scope": "final",
Expand All @@ -848,11 +897,17 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
# Generate structured output if schema provided
structured_output = None
if response_schema and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
(
structured_output,
struct_in,
struct_out,
struct_cached,
struct_thoughts,
) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
total_input_tokens += struct_in
total_output_tokens += struct_out
total_cached_tokens += struct_cached
total_thoughts_tokens += struct_thoughts

_log_completion(answer, iteration + 1, forced=True)
return ReflectAgentResult(
Expand Down Expand Up @@ -1147,14 +1202,16 @@ async def _process_done_tool(
structured_output = None
final_usage = usage
if response_schema and llm_config and answer:
structured_output, struct_in, struct_out = await _generate_structured_output(
structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
answer, response_schema, llm_config, reflect_id
)
# Add structured output tokens to usage
final_usage = TokenUsageSummary(
input_tokens=usage.input_tokens + struct_in,
output_tokens=usage.output_tokens + struct_out,
total_tokens=usage.total_tokens + struct_in + struct_out,
cached_tokens=usage.cached_tokens + struct_cached,
thoughts_tokens=usage.thoughts_tokens + struct_thoughts,
)

log_completion(answer, iterations)
Expand Down
17 changes: 14 additions & 3 deletions hindsight-api-slim/hindsight_api/engine/reflect/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,20 @@ class DirectiveInfo(BaseModel):
class TokenUsageSummary(BaseModel):
"""Total token usage across all LLM calls."""

input_tokens: int = Field(default=0, description="Total input tokens used")
output_tokens: int = Field(default=0, description="Total output tokens used")
total_tokens: int = Field(default=0, description="Total tokens (input + output)")
input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)")
output_tokens: int = Field(default=0, description="Total visible output tokens used (excludes reasoning/thoughts)")
total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
cached_tokens: int = Field(
default=0,
description="Cached/cache-read prompt tokens summed across calls. Subset of input_tokens.",
)
thoughts_tokens: int = Field(
default=0,
description=(
"Reasoning/thinking tokens summed across calls. Billed at the output rate by some providers "
"but not part of visible output."
),
)


class ReflectAgentResult(BaseModel):
Expand Down
30 changes: 26 additions & 4 deletions hindsight-api-slim/hindsight_api/engine/response_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,20 @@ class LLMToolCallResult(BaseModel):
content: str | None = Field(default=None, description="Text content if any")
tool_calls: list[LLMToolCall] = Field(default_factory=list, description="Tool calls requested by the LLM")
finish_reason: str | None = Field(default=None, description="Reason the LLM stopped: 'stop', 'tool_calls', etc.")
input_tokens: int = Field(default=0, description="Input tokens used in this call")
output_tokens: int = Field(default=0, description="Output tokens used in this call")
input_tokens: int = Field(
default=0,
description="Input tokens used in this call (includes any cached prefix tokens reported by the provider)",
)
output_tokens: int = Field(
default=0, description="Visible output tokens used in this call (excludes reasoning/thoughts)"
)
cached_tokens: int = Field(
default=0, description="Cached prefix tokens, when reported by the provider. Subset of input_tokens."
)
thoughts_tokens: int = Field(
default=0,
description="Reasoning/thinking tokens. Billed at the output rate by some providers but not part of visible output.",
)


class ToolCallTrace(BaseModel):
Expand Down Expand Up @@ -91,9 +103,18 @@ class TokenUsage(BaseModel):
)

input_tokens: int = Field(default=0, description="Number of input/prompt tokens consumed")
output_tokens: int = Field(default=0, description="Number of output/completion tokens generated")
total_tokens: int = Field(default=0, description="Total tokens (input + output)")
output_tokens: int = Field(
default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)"
)
total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
cached_tokens: int = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider")
thoughts_tokens: int = Field(
default=0,
description=(
"Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers "
"(e.g. Gemini 2.5+ family) but not surfaced in the visible response."
),
)

def __add__(self, other: "TokenUsage") -> "TokenUsage":
"""Allow aggregating token usage from multiple calls."""
Expand All @@ -102,6 +123,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage":
output_tokens=self.output_tokens + other.output_tokens,
total_tokens=self.total_tokens + other.total_tokens,
cached_tokens=self.cached_tokens + other.cached_tokens,
thoughts_tokens=self.thoughts_tokens + other.thoughts_tokens,
)


Expand Down
10 changes: 10 additions & 0 deletions hindsight-api-slim/hindsight_api/extensions/operation_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ class RetainResult:
llm_input_tokens: int | None = None
llm_output_tokens: int | None = None
llm_total_tokens: int | None = None
# Diagnostic token splits surfaced for cost attribution and prompt-cache
# tuning. ``llm_cached_input_tokens`` is the subset of llm_input_tokens
# served from the provider's prompt cache (e.g. Gemini's
# cached_content_token_count). ``llm_thoughts_tokens`` counts reasoning
# tokens that are billed at the output rate by some providers (Gemini 2.5+)
# but are not part of the visible response. Both default to None when the
# engine/provider didn't report them; downstream metering extensions
# should treat None as 0.
llm_cached_input_tokens: int | None = None
llm_thoughts_tokens: int | None = None
# Content tokens the retain pipeline actually processed, after
# chunk-level content-hash deduplication. Semantics:
# None — no dedup signal available (e.g. a first-time retain or a
Expand Down
Loading