Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion crates/crw-renderer/src/cdp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,8 @@ pub struct CdpRenderer {
/// requiring a specific content selector to mount + networkIdle(0). Keeps the
/// mandatory content gate (never snapshots an empty CSR shell). Default off.
fast_ready: bool,
/// UA for `Network.setUserAgentOverride`; empty = no override (browser default).
user_agent: String,
}

impl CdpRenderer {
Expand All @@ -472,6 +474,7 @@ impl CdpRenderer {
challenge_max_retries: CHALLENGE_MAX_RETRIES,
spa_selector_max: Duration::from_millis(SPA_SELECTOR_MAX_MS),
fast_ready: false,
user_agent: String::new(),
}
}

Expand All @@ -483,6 +486,15 @@ impl CdpRenderer {
self
}

/// Builder: choose the User-Agent string this renderer presents, applied
/// through `Network.setUserAgentOverride`.
///
/// Callers should hand in the same `effective_ua` the HTTP fetcher uses, so
/// that JS-rendered pages see a modern UA instead of the browser's default
/// and the HTTP and CDP paths agree (a mismatch is itself a bot tell).
/// An empty string means "no override": the browser default is kept.
pub fn with_user_agent(mut self, ua: &str) -> Self {
    // Overwrite the stored UA in place with a copy of the caller's string.
    ua.clone_into(&mut self.user_agent);
    self
}

/// Override the post-navigate challenge-clear retry count (default 3).
/// Set lower (or 0) to trim the anti-bot tail; anti-bot recovery is then the
/// stealth/auto-egress tier's job (the Firecrawl/Spider approach).
Expand Down Expand Up @@ -2088,6 +2100,23 @@ impl CdpRenderer {
)
.await?;

// Present a modern UA on the CDP path too (the HTTP fetcher already does,
// but renderers otherwise send the browser's own — often stale — UA, which
// trips "your browser is outdated" gates). Session-scoped (so pooled
// contexts don't leak it) and best-effort: lightpanda may not implement
// this method, and a failure must NOT abort an otherwise-fine render —
// hence `.ok()`, not `?`. Empty UA = skip (keep the browser default).
if !self.user_agent.is_empty() {
conn.send_recv(
"Network.setUserAgentOverride",
serde_json::json!({ "userAgent": self.user_agent }),
Some(&session_id),
self.page_timeout,
)
.await
.ok();
}

// Subscribe to events BEFORE navigating so we don't miss loadEventFired.
let events_rx = conn.subscribe();

Expand Down Expand Up @@ -2595,7 +2624,7 @@ fn is_spa_text_ready(text_len: i64) -> bool {

#[cfg(test)]
mod tests {
use super::{build_auth_response, is_content_stable};
use super::{CdpRenderer, build_auth_response, is_content_stable};

#[test]
fn auth_response_provides_credentials_when_creds_set() {
Expand Down Expand Up @@ -2760,4 +2789,16 @@ mod tests {
assert!(!is_capturable_mime("application/javascript"));
assert!(!is_capturable_mime(""));
}

#[test]
fn user_agent_default_empty_and_builder_sets_it() {
    // The effective UA the builder is expected to thread through, so the CDP
    // path matches the HTTP path.
    const MODERN_UA: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
        (KHTML, like Gecko) Chrome/150.0.0.0 Safari/537.36";

    // A freshly constructed renderer carries an empty UA, which means the
    // override is skipped and the browser default is kept.
    let fresh = CdpRenderer::new("chrome", "ws://127.0.0.1:9222", 1000, 1);
    assert_eq!(fresh.user_agent, "", "default UA must be empty (no override)");

    // The builder stores exactly the UA it was handed.
    let configured = fresh.with_user_agent(MODERN_UA);
    assert_eq!(configured.user_agent, MODERN_UA);
}
}
32 changes: 20 additions & 12 deletions crates/crw-renderer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,12 +458,15 @@ impl FallbackRenderer {

if want(RendererMode::Lightpanda) {
if let Some(lp) = &config.lightpanda {
js_renderers.push(Arc::new(cdp::CdpRenderer::new(
"lightpanda",
&lp.ws_url,
config.lightpanda_timeout(),
config.pool_size,
)));
js_renderers.push(Arc::new(
cdp::CdpRenderer::new(
"lightpanda",
&lp.ws_url,
config.lightpanda_timeout(),
config.pool_size,
)
.with_user_agent(&effective_ua),
));
} else if matches!(config.mode, RendererMode::Lightpanda) {
return Err(CrwError::ConfigError(
"renderer.mode = \"lightpanda\" but [renderer.lightpanda] ws_url is not \
Expand All @@ -476,12 +479,15 @@ impl FallbackRenderer {
if let Some(pw) = &config.playwright {
// Playwright is treated as a "chrome-equivalent" tier —
// same timeout budget, same kind of work.
js_renderers.push(Arc::new(cdp::CdpRenderer::new(
"playwright",
&pw.ws_url,
config.chrome_timeout(),
config.pool_size,
)));
js_renderers.push(Arc::new(
cdp::CdpRenderer::new(
"playwright",
&pw.ws_url,
config.chrome_timeout(),
config.pool_size,
)
.with_user_agent(&effective_ua),
));
} else if matches!(config.mode, RendererMode::Playwright) {
return Err(CrwError::ConfigError(
"renderer.mode = \"playwright\" but [renderer.playwright] ws_url is not \
Expand All @@ -500,6 +506,7 @@ impl FallbackRenderer {
config.chrome_timeout(),
config.pool_size,
)
.with_user_agent(&effective_ua)
.with_nav_budget(config.chrome_nav_budget_ms)
.with_challenge_retries(
config
Expand Down Expand Up @@ -589,6 +596,7 @@ impl FallbackRenderer {
config.chrome_proxy_timeout(),
config.pool_size,
)
.with_user_agent(&effective_ua)
.with_nav_budget(config.chrome_nav_budget_ms)
.with_challenge_retries(
config
Expand Down
Loading