diff --git a/crates/crw-renderer/src/cdp.rs b/crates/crw-renderer/src/cdp.rs index 94129dbf..a0e087e2 100644 --- a/crates/crw-renderer/src/cdp.rs +++ b/crates/crw-renderer/src/cdp.rs @@ -450,6 +450,8 @@ pub struct CdpRenderer { /// requiring a specific content selector to mount + networkIdle(0). Keeps the /// mandatory content gate (never snapshots an empty CSR shell). Default off. fast_ready: bool, + /// UA for `Network.setUserAgentOverride`; empty = no override (browser default). + user_agent: String, } impl CdpRenderer { @@ -472,6 +474,7 @@ impl CdpRenderer { challenge_max_retries: CHALLENGE_MAX_RETRIES, spa_selector_max: Duration::from_millis(SPA_SELECTOR_MAX_MS), fast_ready: false, + user_agent: String::new(), } } @@ -483,6 +486,15 @@ impl CdpRenderer { self } + /// Set the User-Agent the CDP renderer presents (via + /// `Network.setUserAgentOverride`). Pass the same `effective_ua` the HTTP + /// fetcher uses so a JS-rendered page sees a modern UA, not the browser's + /// default — and HTTP/CDP UAs match (a mismatch is itself a bot tell). + pub fn with_user_agent(mut self, ua: &str) -> Self { + self.user_agent = ua.to_string(); + self + } + /// Override the post-navigate challenge-clear retry count (default 3). /// Set lower (or 0) to trim the anti-bot tail; anti-bot recovery is then the /// stealth/auto-egress tier's job (the Firecrawl/Spider approach). @@ -2088,6 +2100,23 @@ impl CdpRenderer { ) .await?; + // Present a modern UA on the CDP path too (the HTTP fetcher already does, + // but renderers otherwise send the browser's own — often stale — UA, which + // trips "your browser is outdated" gates). Session-scoped (so pooled + // contexts don't leak it) and best-effort: lightpanda may not implement + // this method, and a failure must NOT abort an otherwise-fine render — + // hence `.ok()`, not `?`. Empty UA = skip (keep the browser default). + if !self.user_agent.is_empty() { + conn.send_recv( + "Network.setUserAgentOverride", + serde_json::json!({ "userAgent": self.user_agent }), + Some(&session_id), + self.page_timeout, + ) + .await + .ok(); + } + // Subscribe to events BEFORE navigating so we don't miss loadEventFired. let events_rx = conn.subscribe(); @@ -2595,7 +2624,7 @@ fn is_spa_text_ready(text_len: i64) -> bool { #[cfg(test)] mod tests { - use super::{build_auth_response, is_content_stable}; + use super::{CdpRenderer, build_auth_response, is_content_stable}; #[test] fn auth_response_provides_credentials_when_creds_set() { @@ -2760,4 +2789,16 @@ mod tests { assert!(!is_capturable_mime("application/javascript")); assert!(!is_capturable_mime("")); } + + #[test] + fn user_agent_default_empty_and_builder_sets_it() { + // Default = empty → fetch_inner skips the override (browser default). + let r = CdpRenderer::new("chrome", "ws://127.0.0.1:9222", 1000, 1); + assert_eq!(r.user_agent, "", "default UA must be empty (no override)"); + // Builder threads the effective UA through so CDP matches the HTTP path. + let ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \ + (KHTML, like Gecko) Chrome/150.0.0.0 Safari/537.36"; + let r = r.with_user_agent(ua); + assert_eq!(r.user_agent, ua); + } } diff --git a/crates/crw-renderer/src/lib.rs b/crates/crw-renderer/src/lib.rs index 6c166d5e..6ee1531a 100644 --- a/crates/crw-renderer/src/lib.rs +++ b/crates/crw-renderer/src/lib.rs @@ -458,12 +458,15 @@ impl FallbackRenderer { if want(RendererMode::Lightpanda) { if let Some(lp) = &config.lightpanda { - js_renderers.push(Arc::new(cdp::CdpRenderer::new( - "lightpanda", - &lp.ws_url, - config.lightpanda_timeout(), - config.pool_size, - ))); + js_renderers.push(Arc::new( + cdp::CdpRenderer::new( + "lightpanda", + &lp.ws_url, + config.lightpanda_timeout(), + config.pool_size, + ) + .with_user_agent(&effective_ua), + )); } else if matches!(config.mode, RendererMode::Lightpanda) { return Err(CrwError::ConfigError( "renderer.mode = \"lightpanda\" but [renderer.lightpanda] ws_url is not \ @@ -476,12 +479,15 @@ impl FallbackRenderer { if let Some(pw) = &config.playwright { // Playwright is treated as a "chrome-equivalent" tier — // same timeout budget, same kind of work. - js_renderers.push(Arc::new(cdp::CdpRenderer::new( - "playwright", - &pw.ws_url, - config.chrome_timeout(), - config.pool_size, - ))); + js_renderers.push(Arc::new( + cdp::CdpRenderer::new( + "playwright", + &pw.ws_url, + config.chrome_timeout(), + config.pool_size, + ) + .with_user_agent(&effective_ua), + )); } else if matches!(config.mode, RendererMode::Playwright) { return Err(CrwError::ConfigError( "renderer.mode = \"playwright\" but [renderer.playwright] ws_url is not \ @@ -500,6 +506,7 @@ impl FallbackRenderer { config.chrome_timeout(), config.pool_size, ) + .with_user_agent(&effective_ua) .with_nav_budget(config.chrome_nav_budget_ms) .with_challenge_retries( config @@ -589,6 +596,7 @@ impl FallbackRenderer { config.chrome_proxy_timeout(), config.pool_size, ) + .with_user_agent(&effective_ua) .with_nav_budget(config.chrome_nav_budget_ms) .with_challenge_retries( config