# Krawl Honeypot Configuration
# Deployment mode: "standalone" (SQLite + in-memory cache) or "scalable" (PostgreSQL + Redis)
mode: standalone

# PostgreSQL settings (only used in scalable mode)
postgres:
  host: "localhost"
  port: 5432
  user: "krawl"
  password: "krawl"
  database: "krawl"

# Redis settings (only used in scalable mode)
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: null

# Cache TTL settings (seconds) — tune for your traffic volume
cache_ttl: 600  # Dashboard warmup data (default: 10 minutes)
hot_ttl: 30     # Hot-path data like ban info, IP categories (default: 30 seconds)
table_ttl: 120  # Paginated dashboard tables (default: 2 minutes)
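# Lower TTLs keep dashboard data fresher at the cost of more database
# queries; higher TTLs reduce load but delay updates. A hypothetical
# high-traffic example (illustrative values only):
# cache_ttl: 1800   # recompute warmup data less often
# table_ttl: 300    # cache paginated tables longer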

server:
  port: 5000
  delay: 100  # Response delay in milliseconds
  # Manually set the Server header; if null, a random one is used
  # (e.g. server_header: "Apache/2.4.41 (Ubuntu)").
  server_header: null

links:
  min_length: 5
  max_length: 15
  min_per_page: 5
  max_per_page: 10
  char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  max_counter: 10

canary:
  token_url: null  # Optional canary token URL
  token_tries: 10

dashboard:
  # If null, a random secret path is auto-generated.
  # Can also be set explicitly, e.g. "/dashboard" <-- note it MUST include a leading forward slash.
  # secret_path: "/super-secret-dashboard-path"
  secret_path: null
  # Password for accessing protected dashboard panels.
  # If null, a random password will be generated and printed in the logs.
  # Can also be set via the KRAWL_DASHBOARD_PASSWORD env var.
  password: null
  # Enable the background cache warmup task for the dashboard.
  # When enabled, dashboard data is pre-computed every 5 minutes for instant page loads.
  # When disabled, all dashboard queries hit the database directly on each request.
  # In scalable mode with Redis, consider disabling this — paginated table caching
  # already reduces DB load without needing a background warmup task.
  cache_warmup: true
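# Example: supplying the dashboard password via the environment instead of
# this file (shell command shown as a comment; the value is a placeholder):
#   export KRAWL_DASHBOARD_PASSWORD="change-me"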

backups:
  path: "backups"
  cron: "*/30 * * * *"
  enabled: false
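# The backup schedule uses standard five-field cron syntax ("*/30 * * * *"
# runs every 30 minutes). Other common schedules:
# cron: "0 * * * *"   # hourly, on the hour
# cron: "0 3 * * *"   # daily at 03:00
# cron: "0 3 * * 0"   # weekly, Sundays at 03:00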

logging:
  level: "DEBUG"  # DEBUG, INFO, WARNING, ERROR, CRITICAL

database:
  path: "data/krawl.db"
  retention_days: 30
  # Only persist suspicious requests to the access log.
  # When enabled, non-suspicious requests still increment IP stats counters
  # but won't be stored as individual access log entries, saving disk I/O.
  persist_suspicious_only: false

behavior:
  probability_error_codes: 0  # percentage (0-100)

analyzer:
  http_risky_methods_threshold: 0.1
  violated_robots_threshold: 0.1
  uneven_request_timing_threshold: 0.5
  uneven_request_timing_time_window_seconds: 300
  user_agents_used_threshold: 2
  attack_urls_threshold: 1

crawl:
  infinite_pages_for_malicious: true
  max_pages_limit: 250
  ban_duration_seconds: 600

tarpit:
  enabled: false   # Opt-in: trap AI agents with slow responses and random text
  delay_seconds: 5 # Extra delay (seconds) added to each response when tarpit is active

ai:
  enabled: false
  provider: "openrouter"  # "openrouter" or "openai"
  api_key: ""
  model: ""  # for example nvidia/nemotron-3-super-120b-a12b:free or gpt-5.1-mini
  timeout: 60  # Request timeout in seconds for API calls
  max_daily_requests: 100  # Limit the number of AI-generated responses per day
  reasoning:
    enabled: true
    effort: "medium"  # "none", "minimal", "low", "medium", "high", "xhigh" (doc: https://openrouter.ai/docs/guides/best-practices/reasoning-tokens)
  prompt: |
    Your goal is to create a plausible but intentionally vulnerable fake page that might appear on a real server, to distract attackers.
    Your input is the query path the attacker asked for.
    Follow these rules:
    1. You must output ONLY the HTML, nothing else
    2. Include realistic content if necessary (links, text, forms, etc.)
    3. Do not add markdown, code blocks, or explanations
    4. Do not reference external files; generate everything needed in one single HTML file
    5. Include proper HTML structure with head and body tags
    6. If the request is a common attack vector (e.g., SQLi, XSS), include fake data in the response
    7. If the request has a file extension, generate raw content relevant to that type (e.g. a fake JSON document for .json requests)
    Path: {path}{query_part}
    Generate the complete HTML page.
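
# Example: enabling AI-generated responses via OpenRouter (the API key is a
# placeholder; the model name is one of the examples mentioned above):
# ai:
#   enabled: true
#   provider: "openrouter"
#   api_key: "<your-openrouter-api-key>"
#   model: "nvidia/nemotron-3-super-120b-a12b:free"
#   max_daily_requests: 50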