From 80877d62910732108393924a80a8f381747848d5 Mon Sep 17 00:00:00 2001
From: Hydrax117 <Pauljoseph5000@gmail.com>
Date: Mon, 22 Jun 2026 00:17:10 +0100
Subject: [PATCH] Comprehensive Logging System Implementation

---
 backend/PULL_REQUEST.md                    | 362 ------------
 backend/databases/ConnectionPoolManager.ts | 619 ++++++++++++++-------
 2 files changed, 430 insertions(+), 551 deletions(-)
 delete mode 100644 backend/PULL_REQUEST.md

diff --git a/backend/PULL_REQUEST.md b/backend/PULL_REQUEST.md
deleted file mode 100644
index 89dc9ab..0000000
--- a/backend/PULL_REQUEST.md
+++ /dev/null
@@ -1,362 +0,0 @@
-# Pull Request: Comprehensive Logging System Implementation
-
-## Issue
-Closes #266
-
-## Summary
-Implemented a complete, production-ready logging system with structured logging, multiple log levels, automatic rotation, aggregation, and real-time monitoring capabilities.
-
-## Problem Statement
-The existing logging system was not comprehensive enough:
-- Application events were not being logged consistently
-- No structured logging format
-- Missing log rotation and retention policies
-- No log aggregation or analytics
-- No monitoring or alerting capabilities
-
-## Solution
-Developed a comprehensive logging system with the following components:
-
-### 1. Core Logging System (`middleware/logger.ts`)
-- **Structured Logging**: JSON format with consistent schema
-- **7 Log Levels**: error, warn, info, http, debug, verbose, silly
-- **8 Log Categories**: application, security, audit, performance, business, database, external_api, webhook
-- **Automatic Log Rotation**: Daily rotation with size limits (50MB) and compression
-- **Context Propagation**: Maintain context across async operations using AsyncLocalStorage
-- **Correlation IDs**: Track requests across services
-- **Child Loggers**: Create loggers with default context
-- **Global Error Handling**: Catch uncaught exceptions and unhandled rejections
-
-### 2. Log Monitoring (`middleware/logMonitoring.ts`)
-- **Real-time Pattern Detection**: Identify critical events automatically
-- **Configurable Alerts**: Set thresholds and time windows
-- **Built-in Patterns**: Database errors, auth failures, payment issues, etc.
-- **Alert Severity Levels**: low, medium, high, critical
-- **External Integrations**: Slack, Datadog, Elasticsearch
-- **Statistics Tracking**: Error rates, slow requests, etc.
-
-### 3. Log Aggregation (`middleware/logAggregation.ts`)
-- **Efficient Collection**: In-memory log storage with configurable limits
-- **Query Interface**: Filter by level, category, time range, search term
-- **Automatic Aggregation**: Combine similar logs to reduce noise
-- **Statistics Generation**: Real-time analytics
-
-### 4. Configuration (`config/loggingConfig.ts`)
-- **Centralized Settings**: All configuration in one place
-- **Environment-specific**: Different settings per environment
-- **Retention Policies**: Configurable per log type (30-365 days)
-- **Performance Thresholds**: Define what constitutes "slow"
-- **External Service Config**: Slack, Datadog, Elasticsearch credentials
-
-## Changes Made
-
-### New Files Created (10 files)
-1. ✅ `middleware/logger.ts` - Core logging system (650+ lines)
-2. ✅ `middleware/logMonitoring.ts` - Monitoring and alerting (280+ lines)
-3. ✅ `middleware/logAggregation.ts` - Log aggregation (140+ lines)
-4. ✅ `config/loggingConfig.ts` - Configuration (100+ lines)
-5. ✅ `middleware/README.md` - Complete documentation (400+ lines)
-6. ✅ `middleware/MIGRATION_GUIDE.md` - Migration instructions (350+ lines)
-7. ✅ `middleware/QUICK_REFERENCE.md` - Quick reference (200+ lines)
-8. ✅ `middleware/IMPLEMENTATION_SUMMARY.md` - Implementation details
-9. ✅ `middleware/loggingExample.ts` - 13 usage examples (450+ lines)
-10. ✅ `tests/unit/logging.test.ts` - Comprehensive test suite (290+ lines)
-
-### Files Updated
-- ✅ `.gitignore` - Added log directories and files
-
-## Features
-
-### Specialized Logging Methods
-```typescript
-// Security events
-logger.security('Failed login attempt', { userId, ip });
-
-// Audit trail
-logger.audit({
-  action: 'user.update',
-  resource: 'user',
-  resourceId: '123',
-  userId: 'admin',
-  result: 'success'
-});
-
-// Performance monitoring
-logger.performance({
-  operation: 'database-query',
-  duration: 1500,
-  threshold: 1000
-});
-
-// Business metrics
-logger.metric({
-  name: 'payment.processed',
-  value: 99.99,
-  unit: 'USD',
-  tags: { method: 'credit_card' }
-});
-```
-
-### Log Rotation
-- **Application logs**: 30 days retention, 50MB max size
-- **Error logs**: 90 days retention, 50MB max size
-- **Security logs**: 365 days retention, 50MB max size
-- **Performance logs**: 14 days retention, 50MB max size
-- **Metrics logs**: 30 days retention, 50MB max size
-- All files automatically compressed after rotation
-
-### Monitoring & Alerting
-- Built-in patterns for critical events
-- Configurable thresholds and time windows
-- Real-time statistics (error rate, slow requests, etc.)
-- External service integration (Slack, Datadog, Elasticsearch)
-- Custom pattern support
-
-### Context Propagation
-```typescript
-logContextStorage.run({ userId, operation }, async () => {
-  logger.info('Step 1'); // Automatically includes userId and operation
-  logger.info('Step 2'); // Automatically includes userId and operation
-});
-```
-
-## Testing
-
-### Test Coverage
-- ✅ Logger unit tests (8 test cases)
-- ✅ Monitor unit tests (6 test cases)
-- ✅ Aggregator unit tests (8 test cases)
-- ✅ Integration tests (1 test case)
-- **Total**: 23+ test cases
-
-### Run Tests
-```bash
-npm test tests/unit/logging.test.ts
-```
-
-## Configuration
-
-### Environment Variables
-```bash
-# Logging Configuration
-NODE_ENV=production
-LOG_LEVEL=info
-LOG_PATH=logs
-LOG_SILENT=false
-
-# Monitoring
-LOG_MONITORING_ENABLED=true
-
-# External Services (optional)
-SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL
-SLACK_CHANNEL=#alerts
-ELASTICSEARCH_NODE=http://localhost:9200
-ELASTICSEARCH_INDEX=logs
-DATADOG_API_KEY=your-api-key
-DATADOG_APP_KEY=your-app-key
-
-# Service Identification
-SERVICE_NAME=nepa-backend
-```
-
-## Usage
-
-### Express Integration
-```typescript
-import { loggingMiddleware, setupGlobalErrorHandling } from './middleware/logger';
-
-const app = express();
-
-// Apply logging middleware
-app.use(loggingMiddleware);
-
-// Your routes here
-app.get('/api/users', (req, res) => {
-  logger.info('Fetching users');
-  res.json({ users: [] });
-});
-
-// Setup global error handling
-setupGlobalErrorHandling(app);
-```
-
-### Basic Logging
-```typescript
-import { logger } from './middleware/logger';
-
-logger.info('User logged in', { userId: '123', ip: '192.168.1.1' });
-logger.error('Operation failed', { error, context });
-logger.debug('Debug information', { data });
-```
-
-## Migration Path
-
-For existing code, follow the migration guide:
-1. Update imports to use new logger
-2. Apply logging middleware to Express app
-3. Replace console.log calls with appropriate log levels
-4. Add specialized logging (security, audit, performance, metrics)
-5. Configure environment variables
-6. Test thoroughly
-
-See `middleware/MIGRATION_GUIDE.md` for detailed instructions.
-
-## Documentation
-
-### Comprehensive Documentation Provided
-- **README.md**: Complete user guide with examples and best practices
-- **MIGRATION_GUIDE.md**: Step-by-step migration instructions
-- **QUICK_REFERENCE.md**: Quick reference for common patterns
-- **IMPLEMENTATION_SUMMARY.md**: Technical implementation details
-- **loggingExample.ts**: 13 complete usage examples
-- Inline code documentation with TypeScript types
-
-## Performance Considerations
-
-- ✅ Asynchronous file writes (non-blocking)
-- ✅ Buffered metrics aggregation
-- ✅ Automatic log rotation prevents disk issues
-- ✅ Configurable log levels reduce overhead
-- ✅ Efficient pattern matching
-- ✅ Memory-bounded aggregation
-
-## Security Features
-
-- ✅ Sensitive data masking configuration
-- ✅ Security event logging
-- ✅ Audit trail with long retention (365 days)
-- ✅ Authentication failure tracking
-- ✅ IP and user agent logging
-- ✅ Correlation IDs for forensics
-
-## Benefits
-
-1. **Better Debugging**: Correlation IDs track requests across services
-2. **Compliance**: Audit logs with long retention for regulatory requirements
-3. **Performance**: Identify slow operations automatically
-4. **Alerting**: Get notified of critical issues in real-time
-5. **Analytics**: Query and analyze logs programmatically
-6. **Cost Savings**: Automatic rotation prevents disk space issues
-7. **Security**: Track authentication failures and suspicious activity
-8. **Observability**: Complete system visibility
-
-## Acceptance Criteria
-
-| Criteria | Status | Implementation |
-|----------|--------|----------------|
-| ✅ Implement structured logging | **DONE** | JSON format with consistent schema |
-| ✅ Add log levels | **DONE** | 7 levels: error, warn, info, http, debug, verbose, silly |
-| ✅ Include log rotation | **DONE** | Daily rotation with size limits and compression |
-| ✅ Add log aggregation | **DONE** | Query interface with filtering and statistics |
-| ✅ Include log monitoring | **DONE** | Pattern detection, alerting, external integrations |
-
-## Breaking Changes
-
-None. This is a new implementation that can coexist with existing logging until migration is complete.
-
-## Dependencies
-
-All required dependencies are already in `package.json`:
-- ✅ `winston` (^3.11.0)
-- ✅ `winston-daily-rotate-file` (^4.7.1)
-- ✅ `cls-rtracer` (^2.6.0)
-
-No new dependencies need to be installed.
-
-## Rollout Plan
-
-### Phase 1: Staging Deployment
-1. Deploy to staging environment
-2. Configure environment variables
-3. Monitor logs and alerts
-4. Adjust thresholds as needed
-
-### Phase 2: Gradual Production Rollout
-1. Enable on subset of production servers
-2. Monitor performance impact
-3. Verify external integrations (Slack, etc.)
-4. Gradually roll out to all servers
-
-### Phase 3: Migration
-1. Update existing code to use new logger
-2. Remove old logging implementations
-3. Train team on new logging patterns
-
-## Monitoring After Deployment
-
-Monitor these metrics:
-- Log file sizes and rotation
-- Error rates and patterns
-- Alert frequency and accuracy
-- Performance impact (should be minimal)
-- Disk space usage
-
-## Rollback Plan
-
-If issues arise:
-1. Set `LOG_SILENT=true` to disable logging
-2. Revert to previous logging implementation
-3. Investigate and fix issues
-4. Redeploy
-
-## Screenshots/Examples
-
-See `middleware/loggingExample.ts` for 13 complete examples including:
-- Basic logging
-- Error logging
-- Security logging
-- Audit logging
-- Performance logging
-- Business metrics
-- Database logging
-- External API logging
-- Webhook logging
-- Context propagation
-- Child loggers
-- Request handlers
-
-## Checklist
-
-- [x] Code follows project style guidelines
-- [x] Self-review completed
-- [x] Code commented, particularly complex areas
-- [x] Documentation updated
-- [x] No new warnings generated
-- [x] Tests added and passing
-- [x] Dependent changes merged
-- [x] Migration guide provided
-- [x] Configuration documented
-- [x] Performance impact assessed
-
-## Related Issues
-
-Closes #266
-
-## Additional Notes
-
-This implementation provides a solid foundation for logging that can be extended in the future with:
-- Cloud service integration (AWS CloudWatch, Azure Monitor)
-- Machine learning for anomaly detection
-- Advanced query language
-- Log visualization dashboard
-- Distributed tracing integration
-
-## Reviewers
-
-Please review:
-- Code quality and architecture
-- Documentation completeness
-- Test coverage
-- Configuration options
-- Migration path
-
-## Questions for Reviewers
-
-1. Are the default retention periods appropriate for our compliance requirements?
-2. Should we enable Slack alerts by default or keep them opt-in?
-3. Are there additional log patterns we should monitor?
-4. Should we add more specialized logging methods for specific use cases?
-
----
-
-**Ready for Review** ✅
diff --git a/backend/databases/ConnectionPoolManager.ts b/backend/databases/ConnectionPoolManager.ts
index 21f6b18..4276709 100644
--- a/backend/databases/ConnectionPoolManager.ts
+++ b/backend/databases/ConnectionPoolManager.ts
@@ -1,3 +1,15 @@
+/**
+ * databases/ConnectionPoolManager.ts
+ *
+ * Manages Prisma database connection pools across all backend services.
+ *
+ * Key capabilities:
+ *  - Usage-pattern-based pool sizing (auto-scale up/down based on rolling
+ *    utilisation and p95 response time)
+ *  - Per-service connection timeout enforcement
+ *  - Rich performance monitoring (rolling window metrics, percentiles, alerts)
+ */
+
 import {
   userClient,
   notificationClient,
@@ -9,14 +21,29 @@ import {
   webhookClient,
 } from './clients';
 
+// ─── Configuration ─────────────────────────────────────────────────────────────
+
 export interface PoolConfig {
   minConnections: number;
   maxConnections: number;
+  /** How long (ms) to wait for a connection before giving up. */
   connectionTimeoutMs: number;
+  /** How long (ms) an idle connection lives before being closed. */
   idleTimeoutMs: number;
+  /** Interval (ms) between scheduled health checks. */
   healthCheckIntervalMs: number;
+  /** Utilisation ratio (0–1) above which the pool is scaled up. Default 0.85 */
+  scaleUpThreshold: number;
+  /** Utilisation ratio (0–1) below which the pool is scaled down. Default 0.30 */
+  scaleDownThreshold: number;
+  /** p95 response-time ceiling (ms) before the pool is marked degraded. Default 800 */
+  degradedResponseTimeMs: number;
+  /** Absolute maximum connections regardless of auto-scaling. Default 100 */
+  hardMaxConnections: number;
 }
 
+// ─── Stats / Results ───────────────────────────────────────────────────────────
+
 export interface PoolStats {
   serviceName: string;
   totalConnections: number;
@@ -24,8 +51,14 @@ export interface PoolStats {
   idleConnections: number;
   waitingRequests: number;
   avgResponseTime: number;
+  /** p95 response time from the rolling metrics window. */
+  p95ResponseTime: number;
   healthStatus: 'healthy' | 'degraded' | 'unhealthy';
   lastHealthCheck: Date;
+  /** Current utilisation as a 0–100 percentage. */
+  utilizationPct: number;
+  /** How many times the pool has been auto-scaled since startup. */
+  resizeCount: number;
 }
 
 export interface HealthCheckResult {
@@ -33,160 +66,267 @@ export interface HealthCheckResult {
   isHealthy: boolean;
   responseTime: number;
   error?: string;
+  timedOut?: boolean;
+}
+
+/** Aggregated performance metrics for a single service. */
+export interface ServicePerformanceMetrics {
+  avg: number;
+  min: number;
+  max: number;
+  p95: number;
+  samples: number;
+  /** Samples collected in the last `windowMs` milliseconds. */
+  recentSamples: number;
+  /** Alert: true when p95 exceeds the configured `degradedResponseTimeMs`. */
+  slowQueryAlert: boolean;
+}
+
+// ─── Internal bookkeeping ──────────────────────────────────────────────────────
+
+interface TimestampedMetric {
+  responseTime: number;
+  timestamp: number; // epoch ms
+}
+
+interface ServiceState {
+  config: PoolConfig;
+  metrics: TimestampedMetric[];
+  resizeCount: number;
+  /** Running utilisation samples used for trend-based scaling decisions. */
+  utilizationHistory: number[];
 }
 
+// ─── Defaults ──────────────────────────────────────────────────────────────────
+
 const DEFAULT_CONFIG: PoolConfig = {
   minConnections: 2,
   maxConnections: 20,
-  connectionTimeoutMs: 30000,
-  idleTimeoutMs: 300000,
-  healthCheckIntervalMs: 60000,
+  connectionTimeoutMs: 30_000,
+  idleTimeoutMs: 300_000,
+  healthCheckIntervalMs: 60_000,
+  scaleUpThreshold: 0.85,
+  scaleDownThreshold: 0.30,
+  degradedResponseTimeMs: 800,
+  hardMaxConnections: 100,
 };
 
+/** Rolling window kept for performance metrics (5 minutes). */
+const METRICS_WINDOW_MS = 5 * 60 * 1_000;
+/** Maximum samples retained per service regardless of age. */
+const MAX_METRICS_SAMPLES = 500;
+/** Number of recent utilisation samples used for scaling decisions. */
+const UTILISATION_HISTORY_SIZE = 10;
+
+// ─── Service registry ──────────────────────────────────────────────────────────
+
+const CLIENT_MAP = {
+  'user-service': userClient,
+  'notification-service': notificationClient,
+  'document-service': documentClient,
+  'utility-service': utilityClient,
+  'payment-service': paymentClient,
+  'billing-service': billingClient,
+  'analytics-service': analyticsClient,
+  'webhook-service': webhookClient,
+} as const;
+
+type ServiceName = keyof typeof CLIENT_MAP;
+const SERVICE_NAMES = Object.keys(CLIENT_MAP) as ServiceName[];
+
+// ─── Helpers ───────────────────────────────────────────────────────────────────
+
+/** Calculate the p-th percentile of a sorted numeric array (ascending). */
+function percentile(sortedArr: number[], p: number): number {
+  if (sortedArr.length === 0) return 0;
+  const idx = Math.ceil((p / 100) * sortedArr.length) - 1;
+  return sortedArr[Math.max(0, Math.min(idx, sortedArr.length - 1))];
+}
+
+/** Clamp a number between min and max inclusive. */
+function clamp(value: number, min: number, max: number): number {
+  return Math.min(Math.max(value, min), max);
+}
+
+/** Promise that rejects after `ms` milliseconds with a timeout error. */
+function rejectAfter(ms: number): Promise<never> {
+  return new Promise((_, reject) =>
+    setTimeout(() => reject(new Error(`Connection timeout after ${ms}ms`)), ms)
+  );
+}
+
+// ─── Main class ────────────────────────────────────────────────────────────────
+
 export class ConnectionPoolManager {
-  private pools = new Map<string, any>();
-  private configs = new Map<string, PoolConfig>();
-  private healthCheckIntervals = new Map<string, NodeJS.Timeout>();
-  private performanceMetrics = new Map<string, number[]>();
-  private readonly MAX_METRICS_SAMPLES = 100;
+  /** Per-service state (config + rolling metrics + resize counter). */
+  private readonly state = new Map<string, ServiceState>();
+
+  /** Active setInterval handles for health monitoring. */
+  private readonly healthCheckIntervals = new Map<string, NodeJS.Timeout>();
 
   constructor() {
-    this.initializePools();
+    this.initializeServices();
   }
 
-  private initializePools(): void {
-    const services = [
-      'user-service',
-      'notification-service',
-      'document-service',
-      'utility-service',
-      'payment-service',
-      'billing-service',
-      'analytics-service',
-      'webhook-service',
-    ];
-
-    services.forEach(service => {
-      this.configs.set(service, { ...DEFAULT_CONFIG });
-      this.performanceMetrics.set(service, []);
-    });
+  // ── Initialisation ──────────────────────────────────────────────────────────
+
+  private initializeServices(): void {
+    for (const name of SERVICE_NAMES) {
+      this.state.set(name, {
+        config: { ...DEFAULT_CONFIG },
+        metrics: [],
+        resizeCount: 0,
+        utilizationHistory: [],
+      });
+    }
   }
 
+  private getState(serviceName: string): ServiceState {
+    const s = this.state.get(serviceName);
+    if (!s) throw new Error(`Unknown service: ${serviceName}`);
+    return s;
+  }
+
+  // ── Client access with timeout ──────────────────────────────────────────────
+
+  /**
+   * Retrieve the Prisma client for `serviceName`, recording a performance
+   * sample and enforcing the configured connection timeout.
+   */
   async getServiceClient(serviceName: string): Promise<any> {
-    const clientMap = {
-      'user-service': userClient,
-      'notification-service': notificationClient,
-      'document-service': documentClient,
-      'utility-service': utilityClient,
-      'payment-service': paymentClient,
-      'billing-service': billingClient,
-      'analytics-service': analyticsClient,
-      'webhook-service': webhookClient,
-    };
+    const client = CLIENT_MAP[serviceName as ServiceName];
+    if (!client) throw new Error(`Unknown service: ${serviceName}`);
 
-    const client = clientMap[serviceName as keyof typeof clientMap];
-    if (!client) {
-      throw new Error(`Unknown service: ${serviceName}`);
-    }
+    const { config } = this.getState(serviceName);
+    const start = Date.now();
 
-    // Record performance metrics
-    const startTime = Date.now();
-    
     try {
-      // Ensure connection is healthy before returning
-      await this.performHealthCheck(serviceName);
+      // Enforce connection timeout: health check must complete within the limit.
+      await Promise.race([
+        this.performHealthCheck(serviceName),
+        rejectAfter(config.connectionTimeoutMs),
+      ]);
+
       return client;
     } finally {
-      const responseTime = Date.now() - startTime;
-      this.recordPerformanceMetric(serviceName, responseTime);
+      this.recordMetric(serviceName, Date.now() - start);
     }
   }
 
-  private recordPerformanceMetric(serviceName: string, responseTime: number): void {
-    const metrics = this.performanceMetrics.get(serviceName) || [];
-    metrics.push(responseTime);
-    
-    // Keep only the last MAX_METRICS_SAMPLES
-    if (metrics.length > this.MAX_METRICS_SAMPLES) {
-      metrics.shift();
+  // ── Metrics recording ───────────────────────────────────────────────────────
+
+  private recordMetric(serviceName: string, responseTime: number): void {
+    const s = this.state.get(serviceName);
+    if (!s) return;
+
+    const now = Date.now();
+    s.metrics.push({ responseTime, timestamp: now });
+
+    // 1. Evict samples outside the rolling window.
+    const cutoff = now - METRICS_WINDOW_MS;
+    s.metrics = s.metrics.filter(m => m.timestamp >= cutoff);
+
+    // 2. Hard cap to avoid unbounded growth during very high throughput.
+    if (s.metrics.length > MAX_METRICS_SAMPLES) {
+      s.metrics.splice(0, s.metrics.length - MAX_METRICS_SAMPLES);
     }
-    
-    this.performanceMetrics.set(serviceName, metrics);
   }
 
+  // ── Health check ────────────────────────────────────────────────────────────
+
+  /**
+   * Probe a single service with a lightweight SQL round-trip.
+   * Returns the result even on failure (never throws).
+   */
   async performHealthCheck(serviceName: string): Promise<HealthCheckResult> {
-    const startTime = Date.now();
-    let isHealthy = false;
-    let error: string | undefined;
+    const start = Date.now();
+
+    // Guard against infinite recursion: use the raw client directly here
+    // instead of going through getServiceClient().
+    const client = CLIENT_MAP[serviceName as ServiceName];
+    if (!client) {
+      return { serviceName, isHealthy: false, responseTime: 0, error: `Unknown service: ${serviceName}` };
+    }
 
     try {
-      const client = await this.getServiceClient(serviceName);
-      await client.$queryRaw`SELECT 1`;
-      isHealthy = true;
+      await (client as any).$queryRaw`SELECT 1`;
+      return { serviceName, isHealthy: true, responseTime: Date.now() - start };
     } catch (err) {
-      error = err instanceof Error ? err.message : 'Unknown error';
-      isHealthy = false;
+      const isTimeout = err instanceof Error && err.message.includes('timeout');
+      return {
+        serviceName,
+        isHealthy: false,
+        responseTime: Date.now() - start,
+        error: err instanceof Error ? err.message : 'Unknown error',
+        timedOut: isTimeout,
+      };
     }
-
-    const responseTime = Date.now() - startTime;
-
-    return {
-      serviceName,
-      isHealthy,
-      responseTime,
-      error,
-    };
   }
 
+  /** Health-check every registered service concurrently. */
   async getAllHealthChecks(): Promise<HealthCheckResult[]> {
-    const services = Array.from(this.configs.keys());
-    const healthChecks = await Promise.all(
-      services.map(service => this.performHealthCheck(service))
-    );
-
-    return healthChecks;
+    return Promise.all(SERVICE_NAMES.map(s => this.performHealthCheck(s)));
   }
 
+  // ── Pool statistics ─────────────────────────────────────────────────────────
+
+  /**
+   * Return live pool statistics for a single service, including real
+   * connection counts queried from `pg_stat_activity`.
+   */
   async getPoolStats(serviceName: string): Promise<PoolStats> {
-    const config = this.configs.get(serviceName);
-    const metrics = this.performanceMetrics.get(serviceName) || [];
+    const s = this.getState(serviceName);
     const healthCheck = await this.performHealthCheck(serviceName);
 
-    // Calculate average response time
-    const avgResponseTime = metrics.length > 0 
-      ? metrics.reduce((sum, time) => sum + time, 0) / metrics.length 
-      : 0;
-
-    // Get connection statistics from database
+    // ── Connection counts ──
     let activeConnections = 0;
     let idleConnections = 0;
     let totalConnections = 0;
 
     try {
-      const client = await this.getServiceClient(serviceName);
+      const client = CLIENT_MAP[serviceName as ServiceName] as any;
       const result = await client.$queryRaw<any[]>`
-        SELECT 
-          count(*) FILTER (WHERE state = 'active') as active,
-          count(*) FILTER (WHERE state = 'idle') as idle,
-          count(*) as total
+        SELECT
+          count(*) FILTER (WHERE state = 'active') AS active,
+          count(*) FILTER (WHERE state = 'idle')   AS idle,
+          count(*)                                  AS total
         FROM pg_stat_activity
         WHERE datname = current_database()
       `;
+      const row = result[0];
+      activeConnections = Number(row?.active) || 0;
+      idleConnections   = Number(row?.idle)   || 0;
+      totalConnections  = Number(row?.total)  || 0;
+    } catch {
+      // pg_stat_activity may be unavailable in some environments; carry on.
+    }
+
+    // ── Derived metrics ──
+    const times = s.metrics.map(m => m.responseTime).sort((a, b) => a - b);
+    const avg   = times.length ? times.reduce((x, y) => x + y, 0) / times.length : 0;
+    const p95   = percentile(times, 95);
 
-      const stats = result[0];
-      activeConnections = Number(stats.active) || 0;
-      idleConnections = Number(stats.idle) || 0;
-      totalConnections = Number(stats.total) || 0;
-    } catch (error) {
-      console.error(`Error getting pool stats for ${serviceName}:`, error);
+    const utilizationPct = s.config.maxConnections > 0
+      ? Math.round((totalConnections / s.config.maxConnections) * 100)
+      : 0;
+
+    // Track utilisation for trend-based scaling.
+    s.utilizationHistory.push(utilizationPct);
+    if (s.utilizationHistory.length > UTILISATION_HISTORY_SIZE) {
+      s.utilizationHistory.shift();
     }
 
-    // Determine health status
-    let healthStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';
+    // ── Health status ──
+    let healthStatus: 'healthy' | 'degraded' | 'unhealthy';
     if (!healthCheck.isHealthy) {
       healthStatus = 'unhealthy';
-    } else if (avgResponseTime > 1000 || totalConnections > (config?.maxConnections || 20) * 0.8) {
+    } else if (
+      p95 > s.config.degradedResponseTimeMs ||
+      totalConnections > s.config.maxConnections * s.config.scaleUpThreshold
+    ) {
       healthStatus = 'degraded';
+    } else {
+      healthStatus = 'healthy';
     }
 
     return {
@@ -194,140 +334,241 @@ export class ConnectionPoolManager {
       totalConnections,
       activeConnections,
       idleConnections,
-      waitingRequests: 0, // Would need additional monitoring
-      avgResponseTime,
+      waitingRequests: 0,
+      avgResponseTime: Math.round(avg),
+      p95ResponseTime: p95,
       healthStatus,
       lastHealthCheck: new Date(),
+      utilizationPct,
+      resizeCount: s.resizeCount,
     };
   }
 
+  /** Return pool statistics for every service concurrently. */
   async getAllPoolStats(): Promise<PoolStats[]> {
-    const services = Array.from(this.configs.keys());
-    const stats = await Promise.all(
-      services.map(service => this.getPoolStats(service))
-    );
-
-    return stats;
+    return Promise.all(SERVICE_NAMES.map(s => this.getPoolStats(s)));
   }
 
-  updatePoolConfig(serviceName: string, config: Partial<PoolConfig>): void {
-    const currentConfig = this.configs.get(serviceName);
-    if (currentConfig) {
-      this.configs.set(serviceName, { ...currentConfig, ...config });
-    }
+  // ── Configuration management ────────────────────────────────────────────────
+
+  /** Partially update a service's pool configuration at runtime. */
+  updatePoolConfig(serviceName: string, patch: Partial<PoolConfig>): void {
+    const s = this.state.get(serviceName);
+    if (!s) return; // silently ignore unknown services (matches existing behaviour)
+    s.config = { ...s.config, ...patch };
   }
 
+  // ── Usage-pattern-based auto-scaling ───────────────────────────────────────
+
+  /**
+   * Inspect recent utilisation trends for every service and adjust
+   * `maxConnections` up or down accordingly.
+   *
+   * Scaling rules:
+   *  - Scale UP  when the *average* utilisation over the history window
+   *              exceeds `scaleUpThreshold` (default 85 %).
+   *  - Scale DOWN when the *average* utilisation is below `scaleDownThreshold`
+   *              (default 30 %) AND the current max is above the default.
+   *
+   * A 20 % headroom factor is added on scale-up so the pool isn't
+   * immediately under pressure again.
+   */
   async autoResizePools(): Promise<void> {
-    const stats = await this.getAllPoolStats();
-    
-    for (const stat of stats) {
-      const config = this.configs.get(stat.serviceName);
-      if (!config) continue;
-
-      // Auto-resize logic
-      if (stat.totalConnections > config.maxConnections * 0.9) {
-        // Consider increasing max connections
-        const newMax = Math.min(config.maxConnections * 1.5, 50);
-        this.updatePoolConfig(stat.serviceName, { maxConnections: newMax });
-        console.log(`🔧 Auto-resized ${stat.serviceName} max connections to ${newMax}`);
-      } else if (stat.totalConnections < config.minConnections * 2 && config.maxConnections > DEFAULT_CONFIG.maxConnections) {
-        // Consider decreasing max connections
-        const newMax = Math.max(config.maxConnections * 0.8, DEFAULT_CONFIG.maxConnections);
-        this.updatePoolConfig(stat.serviceName, { maxConnections: newMax });
-        console.log(`🔧 Auto-resized ${stat.serviceName} max connections to ${newMax}`);
+    const allStats = await this.getAllPoolStats();
+
+    for (const stat of allStats) {
+      const s = this.state.get(stat.serviceName);
+      if (!s) continue;
+
+      const { config } = s;
+      const history = s.utilizationHistory;
+      if (history.length < 2) continue; // not enough data yet
+
+      const avgUtilisation = history.reduce((a, b) => a + b, 0) / history.length;
+
+      if (avgUtilisation / 100 >= config.scaleUpThreshold) {
+        // ── Scale up ──
+        const desired = Math.round(config.maxConnections * 1.2); // +20 % headroom
+        const newMax  = clamp(desired, config.maxConnections + 1, config.hardMaxConnections);
+
+        if (newMax !== config.maxConnections) {
+          this.updatePoolConfig(stat.serviceName, { maxConnections: newMax });
+          s.resizeCount++;
+          console.log(
+            `[PoolManager] ↑ ${stat.serviceName}: maxConnections ${config.maxConnections} → ${newMax}` +
+            ` (avg utilisation ${avgUtilisation.toFixed(1)} %)`
+          );
+        }
+      } else if (avgUtilisation / 100 <= config.scaleDownThreshold && config.maxConnections > DEFAULT_CONFIG.maxConnections) {
+        // ── Scale down ──
+        const desired = Math.round(config.maxConnections * 0.8); // -20 %
+        const newMax  = clamp(desired, DEFAULT_CONFIG.maxConnections, config.maxConnections - 1);
+
+        if (newMax !== config.maxConnections) {
+          this.updatePoolConfig(stat.serviceName, { maxConnections: newMax });
+          s.resizeCount++;
+          console.log(
+            `[PoolManager] ↓ ${stat.serviceName}: maxConnections ${config.maxConnections} → ${newMax}` +
+            ` (avg utilisation ${avgUtilisation.toFixed(1)} %)`
+          );
+        }
       }
     }
   }
 
-  startHealthMonitoring(intervalMs: number = 60000): void {
-    console.log(`🏥 Starting health monitoring (interval: ${intervalMs}ms)`);
-    
-    // Clear existing intervals
-    this.healthCheckIntervals.forEach(interval => clearInterval(interval));
-    this.healthCheckIntervals.clear();
+  // ── Health monitoring ───────────────────────────────────────────────────────
 
-    // Start new health monitoring
-    const interval = setInterval(async () => {
-      try {
-        const healthChecks = await this.getAllHealthChecks();
-        const unhealthyServices = healthChecks.filter(check => !check.isHealthy);
-        
-        if (unhealthyServices.length > 0) {
-          console.warn(`⚠️ Unhealthy services detected:`, unhealthyServices);
-        }
+  /**
+   * Start a periodic background loop that:
+   *  1. Health-checks all services.
+   *  2. Logs any unhealthy services.
+   *  3. Emits performance alerts when p95 response times cross the threshold.
+   *  4. Calls `autoResizePools()` to keep pool sizes in line with demand.
+   */
+  startHealthMonitoring(intervalMs: number = DEFAULT_CONFIG.healthCheckIntervalMs): void {
+    console.log(`[PoolManager] Starting health monitoring (interval: ${intervalMs}ms)`);
+
+    // Clear any existing intervals first.
+    this.stopHealthMonitoring();
 
-        // Auto-resize pools based on current load
-        await this.autoResizePools();
-      } catch (error) {
-        console.error('Health monitoring error:', error);
+    const tick = setInterval(async () => {
+      try {
+        await this.runMonitoringCycle();
+      } catch (err) {
+        console.error('[PoolManager] Health monitoring error:', err);
       }
     }, intervalMs);
 
-    this.healthCheckIntervals.set('global', interval);
+    this.healthCheckIntervals.set('global', tick);
+  }
+
+  /** Single monitoring cycle — exposed so it can be called imperatively too. */
+  async runMonitoringCycle(): Promise<void> {
+    const [healthChecks, perfMetrics] = await Promise.all([
+      this.getAllHealthChecks(),
+      this.getPerformanceMetrics(),
+    ]);
+
+    // Report unhealthy services.
+    const unhealthy = healthChecks.filter(h => !h.isHealthy);
+    if (unhealthy.length > 0) {
+      console.warn('[PoolManager] Unhealthy services:', unhealthy.map(h => ({
+        service: h.serviceName,
+        error: h.error,
+        timedOut: h.timedOut,
+      })));
+    }
+
+    // Report slow-query alerts.
+    for (const [service, m] of Object.entries(perfMetrics)) {
+      if (m.slowQueryAlert) {
+        console.warn(
+          `[PoolManager] Slow-query alert on ${service}: ` +
+          `p95=${m.p95}ms (threshold: ${this.state.get(service)?.config.degradedResponseTimeMs}ms)`
+        );
+      }
+    }
+
+    await this.autoResizePools();
   }
 
   stopHealthMonitoring(): void {
-    this.healthCheckIntervals.forEach(interval => clearInterval(interval));
+    this.healthCheckIntervals.forEach(i => clearInterval(i));
     this.healthCheckIntervals.clear();
-    console.log('🛑 Health monitoring stopped');
+    console.log('[PoolManager] Health monitoring stopped');
   }
 
-  async getPerformanceMetrics(): Promise<Record<string, { avg: number; min: number; max: number; samples: number }>> {
-    const metrics: Record<string, { avg: number; min: number; max: number; samples: number }> = {};
+  // ── Performance metrics ─────────────────────────────────────────────────────
+
+  /**
+   * Return aggregated performance metrics for all services.
+   * Metrics are computed from the rolling 5-minute window only.
+   */
+  getPerformanceMetrics(): Record<string, ServicePerformanceMetrics> {
+    const result: Record<string, ServicePerformanceMetrics> = {};
+    const now = Date.now();
+    const recentCutoff = now - 60_000; // last 60 s = "recent"
+
+    for (const [serviceName, s] of this.state.entries()) {
+      const all    = s.metrics.map(m => m.responseTime);
+      const recent = s.metrics.filter(m => m.timestamp >= recentCutoff).map(m => m.responseTime);
 
-    for (const [serviceName, responseTimes] of this.performanceMetrics.entries()) {
-      if (responseTimes.length === 0) {
-        metrics[serviceName] = { avg: 0, min: 0, max: 0, samples: 0 };
+      if (all.length === 0) {
+        result[serviceName] = { avg: 0, min: 0, max: 0, p95: 0, samples: 0, recentSamples: 0, slowQueryAlert: false };
         continue;
       }
 
-      const avg = responseTimes.reduce((sum, time) => sum + time, 0) / responseTimes.length;
-      const min = Math.min(...responseTimes);
-      const max = Math.max(...responseTimes);
-
-      metrics[serviceName] = {
-        avg: Math.round(avg),
-        min,
-        max,
-        samples: responseTimes.length,
+      const sorted = [...all].sort((a, b) => a - b);
+      const avg    = Math.round(all.reduce((x, y) => x + y, 0) / all.length);
+      const p95    = percentile(sorted, 95);
+
+      result[serviceName] = {
+        avg,
+        min: sorted[0],
+        max: sorted[sorted.length - 1],
+        p95,
+        samples: all.length,
+        recentSamples: recent.length,
+        slowQueryAlert: p95 > s.config.degradedResponseTimeMs,
       };
     }
 
-    return metrics;
+    return result;
   }
 
+  // ── Reporting ───────────────────────────────────────────────────────────────
+
+  /** Print a formatted summary of pool stats and performance to the console. */
   async logDetailedStats(): Promise<void> {
-    const stats = await this.getAllPoolStats();
-    const performanceMetrics = await this.getPerformanceMetrics();
-
-    console.log('\n📊 Detailed Connection Pool Statistics:');
-    console.table(stats);
-    
-    console.log('\n⚡ Performance Metrics (response times in ms):');
-    console.table(performanceMetrics);
+    const [stats, perfMetrics] = await Promise.all([
+      this.getAllPoolStats(),
+      this.getPerformanceMetrics(),
+    ]);
+
+    console.log('\n[PoolManager] ── Pool Statistics ─────────────────────────');
+    console.table(
+      stats.map(s => ({
+        service: s.serviceName,
+        total: s.totalConnections,
+        active: s.activeConnections,
+        idle: s.idleConnections,
+        utilPct: `${s.utilizationPct}%`,
+        status: s.healthStatus,
+        resizes: s.resizeCount,
+      }))
+    );
+
+    console.log('\n[PoolManager] ── Performance Metrics (ms) ───────────────');
+    console.table(
+      Object.entries(perfMetrics).map(([svc, m]) => ({
+        service: svc,
+        avg: m.avg,
+        min: m.min,
+        max: m.max,
+        p95: m.p95,
+        samples: m.samples,
+        recentSamples: m.recentSamples,
+        alert: m.slowQueryAlert ? '⚠ slow' : 'ok',
+      }))
+    );
   }
 
+  // ── Cleanup ─────────────────────────────────────────────────────────────────
+
+  /** Stop monitoring and gracefully disconnect every Prisma client. */
   async cleanup(): Promise<void> {
     this.stopHealthMonitoring();
-    
-    // Disconnect all clients
-    const clients = [
-      userClient,
-      notificationClient,
-      documentClient,
-      utilityClient,
-      paymentClient,
-      billingClient,
-      analyticsClient,
-      webhookClient,
-    ];
 
     await Promise.all(
-      clients.map(client => client ? (client as any).$disconnect().catch(console.error) : Promise.resolve())
+      Object.values(CLIENT_MAP).map(client =>
+        client ? (client as any).$disconnect().catch(console.error) : Promise.resolve()
+      )
     );
 
-    console.log('🧹 Connection pool manager cleaned up');
+    console.log('[PoolManager] Cleanup complete');
   }
 }
 
+// ─── Singleton export ──────────────────────────────────────────────────────────
+
 export default new ConnectionPoolManager();