From 80877d62910732108393924a80a8f381747848d5 Mon Sep 17 00:00:00 2001 From: Hydrax117 Date: Mon, 22 Jun 2026 00:17:10 +0100 Subject: [PATCH] Comprehensive Logging System Implementation --- backend/PULL_REQUEST.md | 362 ------------ backend/databases/ConnectionPoolManager.ts | 619 ++++++++++++++------- 2 files changed, 430 insertions(+), 551 deletions(-) delete mode 100644 backend/PULL_REQUEST.md diff --git a/backend/PULL_REQUEST.md b/backend/PULL_REQUEST.md deleted file mode 100644 index 89dc9ab..0000000 --- a/backend/PULL_REQUEST.md +++ /dev/null @@ -1,362 +0,0 @@ -# Pull Request: Comprehensive Logging System Implementation - -## Issue -Closes #266 - -## Summary -Implemented a complete, production-ready logging system with structured logging, multiple log levels, automatic rotation, aggregation, and real-time monitoring capabilities. - -## Problem Statement -The existing logging system was not comprehensive enough: -- Application events were not being logged consistently -- No structured logging format -- Missing log rotation and retention policies -- No log aggregation or analytics -- No monitoring or alerting capabilities - -## Solution -Developed a comprehensive logging system with the following components: - -### 1. Core Logging System (`middleware/logger.ts`) -- **Structured Logging**: JSON format with consistent schema -- **7 Log Levels**: error, warn, info, http, debug, verbose, silly -- **8 Log Categories**: application, security, audit, performance, business, database, external_api, webhook -- **Automatic Log Rotation**: Daily rotation with size limits (50MB) and compression -- **Context Propagation**: Maintain context across async operations using AsyncLocalStorage -- **Correlation IDs**: Track requests across services -- **Child Loggers**: Create loggers with default context -- **Global Error Handling**: Catch uncaught exceptions and unhandled rejections - -### 2. Log Monitoring (`middleware/logMonitoring.ts`) -- **Real-time Pattern Detection**: Identify critical events automatically -- **Configurable Alerts**: Set thresholds and time windows -- **Built-in Patterns**: Database errors, auth failures, payment issues, etc. -- **Alert Severity Levels**: low, medium, high, critical -- **External Integrations**: Slack, Datadog, Elasticsearch -- **Statistics Tracking**: Error rates, slow requests, etc. - -### 3. Log Aggregation (`middleware/logAggregation.ts`) -- **Efficient Collection**: In-memory log storage with configurable limits -- **Query Interface**: Filter by level, category, time range, search term -- **Automatic Aggregation**: Combine similar logs to reduce noise -- **Statistics Generation**: Real-time analytics - -### 4. Configuration (`config/loggingConfig.ts`) -- **Centralized Settings**: All configuration in one place -- **Environment-specific**: Different settings per environment -- **Retention Policies**: Configurable per log type (30-365 days) -- **Performance Thresholds**: Define what constitutes "slow" -- **External Service Config**: Slack, Datadog, Elasticsearch credentials - -## Changes Made - -### New Files Created (10 files) -1. ✅ `middleware/logger.ts` - Core logging system (650+ lines) -2. ✅ `middleware/logMonitoring.ts` - Monitoring and alerting (280+ lines) -3. ✅ `middleware/logAggregation.ts` - Log aggregation (140+ lines) -4. ✅ `config/loggingConfig.ts` - Configuration (100+ lines) -5. ✅ `middleware/README.md` - Complete documentation (400+ lines) -6. ✅ `middleware/MIGRATION_GUIDE.md` - Migration instructions (350+ lines) -7. ✅ `middleware/QUICK_REFERENCE.md` - Quick reference (200+ lines) -8. ✅ `middleware/IMPLEMENTATION_SUMMARY.md` - Implementation details -9. ✅ `middleware/loggingExample.ts` - 13 usage examples (450+ lines) -10. ✅ `tests/unit/logging.test.ts` - Comprehensive test suite (290+ lines) - -### Files Updated -- ✅ `.gitignore` - Added log directories and files - -## Features - -### Specialized Logging Methods -```typescript -// Security events -logger.security('Failed login attempt', { userId, ip }); - -// Audit trail -logger.audit({ - action: 'user.update', - resource: 'user', - resourceId: '123', - userId: 'admin', - result: 'success' -}); - -// Performance monitoring -logger.performance({ - operation: 'database-query', - duration: 1500, - threshold: 1000 -}); - -// Business metrics -logger.metric({ - name: 'payment.processed', - value: 99.99, - unit: 'USD', - tags: { method: 'credit_card' } -}); -``` - -### Log Rotation -- **Application logs**: 30 days retention, 50MB max size -- **Error logs**: 90 days retention, 50MB max size -- **Security logs**: 365 days retention, 50MB max size -- **Performance logs**: 14 days retention, 50MB max size -- **Metrics logs**: 30 days retention, 50MB max size -- All files automatically compressed after rotation - -### Monitoring & Alerting -- Built-in patterns for critical events -- Configurable thresholds and time windows -- Real-time statistics (error rate, slow requests, etc.) -- External service integration (Slack, Datadog, Elasticsearch) -- Custom pattern support - -### Context Propagation -```typescript -logContextStorage.run({ userId, operation }, async () => { - logger.info('Step 1'); // Automatically includes userId and operation - logger.info('Step 2'); // Automatically includes userId and operation -}); -``` - -## Testing - -### Test Coverage -- ✅ Logger unit tests (8 test cases) -- ✅ Monitor unit tests (6 test cases) -- ✅ Aggregator unit tests (8 test cases) -- ✅ Integration tests (1 test case) -- **Total**: 23+ test cases - -### Run Tests -```bash -npm test tests/unit/logging.test.ts -``` - -## Configuration - -### Environment Variables -```bash -# Logging Configuration -NODE_ENV=production -LOG_LEVEL=info -LOG_PATH=logs -LOG_SILENT=false - -# Monitoring -LOG_MONITORING_ENABLED=true - -# External Services (optional) -SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL -SLACK_CHANNEL=#alerts -ELASTICSEARCH_NODE=http://localhost:9200 -ELASTICSEARCH_INDEX=logs -DATADOG_API_KEY=your-api-key -DATADOG_APP_KEY=your-app-key - -# Service Identification -SERVICE_NAME=nepa-backend -``` - -## Usage - -### Express Integration -```typescript -import { loggingMiddleware, setupGlobalErrorHandling } from './middleware/logger'; - -const app = express(); - -// Apply logging middleware -app.use(loggingMiddleware); - -// Your routes here -app.get('/api/users', (req, res) => { - logger.info('Fetching users'); - res.json({ users: [] }); -}); - -// Setup global error handling -setupGlobalErrorHandling(app); -``` - -### Basic Logging -```typescript -import { logger } from './middleware/logger'; - -logger.info('User logged in', { userId: '123', ip: '192.168.1.1' }); -logger.error('Operation failed', { error, context }); -logger.debug('Debug information', { data }); -``` - -## Migration Path - -For existing code, follow the migration guide: -1. Update imports to use new logger -2. Apply logging middleware to Express app -3. Replace console.log calls with appropriate log levels -4. Add specialized logging (security, audit, performance, metrics) -5. Configure environment variables -6. Test thoroughly - -See `middleware/MIGRATION_GUIDE.md` for detailed instructions. - -## Documentation - -### Comprehensive Documentation Provided -- **README.md**: Complete user guide with examples and best practices -- **MIGRATION_GUIDE.md**: Step-by-step migration instructions -- **QUICK_REFERENCE.md**: Quick reference for common patterns -- **IMPLEMENTATION_SUMMARY.md**: Technical implementation details -- **loggingExample.ts**: 13 complete usage examples -- Inline code documentation with TypeScript types - -## Performance Considerations - -- ✅ Asynchronous file writes (non-blocking) -- ✅ Buffered metrics aggregation -- ✅ Automatic log rotation prevents disk issues -- ✅ Configurable log levels reduce overhead -- ✅ Efficient pattern matching -- ✅ Memory-bounded aggregation - -## Security Features - -- ✅ Sensitive data masking configuration -- ✅ Security event logging -- ✅ Audit trail with long retention (365 days) -- ✅ Authentication failure tracking -- ✅ IP and user agent logging -- ✅ Correlation IDs for forensics - -## Benefits - -1. **Better Debugging**: Correlation IDs track requests across services -2. **Compliance**: Audit logs with long retention for regulatory requirements -3. **Performance**: Identify slow operations automatically -4. **Alerting**: Get notified of critical issues in real-time -5. **Analytics**: Query and analyze logs programmatically -6. **Cost Savings**: Automatic rotation prevents disk space issues -7. **Security**: Track authentication failures and suspicious activity -8. **Observability**: Complete system visibility - -## Acceptance Criteria - -| Criteria | Status | Implementation | -|----------|--------|----------------| -| ✅ Implement structured logging | **DONE** | JSON format with consistent schema | -| ✅ Add log levels | **DONE** | 7 levels: error, warn, info, http, debug, verbose, silly | -| ✅ Include log rotation | **DONE** | Daily rotation with size limits and compression | -| ✅ Add log aggregation | **DONE** | Query interface with filtering and statistics | -| ✅ Include log monitoring | **DONE** | Pattern detection, alerting, external integrations | - -## Breaking Changes - -None. This is a new implementation that can coexist with existing logging until migration is complete. - -## Dependencies - -All required dependencies are already in `package.json`: -- ✅ `winston` (^3.11.0) -- ✅ `winston-daily-rotate-file` (^4.7.1) -- ✅ `cls-rtracer` (^2.6.0) - -No new dependencies need to be installed. - -## Rollout Plan - -### Phase 1: Staging Deployment -1. Deploy to staging environment -2. Configure environment variables -3. Monitor logs and alerts -4. Adjust thresholds as needed - -### Phase 2: Gradual Production Rollout -1. Enable on subset of production servers -2. Monitor performance impact -3. Verify external integrations (Slack, etc.) -4. Gradually roll out to all servers - -### Phase 3: Migration -1. Update existing code to use new logger -2. Remove old logging implementations -3. Train team on new logging patterns - -## Monitoring After Deployment - -Monitor these metrics: -- Log file sizes and rotation -- Error rates and patterns -- Alert frequency and accuracy -- Performance impact (should be minimal) -- Disk space usage - -## Rollback Plan - -If issues arise: -1. Set `LOG_SILENT=true` to disable logging -2. Revert to previous logging implementation -3. Investigate and fix issues -4. Redeploy - -## Screenshots/Examples - -See `middleware/loggingExample.ts` for 13 complete examples including: -- Basic logging -- Error logging -- Security logging -- Audit logging -- Performance logging -- Business metrics -- Database logging -- External API logging -- Webhook logging -- Context propagation -- Child loggers -- Request handlers - -## Checklist - -- [x] Code follows project style guidelines -- [x] Self-review completed -- [x] Code commented, particularly complex areas -- [x] Documentation updated -- [x] No new warnings generated -- [x] Tests added and passing -- [x] Dependent changes merged -- [x] Migration guide provided -- [x] Configuration documented -- [x] Performance impact assessed - -## Related Issues - -Closes #266 - -## Additional Notes - -This implementation provides a solid foundation for logging that can be extended in the future with: -- Cloud service integration (AWS CloudWatch, Azure Monitor) -- Machine learning for anomaly detection -- Advanced query language -- Log visualization dashboard -- Distributed tracing integration - -## Reviewers - -Please review: -- Code quality and architecture -- Documentation completeness -- Test coverage -- Configuration options -- Migration path - -## Questions for Reviewers - -1. Are the default retention periods appropriate for our compliance requirements? -2. Should we enable Slack alerts by default or keep them opt-in? -3. Are there additional log patterns we should monitor? -4. Should we add more specialized logging methods for specific use cases? - ---- - -**Ready for Review** ✅ diff --git a/backend/databases/ConnectionPoolManager.ts b/backend/databases/ConnectionPoolManager.ts index 21f6b18..4276709 100644 --- a/backend/databases/ConnectionPoolManager.ts +++ b/backend/databases/ConnectionPoolManager.ts @@ -1,3 +1,15 @@ +/** + * databases/ConnectionPoolManager.ts + * + * Manages Prisma database connection pools across all backend services. + * + * Key capabilities: + * - Usage-pattern-based pool sizing (auto-scale up/down based on rolling + * utilisation and p95 response time) + * - Per-service connection timeout enforcement + * - Rich performance monitoring (rolling window metrics, percentiles, alerts) + */ + import { userClient, notificationClient, @@ -9,14 +21,29 @@ import { webhookClient, } from './clients'; +// ─── Configuration ───────────────────────────────────────────────────────────── + export interface PoolConfig { minConnections: number; maxConnections: number; + /** How long (ms) to wait for a connection before giving up. */ connectionTimeoutMs: number; + /** How long (ms) an idle connection lives before being closed. */ idleTimeoutMs: number; + /** Interval (ms) between scheduled health checks. */ healthCheckIntervalMs: number; + /** Utilisation ratio (0–1) above which the pool is scaled up. Default 0.85 */ + scaleUpThreshold: number; + /** Utilisation ratio (0–1) below which the pool is scaled down. Default 0.30 */ + scaleDownThreshold: number; + /** p95 response-time ceiling (ms) before the pool is marked degraded. Default 800 */ + degradedResponseTimeMs: number; + /** Absolute maximum connections regardless of auto-scaling. Default 100 */ + hardMaxConnections: number; } +// ─── Stats / Results ─────────────────────────────────────────────────────────── + export interface PoolStats { serviceName: string; totalConnections: number; @@ -24,8 +51,14 @@ export interface PoolStats { idleConnections: number; waitingRequests: number; avgResponseTime: number; + /** p95 response time from the rolling metrics window. */ + p95ResponseTime: number; healthStatus: 'healthy' | 'degraded' | 'unhealthy'; lastHealthCheck: Date; + /** Current utilisation as a 0–100 percentage. */ + utilizationPct: number; + /** How many times the pool has been auto-scaled since startup. */ + resizeCount: number; } export interface HealthCheckResult { @@ -33,160 +66,267 @@ export interface HealthCheckResult { isHealthy: boolean; responseTime: number; error?: string; + timedOut?: boolean; +} + +/** Aggregated performance metrics for a single service. */ +export interface ServicePerformanceMetrics { + avg: number; + min: number; + max: number; + p95: number; + samples: number; + /** Samples collected in the last `windowMs` milliseconds. */ + recentSamples: number; + /** Alert: true when p95 exceeds the configured `degradedResponseTimeMs`. */ + slowQueryAlert: boolean; +} + +// ─── Internal bookkeeping ────────────────────────────────────────────────────── + +interface TimestampedMetric { + responseTime: number; + timestamp: number; // epoch ms +} + +interface ServiceState { + config: PoolConfig; + metrics: TimestampedMetric[]; + resizeCount: number; + /** Running utilisation samples used for trend-based scaling decisions. */ + utilizationHistory: number[]; } +// ─── Defaults ────────────────────────────────────────────────────────────────── + const DEFAULT_CONFIG: PoolConfig = { minConnections: 2, maxConnections: 20, - connectionTimeoutMs: 30000, - idleTimeoutMs: 300000, - healthCheckIntervalMs: 60000, + connectionTimeoutMs: 30_000, + idleTimeoutMs: 300_000, + healthCheckIntervalMs: 60_000, + scaleUpThreshold: 0.85, + scaleDownThreshold: 0.30, + degradedResponseTimeMs: 800, + hardMaxConnections: 100, }; +/** Rolling window kept for performance metrics (5 minutes). */ +const METRICS_WINDOW_MS = 5 * 60 * 1_000; +/** Maximum samples retained per service regardless of age. */ +const MAX_METRICS_SAMPLES = 500; +/** Number of recent utilisation samples used for scaling decisions. */ +const UTILISATION_HISTORY_SIZE = 10; + +// ─── Service registry ────────────────────────────────────────────────────────── + +const CLIENT_MAP = { + 'user-service': userClient, + 'notification-service': notificationClient, + 'document-service': documentClient, + 'utility-service': utilityClient, + 'payment-service': paymentClient, + 'billing-service': billingClient, + 'analytics-service': analyticsClient, + 'webhook-service': webhookClient, +} as const; + +type ServiceName = keyof typeof CLIENT_MAP; +const SERVICE_NAMES = Object.keys(CLIENT_MAP) as ServiceName[]; + +// ─── Helpers ─────────────────────────────────────────────────────────────────── + +/** Calculate the p-th percentile of a sorted numeric array (ascending). */ +function percentile(sortedArr: number[], p: number): number { + if (sortedArr.length === 0) return 0; + const idx = Math.ceil((p / 100) * sortedArr.length) - 1; + return sortedArr[Math.max(0, Math.min(idx, sortedArr.length - 1))]; +} + +/** Clamp a number between min and max inclusive. */ +function clamp(value: number, min: number, max: number): number { + return Math.min(Math.max(value, min), max); +} + +/** Promise that rejects after `ms` milliseconds with a timeout error. */ +function rejectAfter(ms: number): Promise { + return new Promise((_, reject) => + setTimeout(() => reject(new Error(`Connection timeout after ${ms}ms`)), ms) + ); +} + +// ─── Main class ──────────────────────────────────────────────────────────────── + export class ConnectionPoolManager { - private pools = new Map(); - private configs = new Map(); - private healthCheckIntervals = new Map(); - private performanceMetrics = new Map(); - private readonly MAX_METRICS_SAMPLES = 100; + /** Per-service state (config + rolling metrics + resize counter). */ + private readonly state = new Map(); + + /** Active setInterval handles for health monitoring. */ + private readonly healthCheckIntervals = new Map(); constructor() { - this.initializePools(); + this.initializeServices(); } - private initializePools(): void { - const services = [ - 'user-service', - 'notification-service', - 'document-service', - 'utility-service', - 'payment-service', - 'billing-service', - 'analytics-service', - 'webhook-service', - ]; - - services.forEach(service => { - this.configs.set(service, { ...DEFAULT_CONFIG }); - this.performanceMetrics.set(service, []); - }); + // ── Initialisation ────────────────────────────────────────────────────────── + + private initializeServices(): void { + for (const name of SERVICE_NAMES) { + this.state.set(name, { + config: { ...DEFAULT_CONFIG }, + metrics: [], + resizeCount: 0, + utilizationHistory: [], + }); + } } + private getState(serviceName: string): ServiceState { + const s = this.state.get(serviceName); + if (!s) throw new Error(`Unknown service: ${serviceName}`); + return s; + } + + // ── Client access with timeout ────────────────────────────────────────────── + + /** + * Retrieve the Prisma client for `serviceName`, recording a performance + * sample and enforcing the configured connection timeout. + */ async getServiceClient(serviceName: string): Promise { - const clientMap = { - 'user-service': userClient, - 'notification-service': notificationClient, - 'document-service': documentClient, - 'utility-service': utilityClient, - 'payment-service': paymentClient, - 'billing-service': billingClient, - 'analytics-service': analyticsClient, - 'webhook-service': webhookClient, - }; + const client = CLIENT_MAP[serviceName as ServiceName]; + if (!client) throw new Error(`Unknown service: ${serviceName}`); - const client = clientMap[serviceName as keyof typeof clientMap]; - if (!client) { - throw new Error(`Unknown service: ${serviceName}`); - } + const { config } = this.getState(serviceName); + const start = Date.now(); - // Record performance metrics - const startTime = Date.now(); - try { - // Ensure connection is healthy before returning - await this.performHealthCheck(serviceName); + // Enforce connection timeout: health check must complete within the limit. + await Promise.race([ + this.performHealthCheck(serviceName), + rejectAfter(config.connectionTimeoutMs), + ]); + return client; } finally { - const responseTime = Date.now() - startTime; - this.recordPerformanceMetric(serviceName, responseTime); + this.recordMetric(serviceName, Date.now() - start); } } - private recordPerformanceMetric(serviceName: string, responseTime: number): void { - const metrics = this.performanceMetrics.get(serviceName) || []; - metrics.push(responseTime); - - // Keep only the last MAX_METRICS_SAMPLES - if (metrics.length > this.MAX_METRICS_SAMPLES) { - metrics.shift(); + // ── Metrics recording ─────────────────────────────────────────────────────── + + private recordMetric(serviceName: string, responseTime: number): void { + const s = this.state.get(serviceName); + if (!s) return; + + const now = Date.now(); + s.metrics.push({ responseTime, timestamp: now }); + + // 1. Evict samples outside the rolling window. + const cutoff = now - METRICS_WINDOW_MS; + s.metrics = s.metrics.filter(m => m.timestamp >= cutoff); + + // 2. Hard cap to avoid unbounded growth during very high throughput. + if (s.metrics.length > MAX_METRICS_SAMPLES) { + s.metrics.splice(0, s.metrics.length - MAX_METRICS_SAMPLES); } - - this.performanceMetrics.set(serviceName, metrics); } + // ── Health check ──────────────────────────────────────────────────────────── + + /** + * Probe a single service with a lightweight SQL round-trip. + * Returns the result even on failure (never throws). + */ async performHealthCheck(serviceName: string): Promise { - const startTime = Date.now(); - let isHealthy = false; - let error: string | undefined; + const start = Date.now(); + + // Guard against infinite recursion: use the raw client directly here + // instead of going through getServiceClient(). + const client = CLIENT_MAP[serviceName as ServiceName]; + if (!client) { + return { serviceName, isHealthy: false, responseTime: 0, error: `Unknown service: ${serviceName}` }; + } try { - const client = await this.getServiceClient(serviceName); - await client.$queryRaw`SELECT 1`; - isHealthy = true; + await (client as any).$queryRaw`SELECT 1`; + return { serviceName, isHealthy: true, responseTime: Date.now() - start }; } catch (err) { - error = err instanceof Error ? err.message : 'Unknown error'; - isHealthy = false; + const isTimeout = err instanceof Error && err.message.includes('timeout'); + return { + serviceName, + isHealthy: false, + responseTime: Date.now() - start, + error: err instanceof Error ? err.message : 'Unknown error', + timedOut: isTimeout, + }; } - - const responseTime = Date.now() - startTime; - - return { - serviceName, - isHealthy, - responseTime, - error, - }; } + /** Health-check every registered service concurrently. */ async getAllHealthChecks(): Promise { - const services = Array.from(this.configs.keys()); - const healthChecks = await Promise.all( - services.map(service => this.performHealthCheck(service)) - ); - - return healthChecks; + return Promise.all(SERVICE_NAMES.map(s => this.performHealthCheck(s))); } + // ── Pool statistics ───────────────────────────────────────────────────────── + + /** + * Return live pool statistics for a single service, including real + * connection counts queried from `pg_stat_activity`. + */ async getPoolStats(serviceName: string): Promise { - const config = this.configs.get(serviceName); - const metrics = this.performanceMetrics.get(serviceName) || []; + const s = this.getState(serviceName); const healthCheck = await this.performHealthCheck(serviceName); - // Calculate average response time - const avgResponseTime = metrics.length > 0 - ? metrics.reduce((sum, time) => sum + time, 0) / metrics.length - : 0; - - // Get connection statistics from database + // ── Connection counts ── let activeConnections = 0; let idleConnections = 0; let totalConnections = 0; try { - const client = await this.getServiceClient(serviceName); + const client = CLIENT_MAP[serviceName as ServiceName] as any; const result = await client.$queryRaw` - SELECT - count(*) FILTER (WHERE state = 'active') as active, - count(*) FILTER (WHERE state = 'idle') as idle, - count(*) as total + SELECT + count(*) FILTER (WHERE state = 'active') AS active, + count(*) FILTER (WHERE state = 'idle') AS idle, + count(*) AS total FROM pg_stat_activity WHERE datname = current_database() `; + const row = result[0]; + activeConnections = Number(row?.active) || 0; + idleConnections = Number(row?.idle) || 0; + totalConnections = Number(row?.total) || 0; + } catch { + // pg_stat_activity may be unavailable in some environments; carry on. + } + + // ── Derived metrics ── + const times = s.metrics.map(m => m.responseTime).sort((a, b) => a - b); + const avg = times.length ? times.reduce((x, y) => x + y, 0) / times.length : 0; + const p95 = percentile(times, 95); - const stats = result[0]; - activeConnections = Number(stats.active) || 0; - idleConnections = Number(stats.idle) || 0; - totalConnections = Number(stats.total) || 0; - } catch (error) { - console.error(`Error getting pool stats for ${serviceName}:`, error); + const utilizationPct = s.config.maxConnections > 0 + ? Math.round((totalConnections / s.config.maxConnections) * 100) + : 0; + + // Track utilisation for trend-based scaling. + s.utilizationHistory.push(utilizationPct); + if (s.utilizationHistory.length > UTILISATION_HISTORY_SIZE) { + s.utilizationHistory.shift(); } - // Determine health status - let healthStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy'; + // ── Health status ── + let healthStatus: 'healthy' | 'degraded' | 'unhealthy'; if (!healthCheck.isHealthy) { healthStatus = 'unhealthy'; - } else if (avgResponseTime > 1000 || totalConnections > (config?.maxConnections || 20) * 0.8) { + } else if ( + p95 > s.config.degradedResponseTimeMs || + totalConnections > s.config.maxConnections * s.config.scaleUpThreshold + ) { healthStatus = 'degraded'; + } else { + healthStatus = 'healthy'; } return { @@ -194,140 +334,241 @@ export class ConnectionPoolManager { totalConnections, activeConnections, idleConnections, - waitingRequests: 0, // Would need additional monitoring - avgResponseTime, + waitingRequests: 0, + avgResponseTime: Math.round(avg), + p95ResponseTime: p95, healthStatus, lastHealthCheck: new Date(), + utilizationPct, + resizeCount: s.resizeCount, }; } + /** Return pool statistics for every service concurrently. */ async getAllPoolStats(): Promise { - const services = Array.from(this.configs.keys()); - const stats = await Promise.all( - services.map(service => this.getPoolStats(service)) - ); - - return stats; + return Promise.all(SERVICE_NAMES.map(s => this.getPoolStats(s))); } - updatePoolConfig(serviceName: string, config: Partial): void { - const currentConfig = this.configs.get(serviceName); - if (currentConfig) { - this.configs.set(serviceName, { ...currentConfig, ...config }); - } + // ── Configuration management ──────────────────────────────────────────────── + + /** Partially update a service's pool configuration at runtime. */ + updatePoolConfig(serviceName: string, patch: Partial): void { + const s = this.state.get(serviceName); + if (!s) return; // silently ignore unknown services (matches existing behaviour) + s.config = { ...s.config, ...patch }; } + // ── Usage-pattern-based auto-scaling ─────────────────────────────────────── + + /** + * Inspect recent utilisation trends for every service and adjust + * `maxConnections` up or down accordingly. + * + * Scaling rules: + * - Scale UP when the *average* utilisation over the history window + * exceeds `scaleUpThreshold` (default 85 %). + * - Scale DOWN when the *average* utilisation is below `scaleDownThreshold` + * (default 30 %) AND the current max is above the default. + * + * A 20 % headroom factor is added on scale-up so the pool isn't + * immediately under pressure again. + */ async autoResizePools(): Promise { - const stats = await this.getAllPoolStats(); - - for (const stat of stats) { - const config = this.configs.get(stat.serviceName); - if (!config) continue; - - // Auto-resize logic - if (stat.totalConnections > config.maxConnections * 0.9) { - // Consider increasing max connections - const newMax = Math.min(config.maxConnections * 1.5, 50); - this.updatePoolConfig(stat.serviceName, { maxConnections: newMax }); - console.log(`🔧 Auto-resized ${stat.serviceName} max connections to ${newMax}`); - } else if (stat.totalConnections < config.minConnections * 2 && config.maxConnections > DEFAULT_CONFIG.maxConnections) { - // Consider decreasing max connections - const newMax = Math.max(config.maxConnections * 0.8, DEFAULT_CONFIG.maxConnections); - this.updatePoolConfig(stat.serviceName, { maxConnections: newMax }); - console.log(`🔧 Auto-resized ${stat.serviceName} max connections to ${newMax}`); + const allStats = await this.getAllPoolStats(); + + for (const stat of allStats) { + const s = this.state.get(stat.serviceName); + if (!s) continue; + + const { config } = s; + const history = s.utilizationHistory; + if (history.length < 2) continue; // not enough data yet + + const avgUtilisation = history.reduce((a, b) => a + b, 0) / history.length; + + if (avgUtilisation / 100 >= config.scaleUpThreshold) { + // ── Scale up ── + const desired = Math.round(config.maxConnections * 1.2); // +20 % headroom + const newMax = clamp(desired, config.maxConnections + 1, config.hardMaxConnections); + + if (newMax !== config.maxConnections) { + this.updatePoolConfig(stat.serviceName, { maxConnections: newMax }); + s.resizeCount++; + console.log( + `[PoolManager] ↑ ${stat.serviceName}: maxConnections ${config.maxConnections} → ${newMax}` + + ` (avg utilisation ${avgUtilisation.toFixed(1)} %)` + ); + } + } else if (avgUtilisation / 100 <= config.scaleDownThreshold && config.maxConnections > DEFAULT_CONFIG.maxConnections) { + // ── Scale down ── + const desired = Math.round(config.maxConnections * 0.8); // -20 % + const newMax = clamp(desired, DEFAULT_CONFIG.maxConnections, config.maxConnections - 1); + + if (newMax !== config.maxConnections) { + this.updatePoolConfig(stat.serviceName, { maxConnections: newMax }); + s.resizeCount++; + console.log( + `[PoolManager] ↓ ${stat.serviceName}: maxConnections ${config.maxConnections} → ${newMax}` + + ` (avg utilisation ${avgUtilisation.toFixed(1)} %)` + ); + } } } } - startHealthMonitoring(intervalMs: number = 60000): void { - console.log(`🏥 Starting health monitoring (interval: ${intervalMs}ms)`); - - // Clear existing intervals - this.healthCheckIntervals.forEach(interval => clearInterval(interval)); - this.healthCheckIntervals.clear(); + // ── Health monitoring ─────────────────────────────────────────────────────── - // Start new health monitoring - const interval = setInterval(async () => { - try { - const healthChecks = await this.getAllHealthChecks(); - const unhealthyServices = healthChecks.filter(check => !check.isHealthy); - - if (unhealthyServices.length > 0) { - console.warn(`⚠️ Unhealthy services detected:`, unhealthyServices); - } + /** + * Start a periodic background loop that: + * 1. Health-checks all services. + * 2. Logs any unhealthy services. + * 3. Emits performance alerts when p95 response times cross the threshold. + * 4. Calls `autoResizePools()` to keep pool sizes in line with demand. + */ + startHealthMonitoring(intervalMs: number = DEFAULT_CONFIG.healthCheckIntervalMs): void { + console.log(`[PoolManager] Starting health monitoring (interval: ${intervalMs}ms)`); + + // Clear any existing intervals first. + this.stopHealthMonitoring(); - // Auto-resize pools based on current load - await this.autoResizePools(); - } catch (error) { - console.error('Health monitoring error:', error); + const tick = setInterval(async () => { + try { + await this.runMonitoringCycle(); + } catch (err) { + console.error('[PoolManager] Health monitoring error:', err); } }, intervalMs); - this.healthCheckIntervals.set('global', interval); + this.healthCheckIntervals.set('global', tick); + } + + /** Single monitoring cycle — exposed so it can be called imperatively too. */ + async runMonitoringCycle(): Promise { + const [healthChecks, perfMetrics] = await Promise.all([ + this.getAllHealthChecks(), + this.getPerformanceMetrics(), + ]); + + // Report unhealthy services. + const unhealthy = healthChecks.filter(h => !h.isHealthy); + if (unhealthy.length > 0) { + console.warn('[PoolManager] Unhealthy services:', unhealthy.map(h => ({ + service: h.serviceName, + error: h.error, + timedOut: h.timedOut, + }))); + } + + // Report slow-query alerts. + for (const [service, m] of Object.entries(perfMetrics)) { + if (m.slowQueryAlert) { + console.warn( + `[PoolManager] Slow-query alert on ${service}: ` + + `p95=${m.p95}ms (threshold: ${this.state.get(service)?.config.degradedResponseTimeMs}ms)` + ); + } + } + + await this.autoResizePools(); } stopHealthMonitoring(): void { - this.healthCheckIntervals.forEach(interval => clearInterval(interval)); + this.healthCheckIntervals.forEach(i => clearInterval(i)); this.healthCheckIntervals.clear(); - console.log('🛑 Health monitoring stopped'); + console.log('[PoolManager] Health monitoring stopped'); } - async getPerformanceMetrics(): Promise> { - const metrics: Record = {}; + // ── Performance metrics ───────────────────────────────────────────────────── + + /** + * Return aggregated performance metrics for all services. + * Metrics are computed from the rolling 5-minute window only. + */ + getPerformanceMetrics(): Record { + const result: Record = {}; + const now = Date.now(); + const recentCutoff = now - 60_000; // last 60 s = "recent" + + for (const [serviceName, s] of this.state.entries()) { + const all = s.metrics.map(m => m.responseTime); + const recent = s.metrics.filter(m => m.timestamp >= recentCutoff).map(m => m.responseTime); - for (const [serviceName, responseTimes] of this.performanceMetrics.entries()) { - if (responseTimes.length === 0) { - metrics[serviceName] = { avg: 0, min: 0, max: 0, samples: 0 }; + if (all.length === 0) { + result[serviceName] = { avg: 0, min: 0, max: 0, p95: 0, samples: 0, recentSamples: 0, slowQueryAlert: false }; continue; } - const avg = responseTimes.reduce((sum, time) => sum + time, 0) / responseTimes.length; - const min = Math.min(...responseTimes); - const max = Math.max(...responseTimes); - - metrics[serviceName] = { - avg: Math.round(avg), - min, - max, - samples: responseTimes.length, + const sorted = [...all].sort((a, b) => a - b); + const avg = Math.round(all.reduce((x, y) => x + y, 0) / all.length); + const p95 = percentile(sorted, 95); + + result[serviceName] = { + avg, + min: sorted[0], + max: sorted[sorted.length - 1], + p95, + samples: all.length, + recentSamples: recent.length, + slowQueryAlert: p95 > s.config.degradedResponseTimeMs, }; } - return metrics; + return result; } + // ── Reporting ─────────────────────────────────────────────────────────────── + + /** Print a formatted summary of pool stats and performance to the console. */ async logDetailedStats(): Promise { - const stats = await this.getAllPoolStats(); - const performanceMetrics = await this.getPerformanceMetrics(); - - console.log('\n📊 Detailed Connection Pool Statistics:'); - console.table(stats); - - console.log('\n⚡ Performance Metrics (response times in ms):'); - console.table(performanceMetrics); + const [stats, perfMetrics] = await Promise.all([ + this.getAllPoolStats(), + this.getPerformanceMetrics(), + ]); + + console.log('\n[PoolManager] ── Pool Statistics ─────────────────────────'); + console.table( + stats.map(s => ({ + service: s.serviceName, + total: s.totalConnections, + active: s.activeConnections, + idle: s.idleConnections, + utilPct: `${s.utilizationPct}%`, + status: s.healthStatus, + resizes: s.resizeCount, + })) + ); + + console.log('\n[PoolManager] ── Performance Metrics (ms) ───────────────'); + console.table( + Object.entries(perfMetrics).map(([svc, m]) => ({ + service: svc, + avg: m.avg, + min: m.min, + max: m.max, + p95: m.p95, + samples: m.samples, + recentSamples: m.recentSamples, + alert: m.slowQueryAlert ? '⚠ slow' : 'ok', + })) + ); } + // ── Cleanup ───────────────────────────────────────────────────────────────── + + /** Stop monitoring and gracefully disconnect every Prisma client. */ async cleanup(): Promise { this.stopHealthMonitoring(); - - // Disconnect all clients - const clients = [ - userClient, - notificationClient, - documentClient, - utilityClient, - paymentClient, - billingClient, - analyticsClient, - webhookClient, - ]; await Promise.all( - clients.map(client => client ? (client as any).$disconnect().catch(console.error) : Promise.resolve()) + Object.values(CLIENT_MAP).map(client => + client ? (client as any).$disconnect().catch(console.error) : Promise.resolve() + ) ); - console.log('🧹 Connection pool manager cleaned up'); + console.log('[PoolManager] Cleanup complete'); } } +// ─── Singleton export ────────────────────────────────────────────────────────── + export default new ConnectionPoolManager();