From a9ab5af9b30e4a0114dc0a299dac09084b5af7f2 Mon Sep 17 00:00:00 2001 From: MAGARET Date: Sat, 27 Jun 2026 10:53:20 +0000 Subject: [PATCH] docs: add monitoring and alerting guide --- deploy/monitoring/alert-rules.yml | 20 +++++ docs/operations/monitoring-and-alerting.md | 100 +++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 deploy/monitoring/alert-rules.yml create mode 100644 docs/operations/monitoring-and-alerting.md diff --git a/deploy/monitoring/alert-rules.yml b/deploy/monitoring/alert-rules.yml new file mode 100644 index 0000000..ccd40eb --- /dev/null +++ b/deploy/monitoring/alert-rules.yml @@ -0,0 +1,20 @@ +groups: + - name: pulsar-contract-alerts + rules: + - alert: HighPaymentErrorRate + expr: rate(payment_errors_total[5m]) / rate(payment_requests_total[5m]) > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: Payment processing error rate is above threshold + description: Investigate the contract or upstream token integration immediately. + + - alert: HighPaymentLatency + expr: histogram_quantile(0.95, sum by (le) (rate(payment_processing_seconds_bucket[10m]))) > 2 + for: 5m + labels: + severity: critical + annotations: + summary: Payment processing latency is elevated + description: Review recent deployments and contract invocation patterns. diff --git a/docs/operations/monitoring-and-alerting.md b/docs/operations/monitoring-and-alerting.md new file mode 100644 index 0000000..132bf5f --- /dev/null +++ b/docs/operations/monitoring-and-alerting.md @@ -0,0 +1,100 @@ +# Monitoring and Alerting + +This guide defines a practical baseline for monitoring the Pulsar payment-processing stack, including contract health, application performance, infrastructure, and operational alerts. + +## Goals + +- Track contract and application health in real time. +- Detect regressions in payment throughput, error rates, and latency. +- Provide actionable alerts for on-call responders. 
+- Keep alerting lightweight and compatible with common observability tools. + +## Metrics to collect + +### Application metrics + +- Request volume and success/error rate +- Payment processing latency (p50/p95/p99) +- Refund initiation and execution latency +- Contract invocation failures +- Authentication failures and rejected transactions + +### Infrastructure metrics + +- CPU usage per service/container +- Memory usage and pressure +- Disk usage and inode pressure +- Network I/O and dropped packets + +### Database and storage metrics + +- Connection pool saturation +- Query latency for contract state reads/writes +- Storage growth and compaction backlog + +### Business metrics + +- Payments per hour +- Refunds per hour +- Average payment value +- Failed payment rate +- Merchant registration rate + +## Recommended alert thresholds + +| Signal | Warning | Critical | +| --- | --- | --- | +| Error rate | > 2% for 10m | > 5% for 5m | +| P95 latency | > 800ms for 15m | > 2s for 5m | +| CPU usage | > 75% for 15m | > 90% for 5m | +| Memory usage | > 80% for 15m | > 95% for 5m | +| Disk usage | > 80% for 1h | > 95% for 15m | +| Payment throughput drop | < 80% of baseline for 15m | < 50% of baseline for 5m | + +## Alert routing + +- Primary channel: incident Slack channel +- Secondary channel: email and PagerDuty +- Escalation path: + 1. On-call engineer + 2. Backend lead + 3. Engineering manager + +## On-call rotation + +- Rotate weekly for the initial rollout. +- Keep a primary and secondary on-call contact. +- Document escalation steps in the incident runbook. 
+ +## Example Prometheus-style scrape config + +```yaml +scrape_configs: + - job_name: pulsar-api + static_configs: + - targets: ["api:9100"] + metrics_path: /metrics + scrape_interval: 15s +``` + +## Example alert rules + +```yaml +groups: + - name: pulsar-alerts + rules: + - alert: HighErrorRate + expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: Elevated request error rate +``` + +## Operational checklist + +- Verify dashboards are populated after deployment. +- Test each alert pathway at least once per quarter. +- Review thresholds after observing baseline behavior. +- Preserve runbooks and dashboard links in the incident channel.