From 0422fcca905c45d8a02fd50fe1b17c9d2c7c5f09 Mon Sep 17 00:00:00 2001 From: ladinoraa Date: Fri, 26 Jun 2026 07:50:59 +0000 Subject: [PATCH] feat(monitoring): add dashboard templates for service metrics (#323) - Add monitoring/grafana-dashboard.json: Grafana dashboard with panels for RPC latency (p50/p95/p99), event polling rate, error rate, active proposals, votes cast, RPC uptime, and event polling lag - Add monitoring/alerts.yml: Prometheus alerting rules for high latency, high error rate, event polling lag, and RPC target down - Add monitoring/README.md: metrics reference table, instructions for connecting dashboards via UI import and provisioning, alert thresholds Closes #323 --- monitoring/README.md | 104 +++++++++++++++++ monitoring/alerts.yml | 47 ++++++++ monitoring/grafana-dashboard.json | 182 ++++++++++++++++++++++++++++++ 3 files changed, 333 insertions(+) create mode 100644 monitoring/README.md create mode 100644 monitoring/alerts.yml create mode 100644 monitoring/grafana-dashboard.json diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 0000000..2166044 --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,104 @@ +# Monitoring Dashboards + +This directory contains observability templates for CosmosVote deployments. + +| File | Description | +|------|-------------| +| `grafana-dashboard.json` | Grafana dashboard — import via UI or provisioning | +| `alerts.yml` | Prometheus alerting rules | + +--- + +## Metrics Reference + +The following metrics are expected to be exposed by the CosmosVote indexer/relayer service. Instrument your service using a Prometheus client library (e.g. [`prometheus`](https://crates.io/crates/prometheus) for Rust, [`prom-client`](https://github.com/siimon/prom-client) for Node.js). 
+ +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cosmosvote_rpc_requests_total` | Counter | `network`, `method` | Total RPC calls made | +| `cosmosvote_rpc_errors_total` | Counter | `network`, `method`, `code` | Failed RPC calls | +| `cosmosvote_rpc_request_duration_ms` | Histogram | `network`, `method` | RPC call latency in milliseconds | +| `cosmosvote_events_polled_total` | Counter | `network`, `event_type` | Raw events fetched from chain | +| `cosmosvote_events_processed_total` | Counter | `network`, `event_type` | Events successfully processed | +| `cosmosvote_event_poll_attempts_total` | Counter | `network` | Total polling attempts | +| `cosmosvote_event_poll_errors_total` | Counter | `network` | Polling attempts that failed | +| `cosmosvote_event_poll_lag_seconds` | Gauge | `network` | Seconds the poller is behind the chain tip | +| `cosmosvote_proposals_active` | Gauge | `network` | Number of currently active proposals | +| `cosmosvote_votes_cast_total` | Counter | `network` | Cumulative votes cast | + +--- + +## Connecting the Grafana Dashboard + +### Option 1 — Import via UI + +1. Open Grafana → **Dashboards → Import**. +2. Upload `grafana-dashboard.json` or paste its contents. +3. Select your **Prometheus** data source when prompted. +4. Click **Import**. + +### Option 2 — Provisioning (recommended for automated setups) + +Place the file in your Grafana provisioning directory: + +``` +/etc/grafana/provisioning/dashboards/cosmosvote/grafana-dashboard.json +``` + +Create a provisioning config at `/etc/grafana/provisioning/dashboards/cosmosvote.yaml`: + +```yaml +apiVersion: 1 +providers: + - name: cosmosvote + folder: CosmosVote + type: file + options: + path: /etc/grafana/provisioning/dashboards/cosmosvote +``` + +Restart Grafana. The dashboard will appear under the **CosmosVote** folder. 
+ +--- + +## Connecting Prometheus Alerts + +Add the alerts file to your Prometheus configuration: + +```yaml +# prometheus.yml +rule_files: + - /path/to/cosmosvote/monitoring/alerts.yml +``` + +Or, if using the Prometheus Operator (Kubernetes): + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cosmosvote-alerts +spec: + groups: [] # paste contents of alerts.yml groups here +``` + +Reload Prometheus (`SIGHUP` or `/-/reload` endpoint) to pick up the rules. + +--- + +## Dashboard Variables + +The dashboard exposes a **Network** template variable (`testnet` / `mainnet`) that filters all panels to a single deployment. Add additional values as you deploy to more networks. + +--- + +## Alerting Thresholds + +| Alert | Threshold | Severity | +|-------|-----------|----------| +| `HighRpcLatency` | p95 > 2 000 ms for 5 min | warning | +| `HighRpcErrorRate` | error rate > 5 % for 5 min | critical | +| `EventPollingLag` | lag > 300 s for 2 min | warning | +| `RpcTargetDown` | target unreachable for 1 min | critical | + +Adjust thresholds in `alerts.yml` to match your SLA requirements. 
diff --git a/monitoring/alerts.yml b/monitoring/alerts.yml
new file mode 100644
index 0000000..009909a
--- /dev/null
+++ b/monitoring/alerts.yml
@@ -0,0 +1,47 @@
+groups:
+  - name: cosmosvote
+    interval: 60s
+    rules:
+      # RPC latency p95 > 2000 ms for 5 minutes (histogram buckets are in ms)
+      - alert: HighRpcLatency
+        expr: |
+          histogram_quantile(0.95,
+            rate(cosmosvote_rpc_request_duration_ms_bucket[5m])
+          ) > 2000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "RPC p95 latency above 2 s on {{ $labels.network }}"
+          description: 'Current value: {{ $value | printf "%.0f" }} ms'
+
+      # RPC error rate > 5% for 5 minutes; summed per network so labels match
+      - alert: HighRpcErrorRate
+        expr: |
+          sum by (network) (rate(cosmosvote_rpc_errors_total[5m]))
+          / sum by (network) (rate(cosmosvote_rpc_requests_total[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "RPC error rate above 5% on {{ $labels.network }}"
+          description: "Current rate: {{ $value | humanizePercentage }}"
+
+      # Event poll lag > 300 s, sustained for 2 minutes
+      - alert: EventPollingLag
+        expr: cosmosvote_event_poll_lag_seconds > 300
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Event polling is lagging on {{ $labels.network }}"
+          description: "Lag: {{ $value | humanizeDuration }}"
+
+      # RPC target down (assumes the scrape target carries a `network` label)
+      - alert: RpcTargetDown
+        expr: up{job="cosmosvote-rpc"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CosmosVote RPC target is down on {{ $labels.network }}"
diff --git a/monitoring/grafana-dashboard.json b/monitoring/grafana-dashboard.json
new file mode 100644
index 0000000..a20556e
--- /dev/null
+++ b/monitoring/grafana-dashboard.json
@@ -0,0 +1,182 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "Prometheus data source",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__requires": [
+    { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.0.0" },
+    { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" },
+    { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" },
+    { "type": "panel", "id": "stat", "name": "Stat", "version": "" },
+    { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }
+  ],
+  "title": "CosmosVote Service Metrics",
+  "uid": "cosmosvote-overview",
+  "tags": ["cosmosvote", "governance", "stellar"],
+  "timezone": "browser",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "network",
+        "type": "custom", "query": "testnet,mainnet",
+        "label": "Network",
+        "options": [
+          { "text": "testnet", "value": "testnet" },
+          { "text": "mainnet", "value": "mainnet" }
+        ],
+        "current": { "text": "testnet", "value": "testnet" }
+      }
+    ]
+  },
+  "panels": [
+    {
+      "id": 1,
+      "title": "RPC Latency (p50 / p95 / p99)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": {
+        "defaults": { "unit": "ms", "thresholds": { "steps": [
+          { "color": "green", "value": null },
+          { "color": "yellow", "value": 500 },
+          { "color": "red", "value": 2000 }
+        ]}}
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
+          "legendFormat": "p50"
+        },
+        {
+          "expr": "histogram_quantile(0.95, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
+          "legendFormat": "p95"
+        },
+        {
+          "expr": "histogram_quantile(0.99, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
+          "legendFormat": "p99"
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "title": "Event Polling Rate (events/min)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": { "defaults": { "unit": "short" } },
+      "targets": [
+        {
+          "expr": "rate(cosmosvote_events_polled_total{network=\"$network\"}[1m]) * 60",
+          "legendFormat": "events/min"
+        },
+        {
+          "expr": "rate(cosmosvote_events_processed_total{network=\"$network\"}[1m]) * 60",
+          "legendFormat": "processed/min"
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "title": "Error Rate",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": { "defaults": { "unit": "percentunit" } },
+      "targets": [
+        {
+          "expr": "sum(rate(cosmosvote_rpc_errors_total{network=\"$network\"}[5m])) / sum(rate(cosmosvote_rpc_requests_total{network=\"$network\"}[5m]))",
+          "legendFormat": "RPC error rate"
+        },
+        {
+          "expr": "rate(cosmosvote_event_poll_errors_total{network=\"$network\"}[5m]) / rate(cosmosvote_event_poll_attempts_total{network=\"$network\"}[5m])",
+          "legendFormat": "Poll error rate"
+        }
+      ]
+    },
+    {
+      "id": 4,
+      "title": "Active Proposals",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 8 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "thresholds" } } },
+      "targets": [
+        {
+          "expr": "cosmosvote_proposals_active{network=\"$network\"}",
+          "legendFormat": "Active"
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "title": "Total Votes Cast",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 8 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": { "defaults": { "unit": "short" } },
+      "targets": [
+        {
+          "expr": "cosmosvote_votes_cast_total{network=\"$network\"}",
+          "legendFormat": "Votes"
+        }
+      ]
+    },
+    {
+      "id": 6,
+      "title": "RPC Uptime",
+      "type": "gauge",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 12 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0, "max": 1,
+          "thresholds": { "steps": [
+            { "color": "red", "value": null },
+            { "color": "yellow", "value": 0.95 },
+            { "color": "green", "value": 0.99 }
+          ]}
+        }
+      },
+      "targets": [
+        {
+          "expr": "avg_over_time(up{job=\"cosmosvote-rpc\",network=\"$network\"}[24h])",
+          "legendFormat": "24h uptime"
+        }
+      ]
+    },
+    {
+      "id": 7,
+      "title": "Event Polling Lag (seconds behind)",
+      "type": "gauge",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 12 },
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": { "steps": [
+            { "color": "green", "value": null },
+            { "color": "yellow", "value": 60 },
+            { "color": "red", "value": 300 }
+          ]}
+        }
+      },
+      "targets": [
+        {
+          "expr": "cosmosvote_event_poll_lag_seconds{network=\"$network\"}",
+          "legendFormat": "lag"
+        }
+      ]
+    }
+  ]
+}