Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions monitoring/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Monitoring Dashboards

This directory contains observability templates for CosmosVote deployments.

| File | Description |
|------|-------------|
| `grafana-dashboard.json` | Grafana dashboard — import via UI or provisioning |
| `alerts.yml` | Prometheus alerting rules |

---

## Metrics Reference

The following metrics are expected to be exposed by the CosmosVote indexer/relayer service. Instrument your service using a Prometheus client library (e.g. [`prometheus`](https://crates.io/crates/prometheus) for Rust, [`prom-client`](https://github.com/siimon/prom-client) for Node.js).

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `cosmosvote_rpc_requests_total` | Counter | `network`, `method` | Total RPC calls made |
| `cosmosvote_rpc_errors_total` | Counter | `network`, `method`, `code` | Failed RPC calls |
| `cosmosvote_rpc_request_duration_ms` | Histogram | `network`, `method` | RPC call latency in milliseconds |
| `cosmosvote_events_polled_total` | Counter | `network`, `event_type` | Raw events fetched from chain |
| `cosmosvote_events_processed_total` | Counter | `network`, `event_type` | Events successfully processed |
| `cosmosvote_event_poll_attempts_total` | Counter | `network` | Total polling attempts |
| `cosmosvote_event_poll_errors_total` | Counter | `network` | Polling attempts that failed |
| `cosmosvote_event_poll_lag_seconds` | Gauge | `network` | Seconds the poller is behind the chain tip |
| `cosmosvote_proposals_active` | Gauge | `network` | Number of currently active proposals |
| `cosmosvote_votes_cast_total` | Counter | `network` | Cumulative votes cast |

---

## Connecting the Grafana Dashboard

### Option 1 — Import via UI

1. Open Grafana → **Dashboards → Import**.
2. Upload `grafana-dashboard.json` or paste its contents.
3. Select your **Prometheus** data source when prompted.
4. Click **Import**.

### Option 2 — Provisioning (recommended for automated setups)

Place the file in your Grafana provisioning directory:

```
/etc/grafana/provisioning/dashboards/cosmosvote/grafana-dashboard.json
```

Create a provisioning config at `/etc/grafana/provisioning/dashboards/cosmosvote.yaml`:

```yaml
# Grafana dashboard provider: loads every dashboard JSON found under
# options.path and places it in the "CosmosVote" folder.
apiVersion: 1
providers:
  - name: cosmosvote
    folder: CosmosVote
    type: file
    options:
      # Directory that holds grafana-dashboard.json
      path: /etc/grafana/provisioning/dashboards/cosmosvote
```

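Restart Grafana. The dashboard will appear under the **CosmosVote** folder. Note that file provisioning does not process the `__inputs` section of the JSON, so replace `${DS_PROMETHEUS}` with the UID of your Prometheus data source before provisioning.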

---

## Connecting Prometheus Alerts

Add the alerts file to your Prometheus configuration:

```yaml
# prometheus.yml — register the CosmosVote alerting rules
rule_files:
  - /path/to/cosmosvote/monitoring/alerts.yml
```

Or, if using the Prometheus Operator (Kubernetes):

```yaml
# PrometheusRule custom resource consumed by the Prometheus Operator.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cosmosvote-alerts
spec:
  groups: []  # paste contents of alerts.yml groups here
```

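Reload Prometheus to pick up the rules: send `SIGHUP` to the process, or issue an HTTP `POST` to the `/-/reload` endpoint (available only when Prometheus is started with `--web.enable-lifecycle`).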

---

## Dashboard Variables

The dashboard exposes a **Network** template variable (`testnet` / `mainnet`) that filters all panels to a single deployment. Add additional values as you deploy to more networks.

---

## Alerting Thresholds

| Alert | Threshold | Severity |
|-------|-----------|----------|
| `HighRpcLatency` | p95 > 2 000 ms for 5 min | warning |
| `HighRpcErrorRate` | error rate > 5 % for 5 min | critical |
| `EventPollingLag` | lag > 300 s for 2 min | warning |
| `RpcTargetDown` | target unreachable for 1 min | critical |

Adjust thresholds in `alerts.yml` to match your SLA requirements.
47 changes: 47 additions & 0 deletions monitoring/alerts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Prometheus alerting rules for the CosmosVote indexer/relayer service.
# All rules live in one group that is evaluated every 60 seconds.
groups:
  - name: cosmosvote
    interval: 60s
    rules:
      # RPC latency: p95 above 2000 ms, sustained for 5 minutes.
      - alert: HighRpcLatency
        expr: |
          histogram_quantile(0.95,
            rate(cosmosvote_rpc_request_duration_ms_bucket[5m])
          ) > 2000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "RPC p95 latency above 2 s on {{ $labels.network }}"
          # The histogram is recorded in milliseconds; humanizeDuration
          # interprets its input as seconds, so print the raw ms value.
          description: 'Current value: {{ printf "%.0f" $value }} ms'

      # RPC error rate: more than 5% of calls failing, sustained for 5 minutes.
      # Both sides are aggregated to the `network` label before dividing:
      # the error counter carries an extra `code` label, so an unaggregated
      # division finds no matching series and the alert would never fire.
      - alert: HighRpcErrorRate
        expr: |
          sum by (network) (rate(cosmosvote_rpc_errors_total[5m]))
            / sum by (network) (rate(cosmosvote_rpc_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "RPC error rate above 5% on {{ $labels.network }}"
          description: "Current rate: {{ $value | humanizePercentage }}"

      # Event polling lag: poller more than 300 s behind, sustained for 2 minutes.
      - alert: EventPollingLag
        expr: cosmosvote_event_poll_lag_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Event polling is lagging on {{ $labels.network }}"
          description: "Lag: {{ $value | humanizeDuration }}"

      # RPC scrape target down for 1 minute.
      # NOTE(review): `up` only has a `network` label if the scrape config
      # attaches one to the target — confirm, otherwise the summary renders
      # an empty network name.
      - alert: RpcTargetDown
        expr: up{job="cosmosvote-rpc"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "CosmosVote RPC target is down on {{ $labels.network }}"
          description: "Instance {{ $labels.instance }} has been unreachable for 1 minute"
182 changes: 182 additions & 0 deletions monitoring/grafana-dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "Prometheus data source",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{ "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.0.0" },
{ "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" },
{ "type": "panel", "id": "timeseries", "name": "Time series", "version": "" },
{ "type": "panel", "id": "stat", "name": "Stat", "version": "" },
{ "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }
],
"title": "CosmosVote Service Metrics",
"uid": "cosmosvote-overview",
"tags": ["cosmosvote", "governance", "stellar"],
"timezone": "browser",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"templating": {
"list": [
{
"name": "network",
"type": "custom",
"label": "Network",
"options": [
{ "text": "testnet", "value": "testnet" },
{ "text": "mainnet", "value": "mainnet" }
],
"current": { "text": "testnet", "value": "testnet" }
}
]
},
"panels": [
{
"id": 1,
"title": "RPC Latency (p50 / p95 / p99)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": { "unit": "ms", "thresholds": { "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 500 },
{ "color": "red", "value": 2000 }
]}}
},
"targets": [
{
"expr": "histogram_quantile(0.50, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(cosmosvote_rpc_request_duration_ms_bucket{network=\"$network\"}[5m]))",
"legendFormat": "p99"
}
]
},
{
"id": 2,
"title": "Event Polling Rate (events/min)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{
"expr": "rate(cosmosvote_events_polled_total{network=\"$network\"}[1m]) * 60",
"legendFormat": "events/min"
},
{
"expr": "rate(cosmosvote_events_processed_total{network=\"$network\"}[1m]) * 60",
"legendFormat": "processed/min"
}
]
},
{
  "id": 3,
  "title": "Error Rate",
  "description": "Ratio of failed to total RPC calls and of failed to total poll attempts over 5 minutes. The RPC series are summed before dividing because the error counter carries an extra `code` label that the request counter does not have.",
  "type": "timeseries",
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
  "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
  "fieldConfig": { "defaults": { "unit": "percentunit" } },
  "targets": [
    {
      "expr": "sum(rate(cosmosvote_rpc_errors_total{network=\"$network\"}[5m])) / sum(rate(cosmosvote_rpc_requests_total{network=\"$network\"}[5m]))",
      "legendFormat": "RPC error rate"
    },
    {
      "expr": "rate(cosmosvote_event_poll_errors_total{network=\"$network\"}[5m]) / rate(cosmosvote_event_poll_attempts_total{network=\"$network\"}[5m])",
      "legendFormat": "Poll error rate"
    }
  ]
},
{
"id": 4,
"title": "Active Proposals",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "thresholds" } } },
"targets": [
{
"expr": "cosmosvote_proposals_active{network=\"$network\"}",
"legendFormat": "Active"
}
]
},
{
"id": 5,
"title": "Total Votes Cast",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 8 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"targets": [
{
"expr": "cosmosvote_votes_cast_total{network=\"$network\"}",
"legendFormat": "Votes"
}
]
},
{
"id": 6,
"title": "RPC Uptime",
"type": "gauge",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 12 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"min": 0, "max": 1,
"thresholds": { "steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.95 },
{ "color": "green", "value": 0.99 }
]}
}
},
"targets": [
{
"expr": "avg_over_time(up{job=\"cosmosvote-rpc\",network=\"$network\"}[24h])",
"legendFormat": "24h uptime"
}
]
},
{
"id": 7,
"title": "Event Polling Lag (seconds behind)",
"type": "gauge",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 12 },
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": { "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 60 },
{ "color": "red", "value": 300 }
]}
}
},
"targets": [
{
"expr": "cosmosvote_event_poll_lag_seconds{network=\"$network\"}",
"legendFormat": "lag"
}
]
}
]
}
Loading