Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ldk-server-protos/src/endpoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ pub const SPONTANEOUS_SEND_PATH: &str = "SpontaneousSend";
pub const SIGN_MESSAGE_PATH: &str = "SignMessage";
pub const VERIFY_SIGNATURE_PATH: &str = "VerifySignature";
pub const EXPORT_PATHFINDING_SCORES_PATH: &str = "ExportPathfindingScores";
pub const GET_METRICS_PATH: &str = "metrics";
16 changes: 15 additions & 1 deletion ldk-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ use crate::io::persist::{
use crate::service::NodeService;
use crate::util::config::{load_config, ArgsConfig, ChainSource};
use crate::util::logger::ServerLogger;
use crate::util::metrics::{Metrics, BUILD_METRICS_INTERVAL};
use crate::util::proto_adapter::{forwarded_payment_to_proto, payment_to_proto};
use crate::util::tls::get_or_generate_tls_config;

Expand Down Expand Up @@ -256,6 +257,19 @@ fn main() {
}
};
let event_node = Arc::clone(&node);

let metrics_node = Arc::clone(&node);
let mut interval = tokio::time::interval(BUILD_METRICS_INTERVAL);
let metrics = Arc::new(Metrics::new());
let metrics_bg = Arc::clone(&metrics);

runtime.spawn(async move {
loop {
interval.tick().await;
metrics_bg.update_service_health_score(&metrics_node);
}
});

let rest_svc_listener = TcpListener::bind(config_file.rest_service_addr)
.await
.expect("Failed to bind listening port");
Expand Down Expand Up @@ -415,7 +429,7 @@ fn main() {
res = rest_svc_listener.accept() => {
match res {
Ok((stream, _)) => {
let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone());
let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone(), Arc::clone(&metrics));
let acceptor = tls_acceptor.clone();
runtime.spawn(async move {
match acceptor.accept(stream).await {
Expand Down
24 changes: 19 additions & 5 deletions ldk-server/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use ldk_node::Node;
use ldk_server_protos::endpoints::{
BOLT11_RECEIVE_PATH, BOLT11_SEND_PATH, BOLT12_RECEIVE_PATH, BOLT12_SEND_PATH,
CLOSE_CHANNEL_PATH, CONNECT_PEER_PATH, EXPORT_PATHFINDING_SCORES_PATH,
FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH,
LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH,
ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH, SPLICE_OUT_PATH,
SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH,
FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_METRICS_PATH, GET_NODE_INFO_PATH,
GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH,
ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH,
SPLICE_OUT_PATH, SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH,
};
use prost::Message;

Expand Down Expand Up @@ -52,6 +52,7 @@ use crate::api::spontaneous_send::handle_spontaneous_send_request;
use crate::api::update_channel_config::handle_update_channel_config_request;
use crate::api::verify_signature::handle_verify_signature_request;
use crate::io::persist::paginated_kv_store::PaginatedKVStore;
use crate::util::metrics::Metrics;
use crate::util::proto_adapter::to_error_response;

// Maximum request body size: 10 MB
Expand All @@ -63,13 +64,15 @@ pub struct NodeService {
node: Arc<Node>,
paginated_kv_store: Arc<dyn PaginatedKVStore>,
api_key: String,
metrics: Arc<Metrics>,
}

impl NodeService {
pub(crate) fn new(
node: Arc<Node>, paginated_kv_store: Arc<dyn PaginatedKVStore>, api_key: String,
metrics: Arc<Metrics>,
) -> Self {
Self { node, paginated_kv_store, api_key }
Self { node, paginated_kv_store, api_key, metrics }
}
}

Expand Down Expand Up @@ -153,6 +156,17 @@ impl Service<Request<Incoming>> for NodeService {
type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;

fn call(&self, req: Request<Incoming>) -> Self::Future {
// Handle metrics endpoint separately to bypass auth and return plain text
if req.uri().path().len() > 1 && &req.uri().path()[1..] == GET_METRICS_PATH {
let metrics = Arc::clone(&self.metrics);
return Box::pin(async move {
Ok(Response::builder()
.header("Content-Type", "text/plain")
.body(Full::new(Bytes::from(metrics.gather_metrics())))
.unwrap())
});
}

// Extract auth params from headers (validation happens after body is read)
let auth_params = match extract_auth_params(&req) {
Ok(params) => params,
Expand Down
172 changes: 172 additions & 0 deletions ldk-server/src/util/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
// This file is Copyright its original authors, visible in version control
// history.
//
// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
// or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
// You may not use this file except in accordance with one or both of these
// licenses.

use std::sync::atomic::{AtomicI64, Ordering};
use std::time::Duration;

use ldk_node::Node;

pub const BUILD_METRICS_INTERVAL: Duration = Duration::from_secs(60);

/// This represents a [`Metrics`] type that can go up and down in value.
pub struct IntGauge {
inner: AtomicI64,
}

impl IntGauge {
pub fn new() -> Self {
Self { inner: AtomicI64::new(0) }
}

pub fn set(&self, value: i64) {
self.inner.store(value, Ordering::Relaxed);
}

pub fn get(&self) -> i64 {
self.inner.load(Ordering::Relaxed)
}
}

/// Represents the [`Metrics`] output values and type.
pub struct MetricsOutput {
name: String,
help_text: String,
metric_type: String,
value: String,
}

impl MetricsOutput {
pub fn new(name: &str, help_text: &str, metric_type: &str, value: &str) -> Self {
Self {
name: name.to_string(),
help_text: help_text.to_string(),
metric_type: metric_type.to_string(),
value: value.to_string(),
}
}
}

pub struct Metrics {
pub service_health_score: IntGauge,
}

impl Metrics {
pub fn new() -> Self {
Self { service_health_score: IntGauge::new() }
}

pub fn update_service_health_score(&self, node: &Node) {
let score = self.calculate_ldk_server_health_score(node);
self.service_health_score.set(score);
}

/// The health score computation is pretty basic for now and simply
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, is it customary to have such a score? I would find it very hard to interpret, tbh.?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, is it customary to have such a score?

I think it is as some users might want to rely on that to know how their node is performing per time at a glance.

I would find it very hard to interpret, tbh.?

The current computation is pretty basic and relies on the NodeStatus informations from ldk-node. Though, we might want to refine that to include more informations and decide the best weightage value to assign for each event.

/// calculated based on the impacted events on the components of the
/// `Node`. The events severity and weightage value are as follows:
///
/// - Critical: 0 (Total failure)
/// - Major: 35%
/// - Minor: 25%
///
/// Using the assigned score above, the health score of the `Node` is
/// computed as:
///
/// Health score = Maximum health score - Sum(Event severity score)
///
/// Where:
///
/// - Maximum health score = 100
///
/// If the `Node` is not running/online, i.e `is_running` is false,
/// the severity is critical with a weightage value of -100%.
///
/// If the `Node` is running but isn't connected to any peer yet,
/// the severity is major with a weightage value of -35%.
///
/// If the `Node` is running but the Lightning Wallet hasn't been synced
/// yet, the severity is minor with a weightage value of -25%.
pub fn calculate_ldk_server_health_score(&self, node: &Node) -> i64 {
Self::compute_health_score(
node.status().is_running,
!node.list_peers().is_empty(),
node.status().latest_lightning_wallet_sync_timestamp.is_some(),
)
}

pub fn format_metrics_output(&self, buffer: &mut String, options: &MetricsOutput) {
buffer.push_str(&format!("# HELP {} {}\n", options.name, options.help_text));
buffer.push_str(&format!("# TYPE {} {}\n", options.name, options.metric_type));
buffer.push_str(&format!("{} {}\n", options.name, options.value));
}

pub fn gather_metrics(&self) -> String {
let mut buffer = String::new();
let options = &MetricsOutput::new(
"ldk_server_health_score",
"Current health score (0-100)",
"gauge",
&self.service_health_score.get().to_string(),
);

self.format_metrics_output(&mut buffer, options);

buffer
}

fn compute_health_score(is_running: bool, has_peers: bool, is_wallet_synced: bool) -> i64 {
if !is_running {
return 0;
}

let mut health_score = 100;

if !has_peers {
health_score -= 35;
}

if !is_wallet_synced {
health_score -= 25;
}

health_score
}
}

#[cfg(test)]
mod tests {

use super::*;

#[test]
fn test_compute_health_score() {
// Node is not running
assert_eq!(Metrics::compute_health_score(false, true, true), 0);
assert_eq!(Metrics::compute_health_score(false, false, false), 0);

// Node is running, connected to a peer and wallet is synced
assert_eq!(Metrics::compute_health_score(true, true, true), 100);

// Node is running, not connected to a peer but wallet is synced
assert_eq!(Metrics::compute_health_score(true, false, true), 65);

// Node is running, connected to a peer but wallet is not synced
assert_eq!(Metrics::compute_health_score(true, true, false), 75);

// Node is running, not connected to a peer and wallet is not synced
assert_eq!(Metrics::compute_health_score(true, false, false), 40);
}

#[test]
fn test_gather_metrics_format() {
let metrics = Metrics::new();

let result = metrics.gather_metrics();
assert!(result.contains("ldk_server_health_score"));
}
}
1 change: 1 addition & 0 deletions ldk-server/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@

pub(crate) mod config;
pub(crate) mod logger;
pub(crate) mod metrics;
pub(crate) mod proto_adapter;
pub(crate) mod tls;