diff --git a/NEWS b/NEWS index 748ae30eb2..31593dabc0 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,12 @@ Post v26.03.0 ------------- - Added ability to set any "ipsec_*" NB_Global option to configure the IPsec backend. + - Added nb_cfg_timestamp to SB_Global, written atomically with each + nb_cfg update by ovn-northd. ovn-controller propagates this value + to the local OVS bridge external_ids as "ovn-nb-cfg-sb-ts". In + large deployments where options:enable_chassis_nb_cfg_update is + false, this enables per-chassis propagation latency tracking without + any southbound database writes. - Documented missing ovn-nbctl commands: "mirror-rule-add", "mirror-rule-del", "lr-nat-update-ext-ip", "ha-chassis-group-set-chassis-prio", "lsp-add-router-port", diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index 57e7cf5dd2..ec713ce4f7 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -658,6 +658,28 @@

+
+ external-ids:ovn-nb-cfg-sb-ts in the Bridge + table +
+ +
+

+ This key records the value of + OVN_Southbound.SB_Global.nb_cfg_timestamp that + corresponded to the ovn-nb-cfg generation for which all + flows have been successfully installed in OVS. It is the wall-clock + time (in milliseconds since the Unix epoch) at which + ovn-northd wrote that nb_cfg value to the + Southbound database. The difference between this timestamp and + ovn-nb-cfg-ts gives the per-chassis end-to-end + propagation latency for that configuration generation. The key is + absent when no timestamp is available (e.g. when + NB_Global.options:enable_chassis_nb_cfg_update is + false). +

+
+
external_ids:ovn-installed and external_ids:ovn-installed-ts in the diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index fd848c54c6..405c985d47 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -141,6 +141,7 @@ static unixctl_cb_func debug_delay_nb_cfg_report; #define OVS_NB_CFG_NAME "ovn-nb-cfg" #define OVS_NB_CFG_TS_NAME "ovn-nb-cfg-ts" +#define OVS_NB_CFG_SB_TS_NAME "ovn-nb-cfg-sb-ts" #define OVS_STARTUP_TS_NAME "ovn-startup-ts" struct br_int_remote { @@ -825,28 +826,62 @@ struct ed_type_ct_zones { }; -static uint64_t +/* Snapshot of SB_Global.nb_cfg and its paired nb_cfg_timestamp, always read + * from the same IDL transaction so the two values are consistent. */ +struct nb_cfg_snap { + uint64_t nb_cfg; + uint64_t ts; +}; + +/* Returns a snapshot of the current SB_Global nb_cfg and nb_cfg_timestamp. + * The pair is always read from the same SB_Global snapshot so callers can + * rely on (nb_cfg, ts) being consistent. + * + * 'ts' is the wall-clock time northd wrote nb_cfg to SB. The delta between + * that and the local completion time is the per-chassis end-to-end + * propagation latency (northd compile + SB write + relay fan-out + + * ovn-controller engine + ofctrl barrier ack). + * + * If a monitor condition change is in flight the cached snapshot from the + * previous call is returned, because updates received between the request + * and the cond ack could be from before the SB_Global value we're trying + * to read. + */ +static struct nb_cfg_snap get_nb_cfg(const struct sbrec_sb_global_table *sb_global_table, unsigned int cond_seqno, unsigned int expected_cond_seqno) { - static uint64_t nb_cfg = 0; + static struct nb_cfg_snap snap = {0, 0}; - /* Delay getting nb_cfg if there are monitor condition changes - * in flight. It might be that those changes would instruct the - * server to send updates that happened before SB_Global.nb_cfg. - */ - if (cond_seqno != expected_cond_seqno) { - return nb_cfg; + if (cond_seqno == expected_cond_seqno) { + const struct sbrec_sb_global *sb + = sbrec_sb_global_table_first(sb_global_table); + snap.nb_cfg = sb ? sb->nb_cfg : 0; + snap.ts = sb ? (uint64_t) sb->nb_cfg_timestamp : 0; } - const struct sbrec_sb_global *sb - = sbrec_sb_global_table_first(sb_global_table); - nb_cfg = sb ? sb->nb_cfg : 0; - return nb_cfg; + return snap; } /* Propagates the local cfg seqno, 'cur_cfg', to the chassis_private record * and to the local OVS DB. + * + * The br-int external_ids triplet (ovn-nb-cfg, ovn-nb-cfg-ts, + * ovn-nb-cfg-sb-ts) is stamped unconditionally, independent of + * 'enable_ch_nb_cfg_update'. The SB chassis_private writeback remains + * gated by 'enable_ch_nb_cfg_update' for deployments that want to suppress + * the per-bump SB write load. An external exporter watching br-int can + * compute the per-chassis propagation delta as + * (ovn-nb-cfg-ts - ovn-nb-cfg-sb-ts) + * regardless of the writeback setting. + * + * 'ovn-nb-cfg-sb-ts' is the SB_Global.nb_cfg_timestamp that was paired + * with this cur_cfg at the moment the barrier was queued (snapshotted via + * ofctrl-seqno's req_ts). This avoids the pitfall of pairing the just- + * acked cur_cfg with whatever SB_Global timestamp happens to be current + * now -- on a fast-churning fleet SB_Global may already have advanced + * past cur_cfg by the time the barrier acks, which would under-report + * the delta. */ static void store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct ovsdb_idl_txn *ovs_txn, @@ -858,6 +893,7 @@ store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct ovsdb_idl_txn *ovs_txn, struct ofctrl_acked_seqnos *acked_nb_cfg_seqnos = ofctrl_acked_seqnos_get(ofctrl_seq_type_nb_cfg); uint64_t cur_cfg = acked_nb_cfg_seqnos->last_acked; + uint64_t cur_cfg_sb_ts = acked_nb_cfg_seqnos->last_acked_req_ts; int64_t startup_ts = daemon_startup_ts(); if (ovs_txn && br_int @@ -894,6 +930,16 @@ store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct ovsdb_idl_txn *ovs_txn, cur_cfg_str); ovsrec_bridge_update_external_ids_setkey(br_int, OVS_NB_CFG_TS_NAME, cur_cfg_ts_str); + if (cur_cfg_sb_ts) { + char *sb_ts_str = xasprintf("%"PRIu64, cur_cfg_sb_ts); + ovsrec_bridge_update_external_ids_setkey(br_int, + OVS_NB_CFG_SB_TS_NAME, + sb_ts_str); + free(sb_ts_str); + } else { + ovsrec_bridge_update_external_ids_delkey(br_int, + OVS_NB_CFG_SB_TS_NAME); + } free(cur_cfg_ts_str); free(cur_cfg_str); } @@ -8199,12 +8245,18 @@ main(int argc, char *argv[]) chassis, mac_cache_data); } - ofctrl_seqno_update_create( - ofctrl_seq_type_nb_cfg, - get_nb_cfg(sbrec_sb_global_table_get( - ovnsb_idl_loop.idl), - ovnsb_cond_seqno, - ovnsb_expected_cond_seqno)); + /* Snapshot (nb_cfg, sb_ts) atomically from SB_Global + * and pair them through the barrier ack so the + * eventual completion can be attributed to the + * timestamp that corresponded to this exact nb_cfg + * generation -- not whatever SB_Global value has + * moved on to by the time the barrier acks. */ + struct nb_cfg_snap snap = get_nb_cfg( + sbrec_sb_global_table_get(ovnsb_idl_loop.idl), + ovnsb_cond_seqno, ovnsb_expected_cond_seqno); + ofctrl_stamped_seqno_update_create(ofctrl_seq_type_nb_cfg, + snap.nb_cfg, + snap.ts); struct local_binding_data *binding_data = runtime_data ? &runtime_data->lbinding_data : NULL; diff --git a/lib/ofctrl-seqno.c b/lib/ofctrl-seqno.c index 83c17c0e52..48ed158590 100644 --- a/lib/ofctrl-seqno.c +++ b/lib/ofctrl-seqno.c @@ -36,6 +36,11 @@ struct ofctrl_seqno_update { * application. */ uint64_t req_cfg; /* Application specific seqno. */ + uint64_t req_ts; /* Opaque per-request timestamp captured by + * the caller at the moment the update was + * queued. Carried through to the acked + * state so consumers can pair the acked + * seqno with the input that produced it. */ }; /* List of in flight sequence number updates. */ @@ -51,6 +56,9 @@ struct ofctrl_seqno_state { * application consumed acked requests. */ uint64_t cur_cfg; /* Last acked application seqno. */ + uint64_t cur_cfg_req_ts; /* req_ts that was paired with cur_cfg when + * the update was queued. 0 if the caller + * didn't supply a timestamp. */ uint64_t req_cfg; /* Last requested application seqno. */ }; @@ -73,6 +81,7 @@ ofctrl_acked_seqnos_get(size_t seqno_type) struct ofctrl_acked_seqnos *acked_seqnos = xmalloc(sizeof *acked_seqnos); acked_seqnos->acked = vector_clone(&state->acked_cfgs); acked_seqnos->last_acked = state->cur_cfg; + acked_seqnos->last_acked_req_ts = state->cur_cfg_req_ts; vector_clear(&state->acked_cfgs); if (vector_capacity(&state->acked_cfgs) >= VECTOR_THRESHOLD) { @@ -140,6 +149,7 @@ ofctrl_seqno_add_type(void) struct ofctrl_seqno_state state = (struct ofctrl_seqno_state) { .acked_cfgs = VECTOR_EMPTY_INITIALIZER(uint64_t), .cur_cfg = 0, + .cur_cfg_req_ts = 0, .req_cfg = 0, }; vector_push(&ofctrl_seqno_states, &state); @@ -152,6 +162,19 @@ ofctrl_seqno_add_type(void) */ void ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg) +{ + ofctrl_stamped_seqno_update_create(seqno_type, new_cfg, 0); +} + +/* Like ofctrl_seqno_update_create() but also records 'req_ts', an opaque + * timestamp captured by the caller (e.g. SB_Global nb_cfg_timestamp at the + * moment 'new_cfg' was read). The timestamp is carried unchanged to + * ofctrl_acked_seqnos_get() so consumers can pair the eventual ack with the + * input state that produced it. + */ +void +ofctrl_stamped_seqno_update_create(size_t seqno_type, uint64_t new_cfg, + uint64_t req_ts) { struct ofctrl_seqno_state *state = ofctrl_seqno_state_get(seqno_type); @@ -169,6 +192,7 @@ ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg) .seqno_type = seqno_type, .flow_cfg = ofctrl_req_seqno, .req_cfg = new_cfg, + .req_ts = req_ts, }; vector_push(&ofctrl_seqno_updates, &update); } @@ -190,6 +214,7 @@ ofctrl_seqno_run(uint64_t flow_cfg) struct ofctrl_seqno_state *state = ofctrl_seqno_state_get(update->seqno_type); state->cur_cfg = update->req_cfg; + state->cur_cfg_req_ts = update->req_ts; vector_push(&state->acked_cfgs, &update->req_cfg); index++; diff --git a/lib/ofctrl-seqno.h b/lib/ofctrl-seqno.h index faa97cc535..06db4e67c3 100644 --- a/lib/ofctrl-seqno.h +++ b/lib/ofctrl-seqno.h @@ -23,10 +23,18 @@ /* Collection of acked ofctrl_seqno_update requests and the most recent * 'last_acked' value. + * + * 'last_acked_req_ts' carries the opaque timestamp that was associated with + * 'last_acked' at the time the seqno was requested. Consumers that don't + * pass a timestamp at create time will see 0 here. The timestamp is + * preserved across the barrier so that applications can pair the acked + * config seqno with the input state that produced it (e.g. SB_Global + * nb_cfg_timestamp at the moment we asked OVS to barrier). */ struct ofctrl_acked_seqnos { struct vector acked; uint64_t last_acked; + uint64_t last_acked_req_ts; }; struct ofctrl_acked_seqnos *ofctrl_acked_seqnos_get(size_t seqno_type); @@ -36,6 +44,8 @@ bool ofctrl_acked_seqnos_contains(const struct ofctrl_acked_seqnos *seqnos, size_t ofctrl_seqno_add_type(void); void ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg); +void ofctrl_stamped_seqno_update_create(size_t seqno_type, uint64_t new_cfg, + uint64_t req_ts); void ofctrl_seqno_run(uint64_t flow_cfg); uint64_t ofctrl_seqno_get_req_cfg(void); void ofctrl_seqno_flush(void); diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c index 7a64a6ef27..ffd3d00fb4 100644 --- a/northd/ovn-northd.c +++ b/northd/ovn-northd.c @@ -541,6 +541,7 @@ update_sequence_numbers(int64_t loop_start_time, * Also set up to update sb_cfg once our southbound transaction commits. */ if (nb->nb_cfg != sb->nb_cfg) { sbrec_sb_global_set_nb_cfg(sb, nb->nb_cfg); + sbrec_sb_global_set_nb_cfg_timestamp(sb, loop_start_time); nbrec_nb_global_set_nb_cfg_timestamp(nb, loop_start_time); } sb_loop->next_cfg = nb->nb_cfg; @@ -943,6 +944,8 @@ main(int argc, char *argv[]) /* Disable alerting for pure write-only columns. */ ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg); + ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, + &sbrec_sb_global_col_nb_cfg_timestamp); ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name); ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses); for (size_t i = 0; i < SBREC_LOGICAL_FLOW_N_COLUMNS; i++) { diff --git a/ovn-sb.ovsschema b/ovn-sb.ovsschema index d9a91739cc..f3766f01c5 100644 --- a/ovn-sb.ovsschema +++ b/ovn-sb.ovsschema @@ -1,11 +1,12 @@ { "name": "OVN_Southbound", - "version": "21.8.0", - "cksum": "614397313 36713", + "version": "21.9.0", + "cksum": "2351636143 36779", "tables": { "SB_Global": { "columns": { "nb_cfg": {"type": {"key": "integer"}}, + "nb_cfg_timestamp": {"type": {"key": "integer"}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}, diff --git a/ovn-sb.xml b/ovn-sb.xml index e45b63d73f..c1883db2e3 100644 --- a/ovn-sb.xml +++ b/ovn-sb.xml @@ -156,6 +156,15 @@ the southbound database to bring it up to date with these changes, it updates this column to the same value. + + + The time at which ovn-northd last wrote + to the southbound database, in milliseconds + since the Unix epoch. Set atomically with each update to + . Hypervisors read this value to measure + end-to-end propagation latency from northbound commit to local + datapath programming completion. + diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at index 47f6adbd73..225709a76e 100644 --- a/tests/ovn-controller.at +++ b/tests/ovn-controller.at @@ -608,6 +608,112 @@ OVN_CLEANUP([hv1]) AT_CLEANUP ]) +# Verify that SB_Global.nb_cfg_timestamp is written by northd and propagated +# by ovn-controller to br-int external_ids as ovn-nb-cfg-sb-ts. The SB +# timestamp must be <= the local completion timestamp (ovn-nb-cfg-ts) because +# northd writes it before any hypervisor has processed the generation, so the +# local processing always adds latency on top. +OVN_FOR_EACH_NORTHD([ +AT_SETUP([nb_cfg_timestamp propagation to br-int (writeback enabled)]) +AT_KEYWORDS([ovn nb_cfg_timestamp]) +ovn_start + +net_add n1 +sim_add hv1 +as hv1 +ovs-vsctl add-br br-phys +ovn_attach n1 br-phys 192.168.0.1 + +wait_row_count Chassis 1 + +# Trigger a northd compile cycle and wait for hv1 to ack it. +check ovn-nbctl --wait=hv sync + +as hv1 + +# ovn-nb-cfg-sb-ts must be present and non-zero. +OVS_WAIT_UNTIL([ + sb_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-sb-ts 2>/dev/null | xargs) + test -n "$sb_ts" && test "$sb_ts" != "0" && test "$sb_ts" -gt 0 +]) + +# Read the two timestamps. +nb_cfg_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-ts | xargs) +sb_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-sb-ts | xargs) + +# SB_Global timestamp must be <= local completion time: the HV adds latency. +AT_CHECK([test "$sb_ts" -le "$nb_cfg_ts"]) + +# SB_Global.nb_cfg_timestamp must also be non-zero in the southbound DB itself. +sb_global_ts=$(fetch_column sb_global nb_cfg_timestamp) +AT_CHECK([test "$sb_global_ts" -gt 0]) + +OVN_CLEANUP([hv1]) +AT_CLEANUP +]) + +# Same as above but with enable_chassis_nb_cfg_update=false. The br-int +# external_ids path (ovn-nb-cfg + ovn-nb-cfg-sb-ts) is unconditional and must +# still advance even when the per-chassis SB write-back is suppressed. +# chassis_private.nb_cfg must remain stale while the option is false. +OVN_FOR_EACH_NORTHD([ +AT_SETUP([nb_cfg_timestamp propagation to br-int (writeback disabled)]) +AT_KEYWORDS([ovn nb_cfg_timestamp]) +ovn_start + +net_add n1 +sim_add hv1 +as hv1 +ovs-vsctl add-br br-phys +ovn_attach n1 br-phys 192.168.0.1 + +wait_row_count Chassis 1 + +# Establish a baseline with writeback on. Capture the acked nb_cfg so we +# can later assert it hasn't advanced (don't assume any specific value). +check ovn-nbctl --wait=hv sync +baseline_nb_cfg=$(fetch_column chassis_priv nb_cfg name=hv1) + +# Disable chassis nb_cfg write-back then bump nb_cfg. +# enable_chassis_nb_cfg_update is propagated NB->SB atomically with nb_cfg +# by northd, so ovn-controller sees both changes together. +check ovn-nbctl set nb_global . options:enable_chassis_nb_cfg_update=false +check ovn-nbctl set NB_Global . nb_cfg=100 + +as hv1 + +# br-int must advance to nb_cfg=100 even with writeback disabled. +OVS_WAIT_UNTIL([ + ovs_nb_cfg=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg | xargs) + test "$ovs_nb_cfg" = "100" +]) + +# ovn-nb-cfg-sb-ts must be present and non-zero. +OVS_WAIT_UNTIL([ + sb_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-sb-ts 2>/dev/null | xargs) + test -n "$sb_ts" && test "$sb_ts" -gt 0 +]) + +# chassis_private.nb_cfg must NOT have advanced past the baseline +# (writeback is off so ovn-controller must not write back to SB). +check_column "$baseline_nb_cfg" chassis_priv nb_cfg name=hv1 + +# SB_Global.nb_cfg_timestamp must reflect the latest northd cycle. +sb_global_ts=$(fetch_column sb_global nb_cfg_timestamp) +AT_CHECK([test "$sb_global_ts" -gt 0]) + +# The SB timestamp must still be <= the local completion time. +nb_cfg_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-ts | xargs) +sb_ts=$(ovs-vsctl get Bridge br-int external_ids:ovn-nb-cfg-sb-ts | xargs) +AT_CHECK([test "$sb_ts" -le "$nb_cfg_ts"]) + +# Re-enable writeback so OVN_CLEANUP's --wait=hv sync can complete. +check ovn-nbctl set nb_global . options:enable_chassis_nb_cfg_update=true + +OVN_CLEANUP([hv1]) +AT_CLEANUP +]) + OVN_FOR_EACH_NORTHD([ AT_SETUP([features]) AT_KEYWORDS([features])