diff --git a/api/gr_api.h b/api/gr_api.h index a238d168c..0713d7bf9 100644 --- a/api/gr_api.h +++ b/api/gr_api.h @@ -11,7 +11,7 @@ #include // Must be bumped when making non-backward compatible changes in API headers -#define GR_API_VERSION 2 +#define GR_API_VERSION 3 // API request header. struct gr_api_request { diff --git a/frr/if_grout.c b/frr/if_grout.c index cf370044a..d48e975ba 100644 --- a/frr/if_grout.c +++ b/frr/if_grout.c @@ -3,6 +3,7 @@ #include "if_grout.h" #include "if_map.h" +#include "l3vni_map.h" #include "log_grout.h" #include "zebra_dplane_grout.h" @@ -51,6 +52,7 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { const struct gr_iface_info_vlan *gr_vlan = NULL; const struct gr_iface_info_port *gr_port = NULL; const struct gr_iface_info_bond *gr_bond = NULL; + const struct gr_iface_info_vrf *gr_vrf = NULL; ifindex_t bridge_ifindex = IFINDEX_INTERNAL; ifindex_t link_ifindex = IFINDEX_INTERNAL; ifindex_t bond_ifindex = IFINDEX_INTERNAL; @@ -86,6 +88,8 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { link_type = ZEBRA_LLT_IPIP; break; case GR_IFACE_TYPE_VRF: + gr_vrf = (const struct gr_iface_info_vrf *)&gr_if->info; + mac = &gr_vrf->mac; link_type = ZEBRA_LLT_ETHER; zif_type = ZEBRA_IF_VRF; break; @@ -151,6 +155,16 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { dplane_ctx_set_ifp_table_id( ctx, vrf_grout_to_frr(gr_if->base.vrf_id) ); + + // For VXLAN in VRF mode, present it as a bridge slave + // of the VRF interface. FRR requires an SVI (derived + // from the bridge master) to bring the L3VNI up and + // compute the Router MAC for EVPN type-5 routes. 
+ if (zif_type == ZEBRA_IF_VXLAN) { + bridge_ifindex = ifindex_grout_to_frr(gr_if->base.vrf_id); + slave_type = ZEBRA_IF_SLAVE_BRIDGE; + l3vni_set(gr_if->base.vrf_id, gr_if->id); + } break; case GR_IFACE_MODE_BOND: bond_ifindex = ifindex_grout_to_frr(gr_if->domain_id); @@ -201,6 +215,8 @@ void grout_link_change(struct gr_iface *gr_if, bool new, bool startup) { } else { dplane_ctx_set_op(ctx, DPLANE_OP_INTF_DELETE); dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_QUEUED); + if (gr_vxlan != NULL && gr_if->mode == GR_IFACE_MODE_VRF) + l3vni_del(gr_if->base.vrf_id); remove_mapping_by_grout_ifindex(gr_if->id); } diff --git a/frr/l3vni_map.c b/frr/l3vni_map.c new file mode 100644 index 000000000..bf427e34a --- /dev/null +++ b/frr/l3vni_map.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2026 Robin Jarry + +#include "if_map.h" +#include "l3vni_map.h" + +#include + +#include +#include + +// All functions in this file run exclusively on the dplane thread +// (grout_link_change, grout_add_nexthop, grout_neigh_update_ctx). +// No locking required. 
+ +// VRF -> VXLAN iface mapping /////////////////////////////////////////////////// + +PREDECL_HASH(l3vni_hash); + +struct l3vni_entry { + struct l3vni_hash_item item; + uint16_t vrf_id; + uint16_t vxlan_iface_id; +}; + +static int l3vni_cmp(const struct l3vni_entry *a, const struct l3vni_entry *b) { + return numcmp(a->vrf_id, b->vrf_id); +} + +static uint32_t l3vni_hashfn(const struct l3vni_entry *e) { + return e->vrf_id; +} + +DECLARE_HASH(l3vni_hash, struct l3vni_entry, item, l3vni_cmp, l3vni_hashfn); +static struct l3vni_hash_head l3vni_entries = INIT_HASH(l3vni_entries); + +void l3vni_set(uint16_t vrf_id, uint16_t vxlan_iface_id) { + struct l3vni_entry *e, key = {.vrf_id = vrf_id}; + + e = l3vni_hash_find(&l3vni_entries, &key); + if (e != NULL) { + e->vxlan_iface_id = vxlan_iface_id; + return; + } + e = XCALLOC(MTYPE_GROUT_MEM, sizeof(*e)); + e->vrf_id = vrf_id; + e->vxlan_iface_id = vxlan_iface_id; + l3vni_hash_add(&l3vni_entries, e); +} + +void l3vni_del(uint16_t vrf_id) { + struct l3vni_entry key = {.vrf_id = vrf_id}; + struct l3vni_entry *e = l3vni_hash_find(&l3vni_entries, &key); + + if (e != NULL) { + l3vni_hash_del(&l3vni_entries, e); + XFREE(MTYPE_GROUT_MEM, e); + } +} + +uint16_t l3vni_get_vxlan(uint16_t vrf_id) { + struct l3vni_entry key = {.vrf_id = vrf_id}; + struct l3vni_entry *e = l3vni_hash_find(&l3vni_entries, &key); + return e ? e->vxlan_iface_id : GR_IFACE_ID_UNDEF; +} + +// (VRF, VTEP) -> RMAC cache /////////////////////////////////////////////////// + +PREDECL_HASH(rmac_hash); + +struct rmac_entry { + struct rmac_hash_item item; + uint16_t vrf_id; + ip4_addr_t vtep; + struct ethaddr mac; +}; + +static int rmac_cmp(const struct rmac_entry *a, const struct rmac_entry *b) { + int r = numcmp(a->vrf_id, b->vrf_id); + return r ? 
r : numcmp(a->vtep, b->vtep); +} + +static uint32_t rmac_hashfn(const struct rmac_entry *e) { + return jhash_2words(e->vrf_id, e->vtep, 0); +} + +DECLARE_HASH(rmac_hash, struct rmac_entry, item, rmac_cmp, rmac_hashfn); +static struct rmac_hash_head rmac_entries = INIT_HASH(rmac_entries); + +void l3vni_rmac_set(uint16_t vrf_id, ip4_addr_t vtep, const struct ethaddr *mac) { + struct rmac_entry *e, key = {.vrf_id = vrf_id, .vtep = vtep}; + + e = rmac_hash_find(&rmac_entries, &key); + if (e != NULL) { + e->mac = *mac; + return; + } + e = XCALLOC(MTYPE_GROUT_MEM, sizeof(*e)); + e->vrf_id = vrf_id; + e->vtep = vtep; + e->mac = *mac; + rmac_hash_add(&rmac_entries, e); +} + +void l3vni_rmac_del(uint16_t vrf_id, ip4_addr_t vtep) { + struct rmac_entry key = {.vrf_id = vrf_id, .vtep = vtep}; + struct rmac_entry *e = rmac_hash_find(&rmac_entries, &key); + + if (e != NULL) { + rmac_hash_del(&rmac_entries, e); + XFREE(MTYPE_GROUT_MEM, e); + } +} + +const struct ethaddr *l3vni_rmac_get(uint16_t vrf_id, ip4_addr_t vtep) { + struct rmac_entry key = {.vrf_id = vrf_id, .vtep = vtep}; + struct rmac_entry *e = rmac_hash_find(&rmac_entries, &key); + return e ? &e->mac : NULL; +} diff --git a/frr/l3vni_map.h b/frr/l3vni_map.h new file mode 100644 index 000000000..a005dfbe5 --- /dev/null +++ b/frr/l3vni_map.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2026 Robin Jarry + +// L3VNI dplane-thread state for EVPN symmetric IRB (Integrated Routing and +// Bridging). +// +// FRR's EVPN type-5 (IP prefix) routes use a per-VRF L3 VNI with a VXLAN +// interface. Two mappings are maintained on the dplane thread (no locking): +// +// VRF -> VXLAN iface +// +// grout_add_nexthop() redirects nexthops from the VRF (FRR's SVI model) to +// the VXLAN interface so that ip_output routes packets into the tunnel. +// +// (VRF, VTEP) -> RMAC +// +// DPLANE_OP_NEIGH_INSTALL delivers the remote router MAC before +// DPLANE_OP_NH_INSTALL creates the nexthop. 
The RMAC is cached here and +// applied by grout_add_nexthop() when the nexthop arrives. + +#pragma once + +#include "lib/prefix.h" + +#include + +#include + +// Register vrf_id -> vxlan_iface_id mapping. +void l3vni_set(uint16_t vrf_id, uint16_t vxlan_iface_id); + +// Remove mapping for vrf_id. +void l3vni_del(uint16_t vrf_id); + +// Return vxlan iface id for vrf_id, or GR_IFACE_ID_UNDEF. +uint16_t l3vni_get_vxlan(uint16_t vrf_id); + +// Cache remote VTEP router MAC for (vrf_id, vtep). +void l3vni_rmac_set(uint16_t vrf_id, ip4_addr_t vtep, const struct ethaddr *mac); + +// Remove cached RMAC for (vrf_id, vtep). +void l3vni_rmac_del(uint16_t vrf_id, ip4_addr_t vtep); + +// Look up cached RMAC for (vrf_id, vtep), or NULL. +const struct ethaddr *l3vni_rmac_get(uint16_t vrf_id, ip4_addr_t vtep); diff --git a/frr/meson.build b/frr/meson.build index 1e95e04b1..42ca34f30 100644 --- a/frr/meson.build +++ b/frr/meson.build @@ -32,6 +32,7 @@ frr_plugin = shared_module( files( 'if_grout.c', 'if_map.c', + 'l3vni_map.c', 'rt_grout.c', 'zebra_dplane_grout.c', ) + grout_header, diff --git a/frr/rt_grout.c b/frr/rt_grout.c index 431667cf9..fbe149d5e 100644 --- a/frr/rt_grout.c +++ b/frr/rt_grout.c @@ -2,6 +2,7 @@ // Copyright (c) 2025 Maxime Leroy, Free Mobile #include "if_map.h" +#include "l3vni_map.h" #include "log_grout.h" #include "rt_grout.h" @@ -658,7 +659,9 @@ grout_add_nexthop(uint32_t nh_id, gr_nh_origin_t origin, const struct nexthop *n struct gr_nexthop_info_srv6 *sr6; struct gr_nh_add_req *req = NULL; struct gr_nexthop_info_l3 *l3; + const struct ethaddr *rmac; size_t len = sizeof(*req); + uint16_t vxlan_iface_id; gr_nh_type_t type; switch (nh->type) { @@ -706,12 +709,25 @@ grout_add_nexthop(uint32_t nh_id, gr_nh_origin_t origin, const struct nexthop *n switch (type) { case GR_NH_T_L3: + // For L3 nexthops in VRFs with an L3VNI, redirect the iface from + // the VRF (SVI in FRR's model) to the VXLAN interface. 
Grout + // routes packets directly through the VXLAN tunnel. + vxlan_iface_id = l3vni_get_vxlan(req->nh.vrf_id); + if (vxlan_iface_id != GR_IFACE_ID_UNDEF) + req->nh.iface_id = vxlan_iface_id; + switch (nh->type) { case NEXTHOP_TYPE_IPV4: case NEXTHOP_TYPE_IPV4_IFINDEX: l3 = (struct gr_nexthop_info_l3 *)req->nh.info; l3->af = GR_AF_IP4; memcpy(&l3->ipv4, &nh->gate.ipv4, sizeof(l3->ipv4)); + // Apply cached RMAC from EVPN NEIGH install if available. + rmac = l3vni_rmac_get(req->nh.vrf_id, l3->ipv4); + if (rmac != NULL) { + memcpy(&l3->mac, rmac, sizeof(l3->mac)); + l3->flags |= GR_NH_F_REMOTE; + } break; case NEXTHOP_TYPE_IPV6: case NEXTHOP_TYPE_IPV6_IFINDEX: @@ -1012,6 +1028,32 @@ enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx) { return ret == 0 ? ZEBRA_DPLANE_REQUEST_SUCCESS : ZEBRA_DPLANE_REQUEST_FAILURE; } +enum zebra_dplane_result grout_neigh_update_ctx(struct zebra_dplane_ctx *ctx) { + const struct ipaddr *addr = dplane_ctx_neigh_get_ipaddr(ctx); + bool add = dplane_ctx_get_op(ctx) != DPLANE_OP_NEIGH_DELETE; + uint16_t vrf_id = vrf_frr_to_grout(dplane_ctx_get_vrf(ctx)); + + if (addr->ipa_type != IPADDR_V4) { + gr_log_debug("only IPv4 VTEP addresses supported, skip"); + return ZEBRA_DPLANE_REQUEST_SUCCESS; + } + + // Cache the RMAC for later use by grout_add_nexthop. We cannot + // create a separate nexthop here because grout's L3 nexthop hash + // keys on (vrf, addr) without iface_id, so it would collide with + // the route nexthop that FRR installs right after. 
+ if (add) { + const struct ethaddr *mac = dplane_ctx_neigh_get_mac(ctx); + gr_log_debug("cache rmac vrf=%u %pIA %pEA", vrf_id, addr, mac); + l3vni_rmac_set(vrf_id, addr->ipaddr_v4.s_addr, mac); + } else { + gr_log_debug("uncache rmac vrf=%u %pIA", vrf_id, addr); + l3vni_rmac_del(vrf_id, addr->ipaddr_v4.s_addr); + } + + return ZEBRA_DPLANE_REQUEST_SUCCESS; +} + enum zebra_dplane_result grout_vxlan_flood_update_ctx(struct zebra_dplane_ctx *ctx) { const struct ipaddr *addr = dplane_ctx_neigh_get_ipaddr(ctx); bool add = dplane_ctx_get_op(ctx) == DPLANE_OP_VTEP_ADD; diff --git a/frr/rt_grout.h b/frr/rt_grout.h index 4f9a84652..57ca525d3 100644 --- a/frr/rt_grout.h +++ b/frr/rt_grout.h @@ -18,4 +18,5 @@ void grout_nexthop_change(bool new, struct gr_nexthop *gr_nh, bool startup); void grout_macfdb_change(const struct gr_fdb_entry *fdb, bool new); enum zebra_dplane_result grout_macfdb_update_ctx(struct zebra_dplane_ctx *ctx); +enum zebra_dplane_result grout_neigh_update_ctx(struct zebra_dplane_ctx *ctx); enum zebra_dplane_result grout_vxlan_flood_update_ctx(struct zebra_dplane_ctx *ctx); diff --git a/frr/zebra_dplane_grout.c b/frr/zebra_dplane_grout.c index 65b2a7464..c4e71179d 100644 --- a/frr/zebra_dplane_grout.c +++ b/frr/zebra_dplane_grout.c @@ -949,6 +949,11 @@ static enum zebra_dplane_result zd_grout_process_update(struct zebra_dplane_ctx case DPLANE_OP_MAC_DELETE: return grout_macfdb_update_ctx(ctx); + case DPLANE_OP_NEIGH_INSTALL: + case DPLANE_OP_NEIGH_UPDATE: + case DPLANE_OP_NEIGH_DELETE: + return grout_neigh_update_ctx(ctx); + case DPLANE_OP_VTEP_ADD: case DPLANE_OP_VTEP_DELETE: return grout_vxlan_flood_update_ctx(ctx); diff --git a/modules/infra/api/gr_infra.h b/modules/infra/api/gr_infra.h index f0895d65a..af3b637c8 100644 --- a/modules/infra/api/gr_infra.h +++ b/modules/infra/api/gr_infra.h @@ -127,6 +127,7 @@ struct gr_iface_info_port { // VRF reconfiguration attribute flags. 
#define GR_VRF_SET_FIB GR_BIT64(32) +#define GR_VRF_SET_MAC GR_BIT64(33) // Per-AF FIB configuration. struct gr_iface_info_vrf_fib { @@ -138,6 +139,7 @@ struct gr_iface_info_vrf_fib { struct gr_iface_info_vrf { struct gr_iface_info_vrf_fib ipv4; struct gr_iface_info_vrf_fib ipv6; + struct rte_ether_addr mac; // Used as Router MAC for EVPN L3VNI. }; // VLAN reconfiguration attribute flags. diff --git a/modules/infra/api/gr_nexthop.h b/modules/infra/api/gr_nexthop.h index cc8c09e32..cbc4e7939 100644 --- a/modules/infra/api/gr_nexthop.h +++ b/modules/infra/api/gr_nexthop.h @@ -25,6 +25,7 @@ typedef enum : uint8_t { GR_NH_F_GATEWAY = GR_BIT8(2), // Gateway route. GR_NH_F_LINK = GR_BIT8(3), // Connected link route. GR_NH_F_MCAST = GR_BIT8(4), // Multicast address. + GR_NH_F_REMOTE = GR_BIT8(5), // Remote VTEP nexthop (EVPN). } gr_nh_flags_t; // Nexthop types for different forwarding behaviors. @@ -176,6 +177,8 @@ static inline const char *gr_nh_flag_name(const gr_nh_flags_t flag) { return "link"; case GR_NH_F_MCAST: return "multicast"; + case GR_NH_F_REMOTE: + return "remote"; } return "?"; } diff --git a/modules/infra/cli/nexthop.c b/modules/infra/cli/nexthop.c index 484fc8fcc..162595bb4 100644 --- a/modules/infra/cli/nexthop.c +++ b/modules/infra/cli/nexthop.c @@ -379,6 +379,8 @@ static cmd_status_t nh_l3_add(struct gr_api_client *c, const struct ec_pnode *p) goto out; if (arg_eth_addr(p, "MAC", &l3->mac) < 0 && errno != ENOENT) goto out; + if (arg_str(p, "remote")) + l3->flags |= GR_NH_F_REMOTE; if (gr_api_client_send_recv(c, GR_NH_ADD, len, req, NULL) < 0) goto out; @@ -619,13 +621,14 @@ static int ctx_init(struct ec_node *root) { ret = CLI_COMMAND( NEXTHOP_ADD_CTX(root), - "l3 iface IFACE [(id ID),(address IP),(mac MAC)]", + "l3 iface IFACE [(id ID),(address IP),(mac MAC),(remote)]", nh_l3_add, "Add a new L3 nexthop.", with_help("IPv4/6 address.", ec_node_re("IP", IP_ANY_RE)), with_help("Ethernet address.", ec_node_re("MAC", ETH_ADDR_RE)), with_help("Nexthop ID.", 
ec_node_uint("ID", 1, UINT32_MAX - 1, 10)), - with_help("Output interface.", ec_node_dyn("IFACE", complete_iface_names, NULL)) + with_help("Output interface.", ec_node_dyn("IFACE", complete_iface_names, NULL)), + with_help("Mark as remote (EVPN).", ec_node_str("remote", "remote")) ); if (ret < 0) return ret; diff --git a/modules/infra/cli/vrf.c b/modules/infra/cli/vrf.c index ea7cf0876..a3a854016 100644 --- a/modules/infra/cli/vrf.c +++ b/modules/infra/cli/vrf.c @@ -10,17 +10,19 @@ #define VRF_ATTRS_CMD \ "(rib4-routes RIB4_ROUTES),(fib4-tbl8 FIB4_TBL8)" \ ",(rib6-routes RIB6_ROUTES),(fib6-tbl8 FIB6_TBL8)" \ - ",(description DESCR)" + ",(mac MAC),(description DESCR)" #define VRF_ATTRS_ARGS \ with_help("Max IPv4 routes.", ec_node_uint("RIB4_ROUTES", 1, UINT32_MAX, 10)), \ with_help("IPv4 TBL8 groups.", ec_node_uint("FIB4_TBL8", 1, UINT32_MAX, 10)), \ with_help("Max IPv6 routes.", ec_node_uint("RIB6_ROUTES", 1, UINT32_MAX, 10)), \ with_help("IPv6 TBL8 groups.", ec_node_uint("FIB6_TBL8", 1, UINT32_MAX, 10)), \ + with_help("Set the ethernet address.", ec_node_re("MAC", ETH_ADDR_RE)), \ with_help("Interface description.", ec_node("any", "DESCR")) static void vrf_show(struct gr_api_client *, const struct gr_iface *iface, struct gr_object *o) { const struct gr_iface_info_vrf *info = PAYLOAD(iface); + gr_object_field(o, "mac", 0, ETH_F, &info->mac); gr_object_field(o, "rib4_max_routes", GR_DISP_INT, "%u", info->ipv4.max_routes); gr_object_field(o, "fib4_num_tbl8", GR_DISP_INT, "%u", info->ipv4.num_tbl8); gr_object_field(o, "rib6_max_routes", GR_DISP_INT, "%u", info->ipv6.max_routes); @@ -80,6 +82,11 @@ static uint64_t parse_vrf_args( set_attrs |= GR_VRF_SET_FIB; } + if (arg_eth_addr(p, "MAC", &info->mac) == 0) + set_attrs |= GR_VRF_SET_MAC; + else if (errno != ENOENT) + return 0; + return set_attrs; } diff --git a/modules/infra/control/vrf.c b/modules/infra/control/vrf.c index 79a7cc665..df5522f61 100644 --- a/modules/infra/control/vrf.c +++ b/modules/infra/control/vrf.c 
@@ -151,6 +151,7 @@ static int netlink_vrf_add(const struct iface *iface) { strerror(errno)); return ret; } + netlink_link_set_mac(vrf->vrf_ifindex, &vrf->mac); } ret = netlink_add_route(iface->cp_id, table_id); @@ -229,6 +230,7 @@ static int iface_vrf_init(struct iface *iface, const void *api_info) { // VRF's vrf_id is its own iface_id (VRF identifier) iface->vrf_id = iface->id; vrf->ref_count = 0; + rte_eth_random_addr(vrf->mac.addr_bytes); if (iface_loopback_create(iface) < 0) return -errno; @@ -347,10 +349,24 @@ static int iface_vrf_reconfig( fib_conf->num_tbl8); } } + if (set_attrs & GR_VRF_SET_MAC && iface_set_eth_addr(iface, &info->mac) < 0) + return -errno; return 0; } +static int iface_vrf_get_eth_addr(const struct iface *iface, struct rte_ether_addr *mac) { + const struct iface_info_vrf *vrf = iface_info_vrf(iface); + *mac = vrf->mac; + return 0; +} + +static int iface_vrf_set_eth_addr(struct iface *iface, const struct rte_ether_addr *mac) { + struct iface_info_vrf *vrf = iface_info_vrf(iface); + vrf->mac = *mac; + return 0; +} + static void iface_vrf_to_api(void *info, const struct iface *iface) { const struct iface_info_vrf *vrf = iface_info_vrf(iface); struct gr_iface_info_vrf *api = info; @@ -364,6 +380,8 @@ static struct iface_type iface_type_vrf = { .priv_size = sizeof(struct iface_info_vrf), .init = iface_vrf_init, .reconfig = iface_vrf_reconfig, + .set_eth_addr = iface_vrf_set_eth_addr, + .get_eth_addr = iface_vrf_get_eth_addr, .fini = iface_vrf_fini, .to_api = iface_vrf_to_api, }; diff --git a/modules/infra/datapath/eth.h b/modules/infra/datapath/eth.h index 73da053bf..b0c28996c 100644 --- a/modules/infra/datapath/eth.h +++ b/modules/infra/datapath/eth.h @@ -5,6 +5,8 @@ #include "mbuf.h" +#include + #include #include @@ -22,6 +24,7 @@ GR_MBUF_PRIV_DATA_TYPE(eth_input_mbuf_data, { eth_domain_t domain; }) GR_MBUF_PRIV_DATA_TYPE(eth_output_mbuf_data, { struct rte_ether_addr dst; rte_be16_t ether_type; + struct l3_addr vtep; }); void 
gr_eth_input_add_type(rte_be16_t eth_type, const char *node_name); diff --git a/modules/infra/datapath/eth_output.c b/modules/infra/datapath/eth_output.c index f85b0475c..7b3c41fc8 100644 --- a/modules/infra/datapath/eth_output.c +++ b/modules/infra/datapath/eth_output.c @@ -24,6 +24,7 @@ eth_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, struct rte_ether_hdr *eth; uint16_t last_iface_id; struct rte_mbuf *mbuf; + struct l3_addr vtep; rte_edge_t edge; last_iface_id = GR_IFACE_ID_UNDEF; @@ -59,7 +60,9 @@ eth_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, t->src_addr = src_mac; t->ether_type = priv->ether_type; } + vtep = priv->vtep; iface_mbuf_data(mbuf)->vlan_id = 0; + iface_mbuf_data(mbuf)->vtep = vtep; rte_node_enqueue_x1(graph, node, edge, mbuf); } diff --git a/modules/ip/datapath/ip_output.c b/modules/ip/datapath/ip_output.c index 61a0dc14a..78214edc7 100644 --- a/modules/ip/datapath/ip_output.c +++ b/modules/ip/datapath/ip_output.c @@ -134,6 +134,12 @@ ip_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, u eth_data = eth_output_mbuf_data(mbuf); eth_data->dst = l3->mac; eth_data->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4); + if (iface->type == GR_IFACE_TYPE_VXLAN) { + eth_data->vtep.af = GR_AF_IP4; + eth_data->vtep.ipv4 = l3->ipv4; + } else { + eth_data->vtep.af = GR_AF_UNSPEC; + } sent++; next: if (gr_mbuf_is_traced(mbuf)) { diff --git a/modules/ip6/control/address.c b/modules/ip6/control/address.c index a0fe9eb5d..4e53c3630 100644 --- a/modules/ip6/control/address.c +++ b/modules/ip6/control/address.c @@ -420,6 +420,9 @@ static void ip6_iface_llocal_init(const struct iface *iface) { struct rte_ether_addr mac; unsigned i; + if (iface->type == GR_IFACE_TYPE_VRF) + return; // VRF interfaces shouldn't have a link local address + if (iface_get_eth_addr(iface, &mac) < 0) return; diff --git a/modules/ip6/datapath/ip6_output.c b/modules/ip6/datapath/ip6_output.c index 
9eb89aa6c..63137862c 100644 --- a/modules/ip6/datapath/ip6_output.c +++ b/modules/ip6/datapath/ip6_output.c @@ -116,6 +116,12 @@ ip6_output_process(struct rte_graph *graph, struct rte_node *node, void **objs, else eth_data->dst = l3->mac; eth_data->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6); + if (iface->type == GR_IFACE_TYPE_VXLAN) { + eth_data->vtep.af = GR_AF_IP6; + eth_data->vtep.ipv6 = l3->ipv6; + } else { + eth_data->vtep.af = GR_AF_UNSPEC; + } sent++; next: if (gr_mbuf_is_traced(mbuf)) { diff --git a/modules/l2/control/vxlan.c b/modules/l2/control/vxlan.c index 316122e8f..e8b28a067 100644 --- a/modules/l2/control/vxlan.c +++ b/modules/l2/control/vxlan.c @@ -45,7 +45,7 @@ struct iface *vxlan_get_iface(rte_be32_t vni, uint16_t encap_vrf_id) { static int iface_vxlan_reconfig( struct iface *iface, uint64_t set_attrs, - const struct gr_iface *, + const struct gr_iface *conf, const void *api_info ) { struct iface_info_vxlan *cur = iface_info_vxlan(iface); @@ -135,10 +135,31 @@ static int iface_vxlan_reconfig( conf_done |= GR_VXLAN_SET_LOCAL; } - if (set_attrs & GR_VXLAN_SET_MAC) { - if (iface_set_eth_addr(iface, &next->mac) < 0) + if (set_attrs & (GR_IFACE_SET_VRF | GR_VXLAN_SET_ENCAP_VRF | GR_VXLAN_SET_MAC)) { + struct iface *vrf = get_vrf_iface(cur->encap_vrf_id); + struct rte_ether_addr mac = next->mac; + + assert(vrf != NULL); + + // Some devices assume a unique RMAC per VTEP. + // When no explicit MAC is given, inherit the VTEP VRF's MAC. + if (rte_is_zero_ether_addr(&mac)) + mac = iface_info_vrf(vrf)->mac; + + if (iface_set_eth_addr(iface, &mac) < 0) goto err; + conf_done |= GR_VXLAN_SET_MAC; + + // If configured for EVPN L3VNI, also synchronize the MAC on the interface VRF. + // So it will be advertised as RMAC by FRR. 
+ vrf = NULL; + if (set_attrs & GR_IFACE_SET_VRF) + vrf = get_vrf_iface(conf->vrf_id); + else if (iface->mode == GR_IFACE_MODE_VRF) + vrf = get_vrf_iface(iface->vrf_id); + if (vrf != NULL && iface_set_eth_addr(vrf, &mac) < 0) + goto err; } // Update the datapath template from the current config. diff --git a/modules/l4/l4_input_local.c b/modules/l4/l4_input_local.c index 702bca16d..d9ebbb980 100644 --- a/modules/l4/l4_input_local.c +++ b/modules/l4/l4_input_local.c @@ -7,6 +7,7 @@ #include "l4.h" #include "log.h" +#include #include LOG_TYPE("graph"); @@ -62,13 +63,27 @@ int l4_input_unalias_port(uint8_t proto, rte_be16_t alias) { return 0; } +struct l4_trace_data { + rte_be16_t sport; + rte_be16_t dport; +}; + +static int trace_l4_format(char *buf, size_t len, const void *data, size_t /*data_len*/) { + const struct l4_trace_data *d = data; + return snprintf( + buf, len, "src=%u dst=%u", rte_be_to_cpu_16(d->sport), rte_be_to_cpu_16(d->dport) + ); +} + static uint16_t l4_input_local_process( struct rte_graph *graph, struct rte_node *node, void **objs, uint16_t nb_objs ) { - struct rte_udp_hdr *hdr; + struct rte_tcp_hdr *tcp; + struct rte_udp_hdr *udp; + rte_be16_t sport, dport; struct rte_mbuf *mbuf; rte_edge_t edge; uint8_t proto; @@ -76,6 +91,7 @@ static uint16_t l4_input_local_process( for (uint16_t i = 0; i < nb_objs; i++) { mbuf = objs[i]; edge = BAD_PROTO; + sport = dport = 0; if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) proto = ip_local_mbuf_data(mbuf)->proto; @@ -84,14 +100,28 @@ static uint16_t l4_input_local_process( else goto next; - if (proto != IPPROTO_UDP) { + switch (proto) { + case IPPROTO_UDP: + udp = rte_pktmbuf_mtod(mbuf, struct rte_udp_hdr *); + sport = udp->src_port; + dport = udp->dst_port; + edge = udp_edges[udp->dst_port]; + break; + case IPPROTO_TCP: + tcp = rte_pktmbuf_mtod(mbuf, struct rte_tcp_hdr *); + sport = tcp->src_port; + dport = tcp->dst_port; + // fallthrough + default: edge = MANAGEMENT; - goto next; + break; } - - hdr = 
rte_pktmbuf_mtod(mbuf, struct rte_udp_hdr *); - edge = udp_edges[hdr->dst_port]; next: + if (gr_mbuf_is_traced(mbuf)) { + struct l4_trace_data *t = gr_mbuf_trace_add(mbuf, node, sizeof(*t)); + t->sport = sport; + t->dport = dport; + } rte_node_enqueue_x1(graph, node, edge, mbuf); } return nb_objs; @@ -117,6 +147,7 @@ static struct gr_node_info info = { .node = &input_node, .type = GR_NODE_T_L4, .register_callback = l4_input_local_register, + .trace_format = trace_l4_format, }; GR_NODE_REGISTER(info); diff --git a/smoke/evpn_l3vpn_frr_test.sh b/smoke/evpn_l3vpn_frr_test.sh new file mode 100755 index 000000000..e8278c227 --- /dev/null +++ b/smoke/evpn_l3vpn_frr_test.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2026 Robin Jarry + +# This test verifies EVPN Type-5 (IP prefix) L3VPN connectivity using symmetric +# IRB (Integrated Routing and Bridging) over VXLAN between FRR+Grout and +# a standalone FRR+Linux peer. +# +# Each side has a VRF with an L3 VNI (1000) and a host connected to a local +# port. BGP EVPN advertises IP prefixes (type-5 routes) and RMAC entries +# (type-2 routes with GR_NH_F_REMOTE nexthops) across the VXLAN overlay. +# +# Success criteria: +# - Both sides exchange EVPN type-5 routes (IP prefixes installed). +# - Host-A and Host-B can ping each other through the L3 VXLAN overlay. +# - RMACs are installed as remote nexthops on the grout side. +# +# .-------------------------------. .-----------------------------. +# | evpn-peer | | grout | +# | | | | +# | .- - - - - - - . | | .- - - - - - - . | +# | ' vrf tenant ' | | ' vrf tenant ' | +# | ' ' | | ' ' | +# | ' +-------+ ' | | ' ' | +# | ' | br-l3 | ' | | ' ' | +# | ' +---+---+ ' | | ' ' | +# | ' | ' | | ' ' | +# | ' +----+-----+ ' | | ' +----------+ ' | +# | ' | vxlan-l3 |........... | | ..........| vxlan-l3 | ' | +# | ' +----------+ ' . | | . ' +----------+ ' | +# | ' ' . | | . ' ' | +# | ' .1 ' . | | . 
' .1 ' | +# | ' +------+ ' .1 | | .2 ' +-------+ ' | +# | ' | p1 | ' +--------+ | | +------+ ' | p1 | ' | +# | ' +--+---+ ' | x-p0 | | | | p0 | ' +---+---+ ' | +# | '- - - |- - - -' +---+----+ | | +--+---+ '- - - |- - - -' | +# '--------|---------------|------' '----|--------------|---------' +# | | | | +# | | <------- BGP ----> | | +# 16.0.0.0/24 '---------------------' 48.0.0.0/24 +# | underlay | +# .-------|-----------. 172.16.0.0/24 .----------|--------. +# | +---+----+ | | +---+----+ | +# | | x-p1 | | | | x-p1 | | +# | +--------+ | <= = = = = = = = = = = = => | +--------+ | +# | .2 | overlay L3VPN | .2 | +# | | | | +# | host-a | | host-b | +# '-------------------' '-------------------' + +. $(dirname $0)/_init_frr.sh + +# right side (grout) ----------------------------------------------------------- +create_interface p0 +set_ip_address p0 172.16.0.2/24 + +# left side (Linux peer) ------------------------------------------------------- +start_frr evpn-peer + +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.all.forwarding=1 +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.all.rp_filter=0 +ip netns exec evpn-peer sysctl -qw net.ipv4.conf.default.rp_filter=0 + +move_to_netns x-p0 evpn-peer +ip -n evpn-peer addr add 172.16.0.1/24 dev x-p0 + +# Create L3VNI VXLAN on the Linux peer with a bridge+SVI (required by Linux) +ip -n evpn-peer link add br-l3 type bridge +ip -n evpn-peer link set br-l3 up + +ip -n evpn-peer link add vxlan-l3 type vxlan id 1000 local 172.16.0.1 dstport 4789 nolearning +ip -n evpn-peer link set vxlan-l3 master br-l3 +ip -n evpn-peer link set vxlan-l3 up + +# Create VRF "tenant" on the peer and bind the L3VNI bridge as SVI +ip -n evpn-peer link add tenant type vrf table 10 +ip -n evpn-peer link set tenant up +ip -n evpn-peer link set br-l3 master tenant + +# Host-facing port in the peer VRF +ip -n evpn-peer link add p1 type veth peer name x-p1 +ip -n evpn-peer link set p1 master tenant +ip -n evpn-peer link set p1 up +ip -n evpn-peer addr add 
16.0.0.1/24 dev p1 + +netns_add host-a +ip -n evpn-peer link set x-p1 netns host-a +ip -n host-a link set x-p1 up +ip -n host-a addr add 16.0.0.2/24 dev x-p1 +ip -n host-a route add default via 16.0.0.1 + +# FRR config on the Linux peer +vtysh -N evpn-peer <<-EOF +configure terminal + +vrf tenant + vni 1000 +exit-vrf + +router bgp 65000 + bgp router-id 172.16.0.1 + no bgp default ipv4-unicast + + neighbor 172.16.0.2 remote-as 65000 + + address-family l2vpn evpn + neighbor 172.16.0.2 activate + advertise-all-vni + exit-address-family +exit + +router bgp 65000 vrf tenant + bgp router-id 172.16.0.1 + + address-family ipv4 unicast + redistribute connected + exit-address-family + + address-family l2vpn evpn + advertise ipv4 unicast + exit-address-family +exit +EOF + +# right side (grout) setup L3VPN ----------------------------------------------- +create_vrf tenant + +# L3 VNI VXLAN in VRF mode (no bridge needed in grout) +grcli interface add vxlan vxlan-l3 vni 1000 local 172.16.0.2 vrf tenant + +create_interface p1 vrf tenant +set_ip_address p1 48.0.0.1/24 + +netns_add host-b +move_to_netns x-p1 host-b +ip -n host-b addr add 48.0.0.2/24 dev x-p1 +ip -n host-b route add default via 48.0.0.1 + +mark_events + +# FRR config on grout +vtysh <<-EOF +configure terminal + +vrf tenant + vni 1000 +exit-vrf + +router bgp 65000 + bgp router-id 172.16.0.2 + no bgp default ipv4-unicast + + neighbor 172.16.0.1 remote-as 65000 + + address-family l2vpn evpn + neighbor 172.16.0.1 activate + advertise-all-vni + exit-address-family +exit + +router bgp 65000 vrf tenant + bgp router-id 172.16.0.2 + + address-family ipv4 unicast + redistribute connected + exit-address-family + + address-family l2vpn evpn + advertise ipv4 unicast + exit-address-family +exit +EOF + +# -- Check L3VNI is recognized by both sides ----------------------------------- +attempts=0 +while ! 
vtysh -c "show evpn vni 1000" | grep -qF "L3"; do + if [ "$attempts" -ge 5 ]; then + vtysh -c "show evpn vni" + fail "Grout FRR does not recognize VNI 1000 as L3VNI" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +attempts=0 +while ! vtysh -N evpn-peer -c "show evpn vni 1000" | grep -qF "L3"; do + if [ "$attempts" -ge 5 ]; then + vtysh -N evpn-peer -c "show evpn vni" + fail "Linux peer does not recognize VNI 1000 as L3VNI" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +# -- Wait for EVPN type-5 route exchange --------------------------------------- +attempts=0 +while ! vtysh -c "show bgp l2vpn evpn route type 5" | grep -qF "16.0.0.0"; do + if [ "$attempts" -ge 5 ]; then + vtysh -c "show bgp l2vpn evpn route type 5" + fail "Grout FRR did not learn type-5 route for 16.0.0.0/24" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +attempts=0 +while ! vtysh -N evpn-peer -c "show bgp l2vpn evpn route type 5" | grep -qF "48.0.0.0"; do + if [ "$attempts" -ge 5 ]; then + vtysh -c "show bgp vrf tenant ipv4 unicast" + vtysh -c "show bgp l2vpn evpn route" + vtysh -N evpn-peer -c "show bgp l2vpn evpn route type 5" + fail "Linux peer did not learn type-5 route for 48.0.0.0/24" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +# -- Wait for routes to be installed in VRF ------------------------------------ +wait_event 'route4 add: vrf=tenant 16.0.0.0/24' + +attempts=0 +while ! 
ip -n evpn-peer route show vrf tenant | grep -qF "48.0.0.0/24"; do + if [ "$attempts" -ge 5 ]; then + ip -n evpn-peer route show vrf tenant + fail "Route 48.0.0.0/24 not installed in peer VRF tenant" + fi + sleep 1 + attempts=$((attempts + 1)) +done + +# -- Check RMAC is set on the route nexthop ------------------------------------ +rmac=$(ip netns exec evpn-peer cat /sys/class/net/vxlan-l3/address) + +wait_event "nh new: type=L3 id=[0-9]+ iface=vxlan-l3 vrf=tenant origin=zebra family=ipv4 addr=172.16.0.1 mac=$rmac flags=static remote" + +vtysh -c "show bgp l2vpn evpn route type 5" +grcli route show vrf tenant +grcli nexthop show vrf tenant + +# -- Verify L3 connectivity through VXLAN overlay ------------------------------ +ip netns exec host-b ping -i0.1 -c3 -W1 16.0.0.2 +ip netns exec host-a ping -i0.1 -c3 -W1 48.0.0.2