/*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012-2024 Marko Zec * Copyright (c) 2005, 2018 University of Zagreb * Copyright (c) 2005 International Computer Science Institute * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup * structures and a trivial search procedure. More significant bits of * the search key are used to directly index a two-stage trie, while the * remaining bits are used for finding the next hop in a sorted array. * More details in: * * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per * second in software, ACM SIGCOMM Computer Communication Review, September * 2012 * * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split */ #include #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DXR_TRIE_BITS 20 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24); #if DXR_TRIE_BITS > 16 #define DXR_D 16 #else #define DXR_D (DXR_TRIE_BITS - 1) #endif #define D_TBL_SIZE (1 << DXR_D) #define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS) #define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS) #define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS) #define DESC_BASE_BITS 22 #define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS) #define BASE_MAX ((1 << DESC_BASE_BITS) - 1) #define RTBL_SIZE_INCR (BASE_MAX / 64) #if DXR_TRIE_BITS < 24 #define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1) #else #define FRAGS_MASK_SHORT 0 #endif #define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \ ~FRAGS_MASK_SHORT) #define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1) #define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2) #define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT) #define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT) #define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL) #define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1) #define CHUNK_HASH_BITS 16 #define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS) #define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1) #define TRIE_HASH_BITS 16 #define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS) #define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1) #define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16) #define UNUSED_BUCKETS 8 /* Lookup structure elements */ struct direct_entry { uint32_t fragments: DESC_FRAGMENTS_BITS, base: DESC_BASE_BITS; }; struct range_entry_long { uint32_t start: DXR_RANGE_SHIFT, nexthop: DXR_TRIE_BITS; }; #if DXR_TRIE_BITS < 24 struct range_entry_short { uint16_t start: DXR_RANGE_SHIFT - 8, nexthop: DXR_TRIE_BITS - 8; }; #endif /* Auxiliary structures */ struct heap_entry { uint32_t start; uint32_t end; uint32_t preflen; uint32_t nexthop; }; struct chunk_desc { LIST_ENTRY(chunk_desc) cd_all_le; LIST_ENTRY(chunk_desc) cd_hash_le; uint32_t cd_hash; uint32_t cd_refcnt; uint32_t cd_base; uint32_t cd_cur_size; uint32_t cd_max_size; }; struct trie_desc { LIST_ENTRY(trie_desc) td_all_le; LIST_ENTRY(trie_desc) td_hash_le; uint32_t td_hash; uint32_t td_index; uint32_t td_refcnt; }; struct dxr_aux { /* Glue to external state */ struct fib_data *fd; uint32_t fibnum; int refcnt; /* Auxiliary build-time tables */ struct direct_entry direct_tbl[DIRECT_TBL_SIZE]; uint16_t d_tbl[D_TBL_SIZE]; struct direct_entry *x_tbl; union { struct range_entry_long re; uint32_t fragments; } *range_tbl; /* Auxiliary internal state */ uint32_t updates_mask[DIRECT_TBL_SIZE / 32]; struct trie_desc *trietbl[D_TBL_SIZE]; LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE]; LIST_HEAD(, chunk_desc) all_chunks; LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS]; LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE]; LIST_HEAD(, trie_desc) all_trie; LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */ struct sockaddr_in dst; struct sockaddr_in mask; struct heap_entry heap[33]; uint32_t prefixes; uint32_t updates_low; uint32_t updates_high; uint32_t unused_chunks_size; uint32_t xtbl_size; uint32_t all_trie_cnt; uint32_t unused_trie_cnt; uint32_t trie_rebuilt_prefixes; uint32_t heap_index; uint32_t d_bits; uint32_t rtbl_size; uint32_t rtbl_top; uint32_t rtbl_work_frags; uint32_t work_chunk; }; /* Main lookup structure container */ struct dxr { /* Lookup tables */ void *d; void *x; void *r; struct nhop_object **nh_tbl; /* Glue to external state */ struct dxr_aux *aux; struct fib_data *fd; struct epoch_context epoch_ctx; uint32_t fibnum; uint16_t d_shift; uint16_t x_shift; uint32_t x_mask; }; static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM"); static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); uma_zone_t chunk_zone; uma_zone_t trie_zone; VNET_DEFINE_STATIC(int, frag_limit) = 100; #define V_frag_limit VNET(frag_limit) /* Binary search for a matching address range */ #define DXR_LOOKUP_STAGE \ if (masked_dst < range[middle].start) { \ upperbound = middle; \ middle = (middle + lowerbound) / 2; \ } else if (masked_dst < range[middle + 1].start) \ return (range[middle].nexthop); \ else { \ lowerbound = middle + 1; \ middle = (upperbound + middle + 1) / 2; \ } \ if (upperbound == lowerbound) \ return (range[lowerbound].nexthop); static int range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst) { uint32_t base; uint32_t upperbound; uint32_t middle; uint32_t lowerbound; uint32_t masked_dst; base = de.base; lowerbound = 0; masked_dst = dst & DXR_RANGE_MASK; #if DXR_TRIE_BITS < 24 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) { upperbound = de.fragments & FRAGS_MASK_SHORT; struct range_entry_short *range = (struct range_entry_short *) &rt[base]; masked_dst >>= 8; middle = upperbound; upperbound = upperbound * 2 + 1; for (;;) { DXR_LOOKUP_STAGE DXR_LOOKUP_STAGE } } #endif upperbound = de.fragments; middle = upperbound / 2; struct range_entry_long *range = &rt[base]; if (__predict_false(IS_XL_FORMAT(de.fragments))) { upperbound = *((uint32_t *) range); range++; middle = upperbound / 2; } for (;;) { DXR_LOOKUP_STAGE DXR_LOOKUP_STAGE } } #define DXR_LOOKUP_DEFINE(D) \ static int inline \ dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \ { \ struct direct_entry de; \ uint16_t *dt = dxr->d; \ struct direct_entry *xt = dxr->x; \ \ de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \ + ((dst >> DXR_RANGE_SHIFT) & \ (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \ if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \ return (de.base); \ return (range_lookup(dxr->r, de, dst)); \ } \ \ static struct nhop_object * \ dxr_fib_lookup_##D(void *algo_data, \ const struct flm_lookup_key key, uint32_t scopeid __unused) \ { \ struct dxr *dxr = algo_data; \ \ return (dxr->nh_tbl[dxr_lookup_##D(dxr, \ ntohl(key.addr4.s_addr))]); \ } #if DXR_TRIE_BITS > 16 DXR_LOOKUP_DEFINE(16) #endif DXR_LOOKUP_DEFINE(15) DXR_LOOKUP_DEFINE(14) DXR_LOOKUP_DEFINE(13) DXR_LOOKUP_DEFINE(12) DXR_LOOKUP_DEFINE(11) DXR_LOOKUP_DEFINE(10) DXR_LOOKUP_DEFINE(9) static int inline dxr_lookup(struct dxr *dxr, uint32_t dst) { struct direct_entry de; uint16_t *dt = dxr->d; struct direct_entry *xt = dxr->x; de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) + ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)]; if (__predict_true(de.fragments == FRAGS_MARK_HIT)) return (de.base); return (range_lookup(dxr->r, de, dst)); } static void initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk) { struct heap_entry *fhp = &da->heap[0]; struct rtentry *rt; struct route_nhop_data rnd; da->heap_index = 0; da->dst.sin_addr.s_addr = htonl(dst_u32); rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED, &rnd); if (rt != NULL) { struct in_addr addr; uint32_t scopeid; rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid); fhp->start = ntohl(addr.s_addr); fhp->end = fhp->start; if (fhp->preflen < 32) fhp->end |= (0xffffffffU >> fhp->preflen); fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop); } else { fhp->preflen = fhp->nexthop = fhp->start = 0; fhp->end = 0xffffffffU; } } static uint32_t chunk_size(struct dxr_aux *da, struct direct_entry *fdesc) { if (IS_SHORT_FORMAT(fdesc->fragments)) return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1); else if (IS_XL_FORMAT(fdesc->fragments)) return (da->range_tbl[fdesc->base].fragments + 2); else /* if (IS_LONG_FORMAT(fdesc->fragments)) */ return (fdesc->fragments + 1); } static uint32_t chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc) { uint32_t size = chunk_size(da, fdesc); uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base]; uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size]; uint32_t hash = fdesc->fragments; for (; p < l; p++) hash = (hash << 7) + (hash >> 13) + *p; return (hash + (hash >> 16)); } static int chunk_ref(struct dxr_aux *da, uint32_t chunk) { struct direct_entry *fdesc = &da->direct_tbl[chunk]; struct chunk_desc *cdp, *empty_cdp; uint32_t base = fdesc->base; uint32_t size = chunk_size(da, fdesc); uint32_t hash = chunk_hash(da, fdesc); int i; /* Find an existing descriptor */ LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cd_hash_le) { if (cdp->cd_hash != hash || cdp->cd_cur_size != size || memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], sizeof(struct range_entry_long) * size)) continue; da->rtbl_top = fdesc->base; fdesc->base = cdp->cd_base; cdp->cd_refcnt++; return (0); } /* No matching chunks found. Find an empty one to recycle. */ for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++) cdp = LIST_FIRST(&da->unused_chunks[i]); if (cdp == NULL) LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le) if (empty_cdp->cd_max_size >= size && (cdp == NULL || empty_cdp->cd_max_size < cdp->cd_max_size)) { cdp = empty_cdp; if (empty_cdp->cd_max_size == size) break; } if (cdp != NULL) { /* Copy from heap into the recycled chunk */ bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base], size * sizeof(struct range_entry_long)); fdesc->base = cdp->cd_base; da->rtbl_top -= size; da->unused_chunks_size -= cdp->cd_max_size; if (cdp->cd_max_size > size) { /* Split the range in two, need a new descriptor */ empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); if (empty_cdp == NULL) return (1); LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le); empty_cdp->cd_base = cdp->cd_base + size; empty_cdp->cd_cur_size = 0; empty_cdp->cd_max_size = cdp->cd_max_size - size; i = empty_cdp->cd_max_size; if (i >= UNUSED_BUCKETS) i = 0; LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp, cd_hash_le); da->unused_chunks_size += empty_cdp->cd_max_size; cdp->cd_max_size = size; } LIST_REMOVE(cdp, cd_hash_le); } else { /* Alloc a new descriptor at the top of the heap*/ cdp = uma_zalloc(chunk_zone, M_NOWAIT); if (cdp == NULL) return (1); cdp->cd_max_size = size; cdp->cd_base = fdesc->base; LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le); MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top); } cdp->cd_hash = hash; cdp->cd_refcnt = 1; cdp->cd_cur_size = size; LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp, cd_hash_le); if (da->rtbl_top >= da->rtbl_size) { if (da->rtbl_top >= BASE_MAX) { FIB_PRINTF(LOG_ERR, da->fd, "structural limit exceeded at %d " "range table elements", da->rtbl_top); return (1); } da->rtbl_size += RTBL_SIZE_INCR; i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX; FIB_PRINTF(i, da->fd, "range table at %d%% structural limit", da->rtbl_top * 100 / BASE_MAX); da->range_tbl = realloc(da->range_tbl, sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); if (da->range_tbl == NULL) { FIB_PRINTF(LOG_NOTICE, da->fd, "Unable to allocate DXR range table"); return (1); } } return (0); } static void chunk_unref(struct dxr_aux *da, uint32_t chunk) { struct direct_entry *fdesc = &da->direct_tbl[chunk]; struct chunk_desc *cdp, *cdp2; uint32_t base = fdesc->base; uint32_t size = chunk_size(da, fdesc); uint32_t hash = chunk_hash(da, fdesc); int i; /* Find the corresponding descriptor */ LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cd_hash_le) if (cdp->cd_hash == hash && cdp->cd_cur_size == size && memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base], sizeof(struct range_entry_long) * size) == 0) break; MPASS(cdp != NULL); if (--cdp->cd_refcnt > 0) return; LIST_REMOVE(cdp, cd_hash_le); da->unused_chunks_size += cdp->cd_max_size; cdp->cd_cur_size = 0; /* Attempt to merge with the preceding chunk, if empty */ cdp2 = LIST_NEXT(cdp, cd_all_le); if (cdp2 != NULL && cdp2->cd_cur_size == 0) { MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base); LIST_REMOVE(cdp, cd_all_le); LIST_REMOVE(cdp2, cd_hash_le); cdp2->cd_max_size += cdp->cd_max_size; uma_zfree(chunk_zone, cdp); cdp = cdp2; } /* Attempt to merge with the subsequent chunk, if empty */ cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le); if (cdp2 != NULL && cdp2->cd_cur_size == 0) { MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base); LIST_REMOVE(cdp, cd_all_le); LIST_REMOVE(cdp2, cd_hash_le); cdp2->cd_max_size += cdp->cd_max_size; cdp2->cd_base = cdp->cd_base; uma_zfree(chunk_zone, cdp); cdp = cdp2; } if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) { /* Free the chunk on the top of the range heap, trim the heap */ MPASS(cdp == LIST_FIRST(&da->all_chunks)); da->rtbl_top -= cdp->cd_max_size; da->unused_chunks_size -= cdp->cd_max_size; LIST_REMOVE(cdp, cd_all_le); uma_zfree(chunk_zone, cdp); return; } i = cdp->cd_max_size; if (i >= UNUSED_BUCKETS) i = 0; LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le); } static uint32_t trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index) { uint32_t i, *val; uint32_t hash = 0; for (i = 0; i < (1 << dxr_x); i++) { hash = (hash << 3) ^ (hash >> 3); val = (uint32_t *) (void *) &da->direct_tbl[(index << dxr_x) + i]; hash += (*val << 5); hash += (*val >> 5); } return (hash + (hash >> 16)); } static int trie_ref(struct dxr_aux *da, uint32_t index) { struct trie_desc *tp; uint32_t dxr_d = da->d_bits; uint32_t dxr_x = DXR_TRIE_BITS - dxr_d; uint32_t hash = trie_hash(da, dxr_x, index); /* Find an existing descriptor */ LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le) if (tp->td_hash == hash && memcmp(&da->direct_tbl[index << dxr_x], &da->x_tbl[tp->td_index << dxr_x], sizeof(*da->x_tbl) << dxr_x) == 0) { tp->td_refcnt++; da->trietbl[index] = tp; return(tp->td_index); } tp = LIST_FIRST(&da->unused_trie); if (tp != NULL) { LIST_REMOVE(tp, td_hash_le); da->unused_trie_cnt--; } else { tp = uma_zalloc(trie_zone, M_NOWAIT); if (tp == NULL) return (-1); LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le); tp->td_index = da->all_trie_cnt++; } tp->td_hash = hash; tp->td_refcnt = 1; LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp, td_hash_le); memcpy(&da->x_tbl[tp->td_index << dxr_x], &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x); da->trietbl[index] = tp; if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) { da->xtbl_size += XTBL_SIZE_INCR; da->x_tbl = realloc(da->x_tbl, sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); if (da->x_tbl == NULL) { FIB_PRINTF(LOG_NOTICE, da->fd, "Unable to allocate DXR extension table"); return (-1); } } return(tp->td_index); } static void trie_unref(struct dxr_aux *da, uint32_t index) { struct trie_desc *tp = da->trietbl[index]; if (tp == NULL) return; da->trietbl[index] = NULL; if (--tp->td_refcnt > 0) return; LIST_REMOVE(tp, td_hash_le); da->unused_trie_cnt++; if (tp->td_index != da->all_trie_cnt - 1) { LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le); return; } do { da->all_trie_cnt--; da->unused_trie_cnt--; LIST_REMOVE(tp, td_all_le); uma_zfree(trie_zone, tp); LIST_FOREACH(tp, &da->unused_trie, td_hash_le) if (tp->td_index == da->all_trie_cnt - 1) { LIST_REMOVE(tp, td_hash_le); break; } } while (tp != NULL); } static void heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen, uint32_t nh) { struct heap_entry *fhp; int i; for (i = da->heap_index; i >= 0; i--) { if (preflen > da->heap[i].preflen) break; else if (preflen < da->heap[i].preflen) da->heap[i + 1] = da->heap[i]; else return; } fhp = &da->heap[i + 1]; fhp->preflen = preflen; fhp->start = start; fhp->end = end; fhp->nexthop = nh; da->heap_index++; } static int dxr_walk(struct rtentry *rt, void *arg) { struct dxr_aux *da = arg; uint32_t chunk = da->work_chunk; uint32_t first = chunk << DXR_RANGE_SHIFT; uint32_t last = first | DXR_RANGE_MASK; struct range_entry_long *fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; struct heap_entry *fhp = &da->heap[da->heap_index]; uint32_t preflen, nh, start, end, scopeid; struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid); start = ntohl(addr.s_addr); if (start > last) return (-1); /* Beyond chunk boundaries, we are done */ if (start < first) return (0); /* Skip this route */ end = start; if (preflen < 32) end |= (0xffffffffU >> preflen); nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt)); if (start == fhp->start) heap_inject(da, start, end, preflen, nh); else { /* start > fhp->start */ while (start > fhp->end) { uint32_t oend = fhp->end; if (da->heap_index > 0) { fhp--; da->heap_index--; } else initheap(da, fhp->end + 1, chunk); if (fhp->end > oend && fhp->nexthop != fp->nexthop) { fp++; da->rtbl_work_frags++; fp->start = (oend + 1) & DXR_RANGE_MASK; fp->nexthop = fhp->nexthop; } } if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) && nh != fp->nexthop) { fp++; da->rtbl_work_frags++; fp->start = start & DXR_RANGE_MASK; } else if (da->rtbl_work_frags) { if ((--fp)->nexthop == nh) da->rtbl_work_frags--; else fp++; } fp->nexthop = nh; heap_inject(da, start, end, preflen, nh); } return (0); } static int update_chunk(struct dxr_aux *da, uint32_t chunk) { struct range_entry_long *fp; #if DXR_TRIE_BITS < 24 struct range_entry_short *fps; uint32_t start, nh, i; #endif struct heap_entry *fhp; uint32_t first = chunk << DXR_RANGE_SHIFT; uint32_t last = first | DXR_RANGE_MASK; if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT) chunk_unref(da, chunk); initheap(da, first, chunk); fp = &da->range_tbl[da->rtbl_top].re; da->rtbl_work_frags = 0; fp->start = first & DXR_RANGE_MASK; fp->nexthop = da->heap[0].nexthop; da->dst.sin_addr.s_addr = htonl(first); da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK); da->work_chunk = chunk; rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED, (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask, dxr_walk, da); /* Flush any remaining objects on the heap */ fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re; fhp = &da->heap[da->heap_index]; while (fhp->preflen > DXR_TRIE_BITS) { uint32_t oend = fhp->end; if (da->heap_index > 0) { fhp--; da->heap_index--; } else initheap(da, fhp->end + 1, chunk); if (fhp->end > oend && fhp->nexthop != fp->nexthop) { /* Have we crossed the upper chunk boundary? */ if (oend >= last) break; fp++; da->rtbl_work_frags++; fp->start = (oend + 1) & DXR_RANGE_MASK; fp->nexthop = fhp->nexthop; } } /* Direct hit if the chunk contains only a single fragment */ if (da->rtbl_work_frags == 0) { da->direct_tbl[chunk].base = fp->nexthop; da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT; return (0); } da->direct_tbl[chunk].base = da->rtbl_top; da->direct_tbl[chunk].fragments = da->rtbl_work_frags; #if DXR_TRIE_BITS < 24 /* Check whether the chunk can be more compactly encoded */ fp = &da->range_tbl[da->rtbl_top].re; for (i = 0; i <= da->rtbl_work_frags; i++, fp++) if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH) break; if (i == da->rtbl_work_frags + 1) { fp = &da->range_tbl[da->rtbl_top].re; fps = (void *) fp; for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) { start = fp->start; nh = fp->nexthop; fps->start = start >> 8; fps->nexthop = nh; } fps->start = start >> 8; fps->nexthop = nh; da->rtbl_work_frags >>= 1; da->direct_tbl[chunk].fragments = da->rtbl_work_frags | FRAGS_PREF_SHORT; } else #endif if (da->rtbl_work_frags >= FRAGS_MARK_HIT) { da->direct_tbl[chunk].fragments = FRAGS_MARK_XL; memmove(&da->range_tbl[da->rtbl_top + 1], &da->range_tbl[da->rtbl_top], (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl)); da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags; da->rtbl_work_frags++; } da->rtbl_top += (da->rtbl_work_frags + 1); return (chunk_ref(da, chunk)); } static void dxr_build(struct dxr *dxr) { struct dxr_aux *da = dxr->aux; struct chunk_desc *cdp; struct rib_rtable_info rinfo; struct timeval t0, t1, t2, t3; uint32_t r_size, dxr_tot_size; uint32_t i, m, range_rebuild = 0; uint32_t range_frag; struct trie_desc *tp; uint32_t d_tbl_size, dxr_x, d_size, x_size; uint32_t ti, trie_rebuild = 0, prev_size = 0; uint32_t trie_frag; MPASS(dxr->d == NULL); if (da == NULL) { da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT); if (da == NULL) { FIB_PRINTF(LOG_NOTICE, dxr->fd, "Unable to allocate DXR aux struct"); return; } dxr->aux = da; da->fibnum = dxr->fibnum; da->fd = dxr->fd; da->refcnt = 1; LIST_INIT(&da->all_chunks); LIST_INIT(&da->all_trie); da->rtbl_size = RTBL_SIZE_INCR; da->range_tbl = NULL; da->xtbl_size = XTBL_SIZE_INCR; da->x_tbl = NULL; bzero(&da->dst, sizeof(da->dst)); bzero(&da->mask, sizeof(da->mask)); da->dst.sin_len = sizeof(da->dst); da->mask.sin_len = sizeof(da->mask); da->dst.sin_family = AF_INET; da->mask.sin_family = AF_INET; } if (da->range_tbl == NULL) { da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); if (da->range_tbl == NULL) { FIB_PRINTF(LOG_NOTICE, da->fd, "Unable to allocate DXR range table"); return; } range_rebuild = 1; } if (da->x_tbl == NULL) { da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT); if (da->x_tbl == NULL) { FIB_PRINTF(LOG_NOTICE, da->fd, "Unable to allocate DXR extension table"); return; } trie_rebuild = 1; } microuptime(&t0); dxr->nh_tbl = fib_get_nhop_array(da->fd); fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); if (da->updates_low > da->updates_high) range_rebuild = 1; range_build: if (range_rebuild) { /* Bulk cleanup */ bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { LIST_REMOVE(cdp, cd_all_le); uma_zfree(chunk_zone, cdp); } for (i = 0; i < UNUSED_BUCKETS; i++) LIST_INIT(&da->unused_chunks[i]); da->unused_chunks_size = 0; da->rtbl_top = 0; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; memset(da->updates_mask, 0xff, sizeof(da->updates_mask)); for (i = 0; i < DIRECT_TBL_SIZE; i++) { da->direct_tbl[i].fragments = FRAGS_MARK_HIT; da->direct_tbl[i].base = 0; } } da->prefixes = rinfo.num_prefixes; /* DXR: construct direct & range table */ for (i = da->updates_low; i <= da->updates_high; i++) { m = da->updates_mask[i >> 5] >> (i & 0x1f); if (m == 0) i |= 0x1f; else if (m & 1 && update_chunk(da, i) != 0) return; } range_frag = 0; if (da->rtbl_top) range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top; if (range_frag > V_frag_limit) { range_rebuild = 1; goto range_build; } r_size = sizeof(*da->range_tbl) * da->rtbl_top; microuptime(&t1); if (range_rebuild || abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) trie_rebuild = 1; trie_build: if (trie_rebuild) { da->trie_rebuilt_prefixes = da->prefixes; da->d_bits = DXR_D; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; if (!range_rebuild) memset(da->updates_mask, 0xff, sizeof(da->updates_mask)); } dxr2_try_squeeze: if (trie_rebuild) { /* Bulk cleanup */ bzero(da->trietbl, sizeof(da->trietbl)); bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl)); while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { LIST_REMOVE(tp, td_all_le); uma_zfree(trie_zone, tp); } LIST_INIT(&da->unused_trie); da->all_trie_cnt = da->unused_trie_cnt = 0; } /* Populate d_tbl, x_tbl */ dxr_x = DXR_TRIE_BITS - da->d_bits; d_tbl_size = (1 << da->d_bits); for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x; i++) { if (!trie_rebuild) { m = 0; for (int j = 0; j < (1 << dxr_x); j += 32) m |= da->updates_mask[((i << dxr_x) + j) >> 5]; if (m == 0) continue; trie_unref(da, i); } ti = trie_ref(da, i); if (ti < 0) return; da->d_tbl[i] = ti; } trie_frag = 0; if (da->all_trie_cnt) trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt; if (trie_frag > V_frag_limit) { trie_rebuild = 1; goto trie_build; } d_size = sizeof(*da->d_tbl) * d_tbl_size; x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size * da->all_trie_cnt; dxr_tot_size = d_size + x_size + r_size; if (trie_rebuild == 1) { /* Try to find a more compact D/X split */ if (prev_size == 0 || dxr_tot_size <= prev_size) da->d_bits--; else { da->d_bits++; trie_rebuild = 2; } prev_size = dxr_tot_size; goto dxr2_try_squeeze; } microuptime(&t2); dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT); if (dxr->d == NULL) { FIB_PRINTF(LOG_NOTICE, da->fd, "Unable to allocate DXR lookup table"); return; } memcpy(dxr->d, da->d_tbl, d_size); dxr->x = ((char *) dxr->d) + d_size; memcpy(dxr->x, da->x_tbl, x_size); dxr->r = ((char *) dxr->x) + x_size; dxr->d_shift = 32 - da->d_bits; dxr->x_shift = dxr_x; dxr->x_mask = 0xffffffffU >> (32 - dxr_x); memcpy(dxr->r, da->range_tbl, r_size); if (da->updates_low <= da->updates_high) bzero(&da->updates_mask[da->updates_low / 32], (da->updates_high - da->updates_low) / 8 + 1); da->updates_low = DIRECT_TBL_SIZE - 1; da->updates_high = 0; microuptime(&t3); FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)", da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops); i = dxr_tot_size * 100; if (rinfo.num_prefixes) i /= rinfo.num_prefixes; FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, i / 100, i % 100); FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% trie, %d.%02d%% range fragmentation", trie_frag / 100, trie_frag % 100, range_frag / 100, range_frag % 100); i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms", trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", i / 1000, i % 1000); } /* * Glue functions for attaching to the FIB_ALGO infrastructure. */ static struct nhop_object * dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) { struct dxr *dxr = algo_data; return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]); } static enum flm_op_result dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data) { struct dxr *old_dxr = old_data; struct dxr_aux *da = NULL; struct dxr *dxr; dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT); if (dxr == NULL) { FIB_PRINTF(LOG_NOTICE, fd, "Unable to allocate DXR container struct"); return (FLM_REBUILD); } /* Check whether we may reuse the old auxiliary structures */ if (old_dxr != NULL && old_dxr->aux != NULL && old_dxr->aux->fd == fd) { da = old_dxr->aux; atomic_add_int(&da->refcnt, 1); } dxr->aux = da; dxr->d = NULL; dxr->fd = fd; dxr->fibnum = fibnum; *data = dxr; return (FLM_SUCCESS); } static void dxr_destroy(void *data) { struct dxr *dxr = data; struct dxr_aux *da = dxr->aux; struct chunk_desc *cdp; struct trie_desc *tp; free(dxr->d, M_DXRLPM); free(dxr, M_DXRAUX); if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1) return; /* Release auxiliary structures */ while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) { LIST_REMOVE(cdp, cd_all_le); uma_zfree(chunk_zone, cdp); } while ((tp = LIST_FIRST(&da->all_trie)) != NULL) { LIST_REMOVE(tp, td_all_le); uma_zfree(trie_zone, tp); } free(da->range_tbl, M_DXRAUX); free(da->x_tbl, M_DXRAUX); free(da, M_DXRAUX); } static void epoch_dxr_destroy(epoch_context_t ctx) { struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx); dxr_destroy(dxr); } static void * choose_lookup_fn(struct dxr_aux *da) { switch (da->d_bits) { #if DXR_TRIE_BITS > 16 case 16: return (dxr_fib_lookup_16); #endif case 15: return (dxr_fib_lookup_15); case 14: return (dxr_fib_lookup_14); case 13: return (dxr_fib_lookup_13); case 12: return (dxr_fib_lookup_12); case 11: return (dxr_fib_lookup_11); case 10: return (dxr_fib_lookup_10); case 9: return (dxr_fib_lookup_9); } return (dxr_fib_lookup); } static enum flm_op_result dxr_dump_end(void *data, struct fib_dp *dp) { struct dxr *dxr = data; struct dxr_aux *da; dxr_build(dxr); da = dxr->aux; if (da == NULL || dxr->d == NULL) return (FLM_REBUILD); if (da->rtbl_top >= BASE_MAX) return (FLM_ERROR); dp->f = choose_lookup_fn(da); dp->arg = dxr; return (FLM_SUCCESS); } static enum flm_op_result dxr_dump_rib_item(struct rtentry *rt, void *data) { return (FLM_SUCCESS); } static enum flm_op_result dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc, void *data) { return (FLM_BATCH); } static enum flm_op_result dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q, void *data) { struct dxr *dxr = data; struct dxr *new_dxr; struct dxr_aux *da; struct fib_dp new_dp; enum flm_op_result res; uint32_t ip, plen, hmask, start, end, i, ui; #ifdef INVARIANTS struct rib_rtable_info rinfo; int update_delta = 0; #endif MPASS(data != NULL); MPASS(q != NULL); MPASS(q->count < q->size); da = dxr->aux; MPASS(da != NULL); MPASS(da->fd == dxr->fd); MPASS(da->refcnt > 0); FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count); for (ui = 0; ui < q->count; ui++) { #ifdef INVARIANTS if (q->entries[ui].nh_new != NULL) update_delta++; if (q->entries[ui].nh_old != NULL) update_delta--; #endif plen = q->entries[ui].plen; ip = ntohl(q->entries[ui].addr4.s_addr); if (plen < 32) hmask = 0xffffffffU >> plen; else hmask = 0; start = (ip & ~hmask) >> DXR_RANGE_SHIFT; end = (ip | hmask) >> DXR_RANGE_SHIFT; if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f) for (i = start >> 5; i <= end >> 5; i++) da->updates_mask[i] = 0xffffffffU; else for (i = start; i <= end; i++) da->updates_mask[i >> 5] |= (1 << (i & 0x1f)); if (start < da->updates_low) da->updates_low = start; if (end > da->updates_high) da->updates_high = end; } #ifdef INVARIANTS fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); MPASS(da->prefixes + update_delta == rinfo.num_prefixes); #endif res = dxr_init(0, dxr->fd, data, (void **) &new_dxr); if (res != FLM_SUCCESS) return (res); dxr_build(new_dxr); /* Structural limit exceeded, hard error */ if (da->rtbl_top >= BASE_MAX) { dxr_destroy(new_dxr); return (FLM_ERROR); } if (new_dxr->d == NULL) { dxr_destroy(new_dxr); return (FLM_REBUILD); } new_dp.f = choose_lookup_fn(da); new_dp.arg = new_dxr; if (fib_set_datapath_ptr(dxr->fd, &new_dp)) { fib_set_algo_ptr(dxr->fd, new_dxr); fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx); return (FLM_SUCCESS); } FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed"); dxr_destroy(new_dxr); return (FLM_REBUILD); } static uint8_t dxr_get_pref(const struct rib_rtable_info *rinfo) { /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */ return (251); } SYSCTL_DECL(_net_route_algo); SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DXR tunables"); static int sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS) { char buf[8]; int error, new, i; snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, V_frag_limit % 100); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (!isdigit(*buf) && *buf != '.') return (EINVAL); for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++) new = new * 10 + buf[i] - '0'; new *= 100; if (buf[i++] == '.') { if (!isdigit(buf[i])) return (EINVAL); new += (buf[i++] - '0') * 10; if (isdigit(buf[i])) new += buf[i++] - '0'; } if (new > 1000) return (EINVAL); V_frag_limit = new; snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, V_frag_limit % 100); return (0); } SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET, 0, 0, sysctl_dxr_frag_limit, "A", "Fragmentation threshold to full rebuild"); static struct fib_lookup_module fib_dxr_mod = { .flm_name = "dxr", .flm_family = AF_INET, .flm_init_cb = dxr_init, .flm_destroy_cb = dxr_destroy, .flm_dump_rib_item_cb = dxr_dump_rib_item, .flm_dump_end_cb = dxr_dump_end, .flm_change_rib_item_cb = dxr_change_rib_item, .flm_change_rib_items_cb = dxr_change_rib_batch, .flm_get_pref = dxr_get_pref, }; static int dxr_modevent(module_t mod, int type, void *unused) { int error; switch (type) { case MOD_LOAD: chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); fib_module_register(&fib_dxr_mod); return(0); case MOD_UNLOAD: error = fib_module_unregister(&fib_dxr_mod); if (error) return (error); uma_zdestroy(chunk_zone); uma_zdestroy(trie_zone); return(0); default: return(EOPNOTSUPP); } } static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0}; DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(fib_dxr, 1);