src/usr.sbin/bgpd/rde_decide.c

/*	$OpenBSD: rde_decide.c,v 1.101 2023/03/13 16:52:42 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <string.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

int	prefix_cmp(struct prefix *, struct prefix *, int *);
void	prefix_set_dmetric(struct prefix *, struct prefix *);
void	prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
void	prefix_remove(struct prefix *, struct rib_entry *);
/*
 * Decision Engine RFC implementation:
 *  Phase 1:
 *   - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes
 *   - IBGP routes may either use LOCAL_PREF or the local system computes
 *     the degree of preference
 *   - If the route is ineligible, the route MAY NOT serve as an input to
 *     the next phase of route selection
 *   - if the route is eligible the computed value MUST be used as the
 *     LOCAL_PREF value in any IBGP readvertisement
 *
 *  Phase 2:
 *   - If the NEXT_HOP attribute of a BGP route depicts an address that is
 *     not resolvable the BGP route MUST be excluded from the Phase 2 decision
 *     function.
 *   - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP
 *     route should be excluded from the Phase 2 decision function.
 *   - The local BGP speaker identifies the route that has:
 *     a) the highest degree of preference of any route to the same set
 *        of destinations
 *     b) is the only route to that destination
 *     c) is selected as a result of the Phase 2 tie breaking rules
 *   - The local speaker MUST determine the immediate next-hop address from
 *     the NEXT_HOP attribute of the selected route.
 *   - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,
 *     Phase 2 Route Selection MUST be performed again.
 *
 *  Route Resolvability Condition
 *   - A route Rte1, referencing only the intermediate network address, is
 *     considered resolvable if the Routing Table contains at least one
 *     resolvable route Rte2 that matches Rte1's intermediate network address
 *     and is not recursively resolved through Rte1.
 *   - Routes referencing interfaces are considered resolvable if the state of
 *     the referenced interface is up and IP processing is enabled.
 *
 *  Breaking Ties (Phase 2)
 *   1. Remove from consideration all routes which are not tied for having the
 *      smallest number of AS numbers present in their AS_PATH attributes.
 *      Note, that when counting this number, an AS_SET counts as 1
 *   2. Remove from consideration all routes which are not tied for having the
 *      lowest Origin number in their Origin attribute.
 *   3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC
 *      attributes. MULTI_EXIT_DISC is only comparable between routes learned
 *      from the same neighboring AS.
 *   4. If at least one of the candidate routes was received via EBGP,
 *      remove from consideration all routes which were received via IBGP.
 *   5. Remove from consideration any routes with less-preferred interior cost.
 *      If the NEXT_HOP hop for a route is reachable, but no cost can be
 *      determined, then this step should be skipped.
 *   6. Remove from consideration all routes other than the route that was
 *      advertised by the BGP speaker whose BGP Identifier has the lowest value.
 *   7. Prefer the route received from the lowest peer address.
 *
 * Phase 3: Route Dissemination
 *   - All routes in the Loc-RIB are processed into Adj-RIBs-Out according
 *     to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out
 *     unless the destination and NEXT_HOP described by this route may be
 *     forwarded appropriately by the Routing Table.
 */

/*
 * Decision Engine OUR implementation:
 * The filtering is done first. The filtering calculates the preference and
 * stores it in LOCAL_PREF (Phase 1).
 * Ineligible routes are flagged as ineligible via nexthop_add().
 * Phase 3 is done together with Phase 2.
 * In following cases a prefix needs to be reevaluated:
 *  - update of a prefix (prefix_update)
 *  - withdraw of a prefix (prefix_withdraw)
 *  - state change of the nexthop (nexthop-{in}validate)
 *  - state change of session (session down)
 */

/*
 * Compare two prefixes with equal pt_entry. Returns an integer greater than or
 * less than 0, according to whether the prefix p1 is more or less preferred
 * than the prefix p2. p1 should be used for the new prefix and p2 for a
 * already added prefix. The absolute value returned specifies the similarity
 * of the prefixes.
 *   1: prefixes differ because of validity
 *   2: prefixes don't belong in any multipath set
 *   3: prefixes belong only in the as-wide multipath set
 *   4: prefixes belong in both the ecmp and as-wide multipath set
 *   TODO: maybe we also need a strict ecmp set that requires
 *   prefixes to e.g. equal ASPATH or equal neighbor-as (like for MED).
 */
int
prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
{
	struct rde_aspath	*asp1, *asp2;
	struct rde_peer		*peer1, *peer2;
	struct attr		*a;
	uint32_t		 p1id, p2id;
	int			 p1cnt, p2cnt, i;
	int			 rv = 1;

	/*
	 * If a match happens before the MED check then the list is
	 * correctly sorted. If a match happens after MED then further
	 * elements may need to be checked to ensure that all paths
	 * which could affect this path were considered. This only
	 * matters for strict MED evaluation and in that case testall
	 * is set to 1. If the check happens to be on the MED check
	 * itself testall is set to 2.
	 */
	*testall = 0;

	if (p1 == NULL)
		return -rv;
	if (p2 == NULL)
		return rv;

	asp1 = prefix_aspath(p1);
	asp2 = prefix_aspath(p2);
	peer1 = prefix_peer(p1);
	peer2 = prefix_peer(p2);

	/* 1. check if prefix is eligible a.k.a reachable */
	if (!prefix_eligible(p2))
		return rv;
	if (!prefix_eligible(p1))
		return -rv;

	/* bump rv, from here on prefix is considered valid */
	rv++;

	/* 2. local preference of prefix, bigger is better */
	if (asp1->lpref > asp2->lpref)
		return rv;
	if (asp1->lpref < asp2->lpref)
		return -rv;

	/* 3. aspath count, the shorter the better */
	if (asp1->aspath->ascnt < asp2->aspath->ascnt)
		return rv;
	if (asp1->aspath->ascnt > asp2->aspath->ascnt)
		return -rv;

	/* 4. origin, the lower the better */
	if (asp1->origin < asp2->origin)
		return rv;
	if (asp1->origin > asp2->origin)
		return -rv;

	/*
	 * 5. MED decision
	 * Only comparable between the same neighboring AS or if
	 * 'rde med compare always' is set. In the first case
	 * set the testall flag since further elements need to be
	 * evaluated as well.
	 */
	if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
	    aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
		if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
			*testall = 2;
		/* lowest value wins */
		if (asp1->med < asp2->med)
			return rv;
		if (asp1->med > asp2->med)
			return -rv;
	}

	if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
		*testall = 1;

	/*
	 * 6. EBGP is cooler than IBGP
	 * It is absolutely important that the ebgp value in peer_config.ebgp
	 * is bigger than all other ones (IBGP, confederations)
	 */
	if (peer1->conf.ebgp != peer2->conf.ebgp) {
		if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */
			return rv;
		else if (peer2->conf.ebgp) /* peer2 is EBGP */
			return -rv;
	}

	/* bump rv, as-wide multipath */
	rv++;

	/*
	 * 7. local tie-breaker, this weight is here to tip equal long AS
	 * paths in one or the other direction. It happens more and more
	 * that AS paths are equally long and so traffic engineering needs
	 * a metric that weights a prefix at a very late stage in the
	 * decision process.
	 */
	if (asp1->weight > asp2->weight)
		return rv;
	if (asp1->weight < asp2->weight)
		return -rv;

	/* 8. nexthop costs. NOT YET -> IGNORE */

	/* bump rv, equal cost multipath */
	rv++;

	/*
	 * 9. older route (more stable) wins but only if route-age
	 * evaluation is enabled.
	 */
	if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) {
		if (p1->lastchange < p2->lastchange) /* p1 is older */
			return rv;
		if (p1->lastchange > p2->lastchange)
			return -rv;
	}

	/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */
	if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) {
		memcpy(&p1id, a->data, sizeof(p1id));
		p1id = ntohl(p1id);
	} else
		p1id = peer1->remote_bgpid;
	if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) {
		memcpy(&p2id, a->data, sizeof(p2id));
		p2id = ntohl(p2id);
	} else
		p2id = peer2->remote_bgpid;
	if (p1id < p2id)
		return rv;
	if (p1id > p2id)
		return -rv;

	/* 11. compare CLUSTER_LIST length, shorter is better */
	p1cnt = p2cnt = 0;
	if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL)
		p1cnt = a->len / sizeof(uint32_t);
	if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL)
		p2cnt = a->len / sizeof(uint32_t);
	if (p1cnt < p2cnt)
		return rv;
	if (p1cnt > p2cnt)
		return -rv;

	/* 12. lowest peer address wins (IPv4 is better than IPv6) */
	if (peer1->remote_addr.aid < peer2->remote_addr.aid)
		return rv;
	if (peer1->remote_addr.aid > peer2->remote_addr.aid)
		return -rv;
	switch (peer1->remote_addr.aid) {
	case AID_INET:
		i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4,
		    sizeof(struct in_addr));
		break;
	case AID_INET6:
		i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6,
		    sizeof(struct in6_addr));
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	if (i < 0)
		return rv;
	if (i > 0)
		return -rv;

	/* RFC7911 does not specify this but something like this is needed. */
	/* 13. lowest path identifier wins */
	if (p1->path_id < p2->path_id)
		return rv;
	if (p1->path_id > p2->path_id)
		return -rv;

	fatalx("Uh, oh a politician in the decision process");
}

/*
 * set the dmetric value of np based on the return value of
 * prefix_evaluate(pp, np) or set it to either PREFIX_DMETRIC_BEST
 * or PREFIX_DMETRIC_INVALID for the first element.
 */
void
prefix_set_dmetric(struct prefix *pp, struct prefix *np)
{
	int testall;

	if (np != NULL) {
		if (pp == NULL)
			np->dmetric = prefix_eligible(np) ?
			    PREFIX_DMETRIC_BEST : PREFIX_DMETRIC_INVALID;
		else
			np->dmetric = prefix_cmp(pp, np, &testall);
		if (np->dmetric < 0) {
			struct bgpd_addr addr;
			pt_getaddr(np->pt, &addr);
			log_debug("bad dmetric in decision process: %s/%u",
			    log_addr(&addr), np->pt->prefixlen);
		}
	}
}

/*
 * Insert a prefix keeping the total order of the list. For routes
 * that may depend on a MED selection the set is scanned until the
 * condition is cleared. If a MED inversion is detected the respective
 * prefix is taken of the rib list and put onto a redo queue. All
 * prefixes on the redo queue are re-inserted at the end.
 */
void
prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
{
	struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);
	struct prefix *xp, *np, *insertp = ep;
	int testall, preferred, selected = 0, removed = 0;

	/* start scan at the entry point (ep) or the head if ep == NULL */
	if (ep == NULL)
		ep = TAILQ_FIRST(&re->prefix_h);

	for (xp = ep; xp != NULL; xp = np) {
		np = TAILQ_NEXT(xp, entry.list.rib);

		if ((preferred = (prefix_cmp(new, xp, &testall) > 0))) {
			/* new is preferred over xp */
			if (testall == 2) {
				/*
				 * MED inversion, take out prefix and
				 * put it onto redo queue.
				 */
				TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);
				TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);
				removed = 1;
				continue;
			}

			if (testall == 1) {
				/*
				 * lock insertion point and
				 * continue on with scan
				 */
				selected = 1;
			}
		} else {
			/*
			 * xp is preferred over new.
			 * Remember insertion point for later unless the
			 * traverse is just looking for a possible MED
			 * inversion (selected == 1).
			 * If the last comparison's tie-breaker was the MED
			 * check reset selected and with it insertp since
			 * this was an actual MED priority inversion.
			 */
			if (testall == 2)
				selected = 0;
			if (!selected)
				insertp = xp;
		}

		/*
		 * If previous element(s) got removed, fixup the
		 * dmetric, now that it is clear that this element
		 * is on the list.
		 */
		if (removed) {
			prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,
			    entry.list.rib), xp);
			removed = 0;
		}

		if (preferred && testall == 0)
			break;			/* we're done */
	}

	if (insertp == NULL) {
		TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
	} else {
		TAILQ_INSERT_AFTER(&re->prefix_h, insertp, new, entry.list.rib);
	}

	prefix_set_dmetric(insertp, new);
	prefix_set_dmetric(new, TAILQ_NEXT(new, entry.list.rib));

	/* Fixup MED order again. All elements are < new */
	while (!TAILQ_EMPTY(&redo)) {
		xp = TAILQ_FIRST(&redo);
		TAILQ_REMOVE(&redo, xp, entry.list.rib);

		prefix_insert(xp, new, re);
	}
}

/*
 * Remove a prefix from the RIB list ensuring that the total order of the
 * list remains intact. All routes that differ in the MED are taken of the
 * list and put on the redo list. To figure out if a route could cause a
 * resort because of a MED check the next prefix of the to-remove prefix
 * is compared with the old prefix. A full scan is only done if the next
 * route differs because of the MED or later checks.
 * Again at the end all routes on the redo queue are reinserted.
 */
void
prefix_remove(struct prefix *old, struct rib_entry *re)
{
	struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);
	struct prefix *xp, *np, *pp;
	int testall, removed = 0;

	xp = TAILQ_NEXT(old, entry.list.rib);
	pp = TAILQ_PREV(old, prefix_queue, entry.list.rib);
	TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);

	/* check if a MED inversion could be possible */
	prefix_cmp(old, xp, &testall);
	if (testall > 0) {
		/* maybe MED route, scan tail for other possible routes */
		for (; xp != NULL; xp = np) {
			np = TAILQ_NEXT(xp, entry.list.rib);

			/* only interested in the testall result */
			prefix_cmp(old, xp, &testall);
			if (testall == 2) {
				/*
				 * possible MED inversion, take out prefix and
				 * put it onto redo queue.
				 */
				TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);
				TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);
				removed = 1;
				continue;
			}
			/*
			 * If previous element(s) got removed, fixup the
			 * dmetric, now that it is clear that this element
			 * is on the list.
			 */
			if (removed) {
				prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,
				    entry.list.rib), xp);
				removed = 0;
			}
			if (testall == 0)
				break;		/* we're done */
		}
	}

	if (pp)
		prefix_set_dmetric(pp, TAILQ_NEXT(pp, entry.list.rib));
	else
		prefix_set_dmetric(NULL, TAILQ_FIRST(&re->prefix_h));

	/* Fixup MED order again, reinsert prefixes from the start */
	while (!TAILQ_EMPTY(&redo)) {
		xp = TAILQ_FIRST(&redo);
		TAILQ_REMOVE(&redo, xp, entry.list.rib);

		prefix_insert(xp, NULL, re);
	}
}

/* helper function to check if a prefix is valid to be selected */
int
prefix_eligible(struct prefix *p)
{
	struct rde_aspath *asp = prefix_aspath(p);

	/* The aspath needs to be loop and error free */
	if (asp == NULL ||
	    asp->flags & (F_ATTR_LOOP|F_ATTR_OTC_LEAK|F_ATTR_PARSE_ERR))
		return 0;

	/* The nexthop must be valid. */
	if (!prefix_nhvalid(p))
		return 0;

	return 1;
}

struct prefix *
prefix_best(struct rib_entry *re)
{
	struct prefix	*xp;
	struct rib	*rib;

	rib = re_rib(re);
	if (rib->flags & F_RIB_NOEVALUATE)
		/* decision process is turned off */
		return NULL;

	xp = TAILQ_FIRST(&re->prefix_h);
	if (xp != NULL && !prefix_eligible(xp))
		xp = NULL;
	return xp;
}

/*
 * Find the correct place to insert the prefix in the prefix list.
 * If the active prefix has changed we need to send an update also special
 * treatment is needed if 'rde evaluate all' is used on some peers.
 * To re-evaluate a prefix just call prefix_evaluate with old and new pointing
 * to the same prefix.
 */
void
prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old)
{
	struct prefix	*newbest, *oldbest;
	struct rib	*rib;

	rib = re_rib(re);
	if (rib->flags & F_RIB_NOEVALUATE) {
		/* decision process is turned off */
		if (old != NULL)
			TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
		if (new != NULL) {
			TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
			new->dmetric = PREFIX_DMETRIC_INVALID;
		}
		return;
	}

	oldbest = prefix_best(re);
	if (old != NULL)
		prefix_remove(old, re);
	if (new != NULL)
		prefix_insert(new, NULL, re);
	newbest = prefix_best(re);

	/*
	 * If the active prefix changed or the active prefix was removed
	 * and added again then generate an update.
	 */
	if (oldbest != newbest || (old != NULL && newbest == old)) {
		/*
		 * Send update withdrawing oldbest and adding newbest
		 * but remember that newbest may be NULL aka ineligible.
		 * Additional decision may be made by the called functions.
		 */
		if ((rib->flags & F_RIB_NOFIB) == 0)
			rde_send_kroute(rib, newbest, oldbest);
		rde_generate_updates(re, new, old, EVAL_DEFAULT);
		return;
	}

	/*
	 * If there are peers with 'rde evaluate all' every update needs
	 * to be passed on (not only a change of the best prefix).
	 * rde_generate_updates() will then take care of distribution.
	 */
	if (rde_evaluate_all())
		if ((new != NULL && prefix_eligible(new)) || old != NULL)
			rde_generate_updates(re, new, old, EVAL_ALL);
}

void
prefix_evaluate_nexthop(struct prefix *p, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct rib_entry *re = prefix_re(p);
	struct prefix	*newbest, *oldbest;
	struct rib	*rib;

	/* Skip non local-RIBs or RIBs that are flagged as noeval. */
	rib = re_rib(re);
	if (rib->flags & F_RIB_NOEVALUATE) {
		log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
		return;
	}

	if (oldstate == state) {
		/*
		 * The state of the nexthop did not change. The only
		 * thing that may have changed is the true_nexthop
		 * or other internal infos. This will not change
		 * the routing decision so shortcut here.
		 * XXX needs to be changed for ECMP
		 */
		if (state == NEXTHOP_REACH) {
			if ((rib->flags & F_RIB_NOFIB) == 0 &&
			    p == prefix_best(re))
				rde_send_kroute(rib, p, NULL);
		}
		return;
	}

	/*
	 * Re-evaluate the prefix by removing the prefix then updating the
	 * nexthop state and reinserting the prefix again.
	 */
	oldbest = prefix_best(re);
	prefix_remove(p, re);

	if (state == NEXTHOP_REACH)
		p->nhflags |= NEXTHOP_VALID;
	else
		p->nhflags &= ~NEXTHOP_VALID;

	prefix_insert(p, NULL, re);
	newbest = prefix_best(re);

	/*
	 * If the active prefix changed or the active prefix was removed
	 * and added again then generate an update.
	 */
	if (oldbest != newbest || newbest == p) {
		/*
		 * Send update withdrawing oldbest and adding newbest
		 * but remember that newbest may be NULL aka ineligible.
		 * Additional decision may be made by the called functions.
		 */
		if ((rib->flags & F_RIB_NOFIB) == 0)
			rde_send_kroute(rib, newbest, oldbest);
		rde_generate_updates(re, p, p, EVAL_DEFAULT);
		return;
	}

	/*
	 * If there are peers with 'rde evaluate all' every update needs
	 * to be passed on (not only a change of the best prefix).
	 * rde_generate_updates() will then take care of distribution.
	 */
	if (rde_evaluate_all())
		rde_generate_updates(re, p, p, EVAL_ALL);
}
SecBSD 1.3 -current: The Digital Resistance. A belief in the importance of privacy and the right to secure communication. 2023-04-30 03:15:27 +02:00			`/* $OpenBSD: rde_decide.c,v 1.101 2023/03/13 16:52:42 claudio Exp $ */`

			`/*`
			`* Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>`
			`* Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>`
			`*`
			`* Permission to use, copy, modify, and distribute this software for any`
			`* purpose with or without fee is hereby granted, provided that the above`
			`* copyright notice and this permission notice appear in all copies.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES`
			`* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF`
			`* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR`
			`* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES`
			`* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN`
			`* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF`
			`* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.`
			`*/`

			`#include <sys/types.h>`
			`#include <sys/queue.h>`

			`#include <string.h>`

			`#include "bgpd.h"`
			`#include "rde.h"`
			`#include "log.h"`

			`int prefix_cmp(struct prefix , struct prefix , int *);`
			`void prefix_set_dmetric(struct prefix , struct prefix );`
			`void prefix_insert(struct prefix , struct prefix , struct rib_entry *);`
			`void prefix_remove(struct prefix , struct rib_entry );`
			`/*`
			`* Decision Engine RFC implementation:`
			`* Phase 1:`
			`* - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes`
			`* - IBGP routes may either use LOCAL_PREF or the local system computes`
			`* the degree of preference`
			`* - If the route is ineligible, the route MAY NOT serve as an input to`
			`* the next phase of route selection`
			`* - if the route is eligible the computed value MUST be used as the`
			`* LOCAL_PREF value in any IBGP readvertisement`
			`*`
			`* Phase 2:`
			`* - If the NEXT_HOP attribute of a BGP route depicts an address that is`
			`* not resolvable the BGP route MUST be excluded from the Phase 2 decision`
			`* function.`
			`* - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP`
			`* route should be excluded from the Phase 2 decision function.`
			`* - The local BGP speaker identifies the route that has:`
			`* a) the highest degree of preference of any route to the same set`
			`* of destinations`
			`* b) is the only route to that destination`
			`* c) is selected as a result of the Phase 2 tie breaking rules`
			`* - The local speaker MUST determine the immediate next-hop address from`
			`* the NEXT_HOP attribute of the selected route.`
			`* - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,`
			`* Phase 2 Route Selection MUST be performed again.`
			`*`
			`* Route Resolvability Condition`
			`* - A route Rte1, referencing only the intermediate network address, is`
			`* considered resolvable if the Routing Table contains at least one`
			`* resolvable route Rte2 that matches Rte1's intermediate network address`
			`* and is not recursively resolved through Rte1.`
			`* - Routes referencing interfaces are considered resolvable if the state of`
			`* the referenced interface is up and IP processing is enabled.`
			`*`
			`* Breaking Ties (Phase 2)`
			`* 1. Remove from consideration all routes which are not tied for having the`
			`* smallest number of AS numbers present in their AS_PATH attributes.`
			`* Note, that when counting this number, an AS_SET counts as 1`
			`* 2. Remove from consideration all routes which are not tied for having the`
			`* lowest Origin number in their Origin attribute.`
			`* 3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC`
			`* attributes. MULTI_EXIT_DISC is only comparable between routes learned`
			`* from the same neighboring AS.`
			`* 4. If at least one of the candidate routes was received via EBGP,`
			`* remove from consideration all routes which were received via IBGP.`
			`* 5. Remove from consideration any routes with less-preferred interior cost.`
			`* If the NEXT_HOP hop for a route is reachable, but no cost can be`
			`* determined, then this step should be skipped.`
			`* 6. Remove from consideration all routes other than the route that was`
			`* advertised by the BGP speaker whose BGP Identifier has the lowest value.`
			`* 7. Prefer the route received from the lowest peer address.`
			`*`
			`* Phase 3: Route Dissemination`
			`* - All routes in the Loc-RIB are processed into Adj-RIBs-Out according`
			`* to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out`
			`* unless the destination and NEXT_HOP described by this route may be`
			`* forwarded appropriately by the Routing Table.`
			`*/`

			`/*`
			`* Decision Engine OUR implementation:`
			`* The filtering is done first. The filtering calculates the preference and`
			`* stores it in LOCAL_PREF (Phase 1).`
			`* Ineligible routes are flagged as ineligible via nexthop_add().`
			`* Phase 3 is done together with Phase 2.`
			`* In following cases a prefix needs to be reevaluated:`
			`* - update of a prefix (prefix_update)`
			`* - withdraw of a prefix (prefix_withdraw)`
			`* - state change of the nexthop (nexthop-{in}validate)`
			`* - state change of session (session down)`
			`*/`

			`/*`
			`* Compare two prefixes with equal pt_entry. Returns an integer greater than or`
			`* less than 0, according to whether the prefix p1 is more or less preferred`
			`* than the prefix p2. p1 should be used for the new prefix and p2 for a`
			`* already added prefix. The absolute value returned specifies the similarity`
			`* of the prefixes.`
			`* 1: prefixes differ because of validity`
			`* 2: prefixes don't belong in any multipath set`
			`* 3: prefixes belong only in the as-wide multipath set`
			`* 4: prefixes belong in both the ecmp and as-wide multipath set`
			`* TODO: maybe we also need a strict ecmp set that requires`
			`* prefixes to e.g. equal ASPATH or equal neighbor-as (like for MED).`
			`*/`
			`int`
			`prefix_cmp(struct prefix p1, struct prefix p2, int *testall)`
			`{`
			`struct rde_aspath asp1, asp2;`
			`struct rde_peer peer1, peer2;`
			`struct attr *a;`
			`uint32_t p1id, p2id;`
			`int p1cnt, p2cnt, i;`
			`int rv = 1;`

			`/*`
			`* If a match happens before the MED check then the list is`
			`* correctly sorted. If a match happens after MED then further`
			`* elements may need to be checked to ensure that all paths`
			`* which could affect this path were considered. This only`
			`* matters for strict MED evaluation and in that case testall`
			`* is set to 1. If the check happens to be on the MED check`
			`* itself testall is set to 2.`
			`*/`
			`*testall = 0;`

			`if (p1 == NULL)`
			`return -rv;`
			`if (p2 == NULL)`
			`return rv;`

			`asp1 = prefix_aspath(p1);`
			`asp2 = prefix_aspath(p2);`
			`peer1 = prefix_peer(p1);`
			`peer2 = prefix_peer(p2);`

			`/* 1. check if prefix is eligible a.k.a reachable */`
			`if (!prefix_eligible(p2))`
			`return rv;`
			`if (!prefix_eligible(p1))`
			`return -rv;`

			`/* bump rv, from here on prefix is considered valid */`
			`rv++;`

			`/* 2. local preference of prefix, bigger is better */`
			`if (asp1->lpref > asp2->lpref)`
			`return rv;`
			`if (asp1->lpref < asp2->lpref)`
			`return -rv;`

			`/* 3. aspath count, the shorter the better */`
			`if (asp1->aspath->ascnt < asp2->aspath->ascnt)`
			`return rv;`
			`if (asp1->aspath->ascnt > asp2->aspath->ascnt)`
			`return -rv;`

			`/* 4. origin, the lower the better */`
			`if (asp1->origin < asp2->origin)`
			`return rv;`
			`if (asp1->origin > asp2->origin)`
			`return -rv;`

			`/*`
			`* 5. MED decision`
			`* Only comparable between the same neighboring AS or if`
			`* 'rde med compare always' is set. In the first case`
			`* set the testall flag since further elements need to be`
			`* evaluated as well.`
			`*/`
			`if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) \|\|`
			`aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {`
			`if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))`
			`*testall = 2;`
			`/* lowest value wins */`
			`if (asp1->med < asp2->med)`
			`return rv;`
			`if (asp1->med > asp2->med)`
			`return -rv;`
			`}`

			`if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))`
			`*testall = 1;`

			`/*`
			`* 6. EBGP is cooler than IBGP`
			`* It is absolutely important that the ebgp value in peer_config.ebgp`
			`* is bigger than all other ones (IBGP, confederations)`
			`*/`
			`if (peer1->conf.ebgp != peer2->conf.ebgp) {`
			`if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */`
			`return rv;`
			`else if (peer2->conf.ebgp) /* peer2 is EBGP */`
			`return -rv;`
			`}`

			`/* bump rv, as-wide multipath */`
			`rv++;`

			`/*`
			`* 7. local tie-breaker, this weight is here to tip equal long AS`
			`* paths in one or the other direction. It happens more and more`
			`* that AS paths are equally long and so traffic engineering needs`
			`* a metric that weights a prefix at a very late stage in the`
			`* decision process.`
			`*/`
			`if (asp1->weight > asp2->weight)`
			`return rv;`
			`if (asp1->weight < asp2->weight)`
			`return -rv;`

			`/* 8. nexthop costs. NOT YET -> IGNORE */`

			`/* bump rv, equal cost multipath */`
			`rv++;`

			`/*`
			`* 9. older route (more stable) wins but only if route-age`
			`* evaluation is enabled.`
			`*/`
			`if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) {`
			`if (p1->lastchange < p2->lastchange) /* p1 is older */`
			`return rv;`
			`if (p1->lastchange > p2->lastchange)`
			`return -rv;`
			`}`

			`/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */`
			`if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) {`
			`memcpy(&p1id, a->data, sizeof(p1id));`
			`p1id = ntohl(p1id);`
			`} else`
			`p1id = peer1->remote_bgpid;`
			`if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) {`
			`memcpy(&p2id, a->data, sizeof(p2id));`
			`p2id = ntohl(p2id);`
			`} else`
			`p2id = peer2->remote_bgpid;`
			`if (p1id < p2id)`
			`return rv;`
			`if (p1id > p2id)`
			`return -rv;`

			`/* 11. compare CLUSTER_LIST length, shorter is better */`
			`p1cnt = p2cnt = 0;`
			`if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL)`
			`p1cnt = a->len / sizeof(uint32_t);`
			`if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL)`
			`p2cnt = a->len / sizeof(uint32_t);`
			`if (p1cnt < p2cnt)`
			`return rv;`
			`if (p1cnt > p2cnt)`
			`return -rv;`

			`/* 12. lowest peer address wins (IPv4 is better than IPv6) */`
			`if (peer1->remote_addr.aid < peer2->remote_addr.aid)`
			`return rv;`
			`if (peer1->remote_addr.aid > peer2->remote_addr.aid)`
			`return -rv;`
			`switch (peer1->remote_addr.aid) {`
			`case AID_INET:`
			`i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4,`
			`sizeof(struct in_addr));`
			`break;`
			`case AID_INET6:`
			`i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6,`
			`sizeof(struct in6_addr));`
			`break;`
			`default:`
			`fatalx("%s: unknown af", __func__);`
			`}`
			`if (i < 0)`
			`return rv;`
			`if (i > 0)`
			`return -rv;`

			`/* RFC7911 does not specify this but something like this is needed. */`
			`/* 13. lowest path identifier wins */`
			`if (p1->path_id < p2->path_id)`
			`return rv;`
			`if (p1->path_id > p2->path_id)`
			`return -rv;`

			`fatalx("Uh, oh a politician in the decision process");`
			`}`

			`/*`
			`* set the dmetric value of np based on the return value of`
			`* prefix_evaluate(pp, np) or set it to either PREFIX_DMETRIC_BEST`
			`* or PREFIX_DMETRIC_INVALID for the first element.`
			`*/`
			`void`
			`prefix_set_dmetric(struct prefix pp, struct prefix np)`
			`{`
			`int testall;`

			`if (np != NULL) {`
			`if (pp == NULL)`
			`np->dmetric = prefix_eligible(np) ?`
			`PREFIX_DMETRIC_BEST : PREFIX_DMETRIC_INVALID;`
			`else`
			`np->dmetric = prefix_cmp(pp, np, &testall);`
			`if (np->dmetric < 0) {`
			`struct bgpd_addr addr;`
			`pt_getaddr(np->pt, &addr);`
			`log_debug("bad dmetric in decision process: %s/%u",`
			`log_addr(&addr), np->pt->prefixlen);`
			`}`
			`}`
			`}`

			`/*`
			`* Insert a prefix keeping the total order of the list. For routes`
			`* that may depend on a MED selection the set is scanned until the`
			`* condition is cleared. If a MED inversion is detected the respective`
			`* prefix is taken of the rib list and put onto a redo queue. All`
			`* prefixes on the redo queue are re-inserted at the end.`
			`*/`
			`void`
			`prefix_insert(struct prefix new, struct prefix ep, struct rib_entry *re)`
			`{`
			`struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);`
			`struct prefix xp, np, *insertp = ep;`
			`int testall, preferred, selected = 0, removed = 0;`

			`/* start scan at the entry point (ep) or the head if ep == NULL */`
			`if (ep == NULL)`
			`ep = TAILQ_FIRST(&re->prefix_h);`

			`for (xp = ep; xp != NULL; xp = np) {`
			`np = TAILQ_NEXT(xp, entry.list.rib);`

			`if ((preferred = (prefix_cmp(new, xp, &testall) > 0))) {`
			`/* new is preferred over xp */`
			`if (testall == 2) {`
			`/*`
			`* MED inversion, take out prefix and`
			`* put it onto redo queue.`
			`*/`
			`TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);`
			`TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);`
			`removed = 1;`
			`continue;`
			`}`

			`if (testall == 1) {`
			`/*`
			`* lock insertion point and`
			`* continue on with scan`
			`*/`
			`selected = 1;`
			`}`
			`} else {`
			`/*`
			`* xp is preferred over new.`
			`* Remember insertion point for later unless the`
			`* traverse is just looking for a possible MED`
			`* inversion (selected == 1).`
			`* If the last comparison's tie-breaker was the MED`
			`* check reset selected and with it insertp since`
			`* this was an actual MED priority inversion.`
			`*/`
			`if (testall == 2)`
			`selected = 0;`
			`if (!selected)`
			`insertp = xp;`
			`}`

			`/*`
			`* If previous element(s) got removed, fixup the`
			`* dmetric, now that it is clear that this element`
			`* is on the list.`
			`*/`
			`if (removed) {`
			`prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,`
			`entry.list.rib), xp);`
			`removed = 0;`
			`}`

			`if (preferred && testall == 0)`
			`break; /* we're done */`
			`}`

			`if (insertp == NULL) {`
			`TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);`
			`} else {`
			`TAILQ_INSERT_AFTER(&re->prefix_h, insertp, new, entry.list.rib);`
			`}`

			`prefix_set_dmetric(insertp, new);`
			`prefix_set_dmetric(new, TAILQ_NEXT(new, entry.list.rib));`

			`/* Fixup MED order again. All elements are < new */`
			`while (!TAILQ_EMPTY(&redo)) {`
			`xp = TAILQ_FIRST(&redo);`
			`TAILQ_REMOVE(&redo, xp, entry.list.rib);`

			`prefix_insert(xp, new, re);`
			`}`
			`}`

			`/*`
			`* Remove a prefix from the RIB list ensuring that the total order of the`
			`* list remains intact. All routes that differ in the MED are taken of the`
			`* list and put on the redo list. To figure out if a route could cause a`
			`* resort because of a MED check the next prefix of the to-remove prefix`
			`* is compared with the old prefix. A full scan is only done if the next`
			`* route differs because of the MED or later checks.`
			`* Again at the end all routes on the redo queue are reinserted.`
			`*/`
			`void`
			`prefix_remove(struct prefix old, struct rib_entry re)`
			`{`
			`struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);`
			`struct prefix xp, np, *pp;`
			`int testall, removed = 0;`

			`xp = TAILQ_NEXT(old, entry.list.rib);`
			`pp = TAILQ_PREV(old, prefix_queue, entry.list.rib);`
			`TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);`

			`/* check if a MED inversion could be possible */`
			`prefix_cmp(old, xp, &testall);`
			`if (testall > 0) {`
			`/* maybe MED route, scan tail for other possible routes */`
			`for (; xp != NULL; xp = np) {`
			`np = TAILQ_NEXT(xp, entry.list.rib);`

			`/* only interested in the testall result */`
			`prefix_cmp(old, xp, &testall);`
			`if (testall == 2) {`
			`/*`
			`* possible MED inversion, take out prefix and`
			`* put it onto redo queue.`
			`*/`
			`TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);`
			`TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);`
			`removed = 1;`
			`continue;`
			`}`
			`/*`
			`* If previous element(s) got removed, fixup the`
			`* dmetric, now that it is clear that this element`
			`* is on the list.`
			`*/`
			`if (removed) {`
			`prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,`
			`entry.list.rib), xp);`
			`removed = 0;`
			`}`
			`if (testall == 0)`
			`break; /* we're done */`
			`}`
			`}`

			`if (pp)`
			`prefix_set_dmetric(pp, TAILQ_NEXT(pp, entry.list.rib));`
			`else`
			`prefix_set_dmetric(NULL, TAILQ_FIRST(&re->prefix_h));`

			`/* Fixup MED order again, reinsert prefixes from the start */`
			`while (!TAILQ_EMPTY(&redo)) {`
			`xp = TAILQ_FIRST(&redo);`
			`TAILQ_REMOVE(&redo, xp, entry.list.rib);`

			`prefix_insert(xp, NULL, re);`
			`}`
			`}`

			`/* helper function to check if a prefix is valid to be selected */`
			`int`
			`prefix_eligible(struct prefix *p)`
			`{`
			`struct rde_aspath *asp = prefix_aspath(p);`

			`/* The aspath needs to be loop and error free */`
			`if (asp == NULL \|\|`
			`asp->flags & (F_ATTR_LOOP\|F_ATTR_OTC_LEAK\|F_ATTR_PARSE_ERR))`
			`return 0;`

			`/* The nexthop must be valid. */`
			`if (!prefix_nhvalid(p))`
			`return 0;`

			`return 1;`
			`}`

			`struct prefix *`
			`prefix_best(struct rib_entry *re)`
			`{`
			`struct prefix *xp;`
			`struct rib *rib;`

			`rib = re_rib(re);`
			`if (rib->flags & F_RIB_NOEVALUATE)`
			`/* decision process is turned off */`
			`return NULL;`

			`xp = TAILQ_FIRST(&re->prefix_h);`
			`if (xp != NULL && !prefix_eligible(xp))`
			`xp = NULL;`
			`return xp;`
			`}`

			`/*`
			`* Find the correct place to insert the prefix in the prefix list.`
			`* If the active prefix has changed we need to send an update also special`
			`* treatment is needed if 'rde evaluate all' is used on some peers.`
			`* To re-evaluate a prefix just call prefix_evaluate with old and new pointing`
			`* to the same prefix.`
			`*/`
			`void`
			`prefix_evaluate(struct rib_entry re, struct prefix new, struct prefix *old)`
			`{`
			`struct prefix newbest, oldbest;`
			`struct rib *rib;`

			`rib = re_rib(re);`
			`if (rib->flags & F_RIB_NOEVALUATE) {`
			`/* decision process is turned off */`
			`if (old != NULL)`
			`TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);`
			`if (new != NULL) {`
			`TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);`
			`new->dmetric = PREFIX_DMETRIC_INVALID;`
			`}`
			`return;`
			`}`

			`oldbest = prefix_best(re);`
			`if (old != NULL)`
			`prefix_remove(old, re);`
			`if (new != NULL)`
			`prefix_insert(new, NULL, re);`
			`newbest = prefix_best(re);`

			`/*`
			`* If the active prefix changed or the active prefix was removed`
			`* and added again then generate an update.`
			`*/`
			`if (oldbest != newbest \|\| (old != NULL && newbest == old)) {`
			`/*`
			`* Send update withdrawing oldbest and adding newbest`
			`* but remember that newbest may be NULL aka ineligible.`
			`* Additional decision may be made by the called functions.`
			`*/`
			`if ((rib->flags & F_RIB_NOFIB) == 0)`
			`rde_send_kroute(rib, newbest, oldbest);`
			`rde_generate_updates(re, new, old, EVAL_DEFAULT);`
			`return;`
			`}`

			`/*`
			`* If there are peers with 'rde evaluate all' every update needs`
			`* to be passed on (not only a change of the best prefix).`
			`* rde_generate_updates() will then take care of distribution.`
			`*/`
			`if (rde_evaluate_all())`
			`if ((new != NULL && prefix_eligible(new)) \|\| old != NULL)`
			`rde_generate_updates(re, new, old, EVAL_ALL);`
			`}`

			`void`
			`prefix_evaluate_nexthop(struct prefix *p, enum nexthop_state state,`
			`enum nexthop_state oldstate)`
			`{`
			`struct rib_entry *re = prefix_re(p);`
			`struct prefix newbest, oldbest;`
			`struct rib *rib;`

			`/* Skip non local-RIBs or RIBs that are flagged as noeval. */`
			`rib = re_rib(re);`
			`if (rib->flags & F_RIB_NOEVALUATE) {`
			`log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);`
			`return;`
			`}`

			`if (oldstate == state) {`
			`/*`
			`* The state of the nexthop did not change. The only`
			`* thing that may have changed is the true_nexthop`
			`* or other internal infos. This will not change`
			`* the routing decision so shortcut here.`
			`* XXX needs to be changed for ECMP`
			`*/`
			`if (state == NEXTHOP_REACH) {`
			`if ((rib->flags & F_RIB_NOFIB) == 0 &&`
			`p == prefix_best(re))`
			`rde_send_kroute(rib, p, NULL);`
			`}`
			`return;`
			`}`

			`/*`
			`* Re-evaluate the prefix by removing the prefix then updating the`
			`* nexthop state and reinserting the prefix again.`
			`*/`
			`oldbest = prefix_best(re);`
			`prefix_remove(p, re);`

			`if (state == NEXTHOP_REACH)`
			`p->nhflags \|= NEXTHOP_VALID;`
			`else`
			`p->nhflags &= ~NEXTHOP_VALID;`

			`prefix_insert(p, NULL, re);`
			`newbest = prefix_best(re);`

			`/*`
			`* If the active prefix changed or the active prefix was removed`
			`* and added again then generate an update.`
			`*/`
			`if (oldbest != newbest \|\| newbest == p) {`
			`/*`
			`* Send update withdrawing oldbest and adding newbest`
			`* but remember that newbest may be NULL aka ineligible.`
			`* Additional decision may be made by the called functions.`
			`*/`
			`if ((rib->flags & F_RIB_NOFIB) == 0)`
			`rde_send_kroute(rib, newbest, oldbest);`
			`rde_generate_updates(re, p, p, EVAL_DEFAULT);`
			`return;`
			`}`

			`/*`
			`* If there are peers with 'rde evaluate all' every update needs`
			`* to be passed on (not only a change of the best prefix).`
			`* rde_generate_updates() will then take care of distribution.`
			`*/`
			`if (rde_evaluate_all())`
			`rde_generate_updates(re, p, p, EVAL_ALL);`
			`}`