From a97719482dd97b6690bed7276136b54af7f22a21 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Tue, 22 Feb 2005 13:04:05 +0000 Subject: [PATCH] Add CARP (Common Address Redundancy Protocol), which allows multiple hosts to share an IP address, providing high availability and load balancing. Original work on CARP done by Michael Shalayeff, with many additions by Marco Pfatschbacher and Ryan McBride. FreeBSD port done solely by Max Laier. Patch by: mlaier Obtained from: OpenBSD (mickey, mcbride) --- etc/protocols | 3 +- sbin/ifconfig/Makefile | 3 + sbin/ifconfig/ifcarp.c | 199 ++++ sbin/ifconfig/ifpfsync.c | 144 +++ sys/conf/files | 2 + sys/conf/options | 1 + sys/net/if.c | 18 + sys/net/if_ethersubr.c | 26 + sys/net/if_media.h | 6 + sys/net/if_types.h | 1 + sys/net/if_var.h | 3 +- sys/netinet/if_ether.c | 39 +- sys/netinet/if_ether.h | 1 + sys/netinet/in.h | 2 + sys/netinet/in_proto.c | 16 + sys/netinet/ip_carp.c | 2032 +++++++++++++++++++++++++++++++++++ sys/netinet/ip_carp.h | 163 +++ sys/netinet/ip_input.c | 11 + sys/netinet6/in6.c | 37 +- sys/netinet6/in6_ifattach.c | 1 + sys/netinet6/in6_proto.c | 13 + sys/netinet6/in6_var.h | 4 + sys/netinet6/nd6.c | 3 + sys/netinet6/nd6_nbr.c | 29 +- sys/sys/mbuf.h | 1 + usr.bin/netstat/inet.c | 45 + usr.bin/netstat/main.c | 4 + usr.bin/netstat/netstat.h | 1 + 28 files changed, 2796 insertions(+), 12 deletions(-) create mode 100644 sbin/ifconfig/ifcarp.c create mode 100644 sbin/ifconfig/ifpfsync.c create mode 100644 sys/netinet/ip_carp.c create mode 100644 sys/netinet/ip_carp.h diff --git a/etc/protocols b/etc/protocols index f79362b56461..7c3654f17969 100644 --- a/etc/protocols +++ b/etc/protocols @@ -119,7 +119,7 @@ ipcomp 108 IPComp # IP Payload Compression Protocol snp 109 SNP # Sitara Networks Protocol compaq-peer 110 Compaq-Peer # Compaq Peer Protocol ipx-in-ip 111 IPX-in-IP # IPX in IP -vrrp 112 VRRP # Virtual Router Redundancy Protocol +carp 112 CARP vrrp # Common Address Redundancy Protocol pgm 113 PGM # PGM Reliable Transport Protocol # 114 # any 0-hop protocol l2tp 115 L2TP # Layer Two Tunneling Protocol @@ -142,5 +142,6 @@ pipe 131 PIPE # Private IP Encapsulation within IP sctp 132 SCTP # Stream Control Transmission Protocol fc 133 FC # Fibre Channel # 134-254 # Unassigned +pfsync 240 PFSYNC # PF Synchronization # 255 # Reserved divert 258 DIVERT # Divert pseudo-protocol [non IANA] diff --git a/sbin/ifconfig/Makefile b/sbin/ifconfig/Makefile index 117441bffb74..6b301e9b1e91 100644 --- a/sbin/ifconfig/Makefile +++ b/sbin/ifconfig/Makefile @@ -23,6 +23,9 @@ SRCS+= ifmedia.c # SIOC[GS]IFMEDIA support SRCS+= ifvlan.c # SIOC[GS]ETVLAN support SRCS+= ifieee80211.c # SIOC[GS]IEEE80211 support +SRCS+= ifcarp.c # SIOC[GS]VH support +SRCS+= ifpfsync.c # pfsync(4) support + .if !defined(RELEASE_CRUNCH) SRCS+= af_ipx.c # IPX support DPADD= ${LIBIPX} diff --git a/sbin/ifconfig/ifcarp.c b/sbin/ifconfig/ifcarp.c new file mode 100644 index 000000000000..ab6d152d9bb0 --- /dev/null +++ b/sbin/ifconfig/ifcarp.c @@ -0,0 +1,199 @@ +/* $FreeBSD$ */ +/* from $OpenBSD: ifconfig.c,v 1.82 2003/10/19 05:43:35 mcbride Exp $ */ + +/* + * Copyright (c) 2002 Michael Shalayeff. All rights reserved. + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ifconfig.h" + +static const char *carp_states[] = { CARP_STATES }; + +void carp_status(int s, const struct rt_addrinfo *); +void setcarp_advbase(const char *,int, int, const struct afswtch *rafp); +void setcarp_advskew(const char *, int, int, const struct afswtch *rafp); +void setcarp_passwd(const char *, int, int, const struct afswtch *rafp); +void setcarp_vhid(const char *, int, int, const struct afswtch *rafp); + +void +carp_status(int s, const struct rt_addrinfo *info __unused) +{ + const char *state; + struct carpreq carpr; + + memset((char *)&carpr, 0, sizeof(struct carpreq)); + ifr.ifr_data = (caddr_t)&carpr; + + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + return; + + if (carpr.carpr_vhid > 0) { + if (carpr.carpr_state > CARP_MAXSTATE) + state = ""; + else + state = carp_states[carpr.carpr_state]; + + printf("\tcarp: %s vhid %d advbase %d advskew %d\n", + state, carpr.carpr_vhid, carpr.carpr_advbase, + carpr.carpr_advskew); + } + + return; + +} + +void +setcarp_passwd(const char *val, int d, int s, const struct afswtch *afp) +{ + struct carpreq carpr; + + memset((char *)&carpr, 0, sizeof(struct carpreq)); + ifr.ifr_data = (caddr_t)&carpr; + + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + err(1, "SIOCGVH"); + + /* XXX Should hash the password into the key here, perhaps? */ + strlcpy(carpr.carpr_key, val, CARP_KEY_LEN); + + if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) + err(1, "SIOCSVH"); + + return; +} + +void +setcarp_vhid(const char *val, int d, int s, const struct afswtch *afp) +{ + int vhid; + struct carpreq carpr; + + vhid = atoi(val); + + if (vhid <= 0) + errx(1, "vhid must be greater than 0"); + + memset((char *)&carpr, 0, sizeof(struct carpreq)); + ifr.ifr_data = (caddr_t)&carpr; + + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + err(1, "SIOCGVH"); + + carpr.carpr_vhid = vhid; + + if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) + err(1, "SIOCSVH"); + + return; +} + +void +setcarp_advskew(const char *val, int d, int s, const struct afswtch *afp) +{ + int advskew; + struct carpreq carpr; + + advskew = atoi(val); + + memset((char *)&carpr, 0, sizeof(struct carpreq)); + ifr.ifr_data = (caddr_t)&carpr; + + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + err(1, "SIOCGVH"); + + carpr.carpr_advskew = advskew; + + if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) + err(1, "SIOCSVH"); + + return; +} + +void +setcarp_advbase(const char *val, int d, int s, const struct afswtch *afp) +{ + int advbase; + struct carpreq carpr; + + advbase = atoi(val); + + memset((char *)&carpr, 0, sizeof(struct carpreq)); + ifr.ifr_data = (caddr_t)&carpr; + + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + err(1, "SIOCGVH"); + + carpr.carpr_advbase = advbase; + + if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) + err(1, "SIOCSVH"); + + return; +} + +static struct cmd carp_cmds[] = { + DEF_CMD_ARG("advbase", setcarp_advbase), + DEF_CMD_ARG("advskew", setcarp_advskew), + DEF_CMD_ARG("pass", setcarp_passwd), + DEF_CMD_ARG("vhid", setcarp_vhid), +}; +static struct afswtch af_carp = { + .af_name = "af_carp", + .af_af = AF_UNSPEC, + .af_status = carp_status, +}; + +static __constructor void +carp_ctor(void) +{ +#define N(a) (sizeof(a) / sizeof(a[0])) + int i; + + for (i = 0; i < N(carp_cmds); i++) + cmd_register(&carp_cmds[i]); + af_register(&af_carp); +#undef N +} diff --git a/sbin/ifconfig/ifpfsync.c b/sbin/ifconfig/ifpfsync.c new file mode 100644 index 000000000000..7ca9883519a8 --- /dev/null +++ b/sbin/ifconfig/ifpfsync.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * Copyright (c) 2004 Max Laier. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ifconfig.h" + +void setpfsync_syncif(const char *, int, int, const struct afswtch *rafp); +void unsetpfsync_syncif(const char *, int, int, const struct afswtch *rafp); +void setpfsync_maxupd(const char *, int, int, const struct afswtch *rafp); +void pfsync_status(int, const struct rt_addrinfo *); + +void +setpfsync_syncif(const char *val, int d, int s, const struct afswtch *rafp) +{ + struct pfsyncreq preq; + + bzero((char *)&preq, sizeof(struct pfsyncreq)); + ifr.ifr_data = (caddr_t)&preq; + + if (ioctl(s, SIOCGETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCGETPFSYNC"); + + strlcpy(preq.pfsyncr_syncif, val, sizeof(preq.pfsyncr_syncif)); + + if (ioctl(s, SIOCSETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCSETPFSYNC"); +} + +void +unsetpfsync_syncif(const char *val, int d, int s, const struct afswtch *rafp) +{ + struct pfsyncreq preq; + + bzero((char *)&preq, sizeof(struct pfsyncreq)); + ifr.ifr_data = (caddr_t)&preq; + + if (ioctl(s, SIOCGETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCGETPFSYNC"); + + bzero((char *)&preq.pfsyncr_syncif, sizeof(preq.pfsyncr_syncif)); + + if (ioctl(s, SIOCSETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCSETPFSYNC"); +} + +void +setpfsync_maxupd(const char *val, int d, int s, const struct afswtch *rafp) +{ + int maxupdates; + struct pfsyncreq preq; + + maxupdates = atoi(val); + + memset((char *)&preq, 0, sizeof(struct pfsyncreq)); + ifr.ifr_data = (caddr_t)&preq; + + if (ioctl(s, SIOCGETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCGETPFSYNC"); + + preq.pfsyncr_maxupdates = maxupdates; + + if (ioctl(s, SIOCSETPFSYNC, (caddr_t)&ifr) == -1) + err(1, "SIOCSETPFSYNC"); +} + +void +pfsync_status(int s, const struct rt_addrinfo *info __unused) +{ + struct pfsyncreq preq; + + bzero((char *)&preq, sizeof(struct pfsyncreq)); + ifr.ifr_data = (caddr_t)&preq; + + if (ioctl(s, SIOCGETPFSYNC, (caddr_t)&ifr) == -1) + return; + + if (preq.pfsyncr_syncif[0] != '\0') { + printf("\tpfsync: syncif: %s maxupd: %d\n", + preq.pfsyncr_syncif, preq.pfsyncr_maxupdates); + } +} + +static struct cmd pfsync_cmds[] = { + DEF_CMD_ARG("syncif", setpfsync_syncif), + DEF_CMD_ARG("maxupd", setpfsync_maxupd), + DEF_CMD("-syncif", 1, unsetpfsync_syncif), +}; +static struct afswtch af_pfsync = { + .af_name = "af_pfsync", + .af_af = AF_UNSPEC, + .af_status = pfsync_status, +}; + +static __constructor void +pfsync_ctor(void) +{ +#define N(a) (sizeof(a) / sizeof(a[0])) + int i; + + for (i = 0; i < N(pfsync_cmds); i++) + cmd_register(&pfsync_cmds[i]); + af_register(&af_pfsync); +#undef N +} diff --git a/sys/conf/files b/sys/conf/files index daa8a9f0f6d5..1a9c67453559 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -279,6 +279,7 @@ crypto/rijndael/rijndael-api-fst.c optional geom_bde crypto/rijndael/rijndael-api-fst.c optional random crypto/rijndael/rijndael-api.c optional ipsec crypto/rijndael/rijndael-api.c optional wlan_ccmp +crypto/sha1.c optional carp crypto/sha1.c optional netgraph_mppc_encryption crypto/sha1.c optional crypto crypto/sha1.c optional ipsec @@ -1483,6 +1484,7 @@ netinet/if_atm.c optional atm netinet/if_ether.c optional ether netinet/igmp.c optional inet netinet/in.c optional inet +netinet/ip_carp.c optional carp netinet/in_gif.c optional gif inet netinet/ip_gre.c optional gre inet netinet/ip_id.c optional inet diff --git a/sys/conf/options b/sys/conf/options index f104f9405703..008c932448aa 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -610,6 +610,7 @@ NDEVFSOVERFLOW opt_devfs.h DEV_BPF opt_bpf.h DEV_ISA opt_isa.h DEV_MCA opt_mca.h +DEV_CARP opt_carp.h DEV_SPLASH opt_splash.h EISA_SLOTS opt_eisa.h diff --git a/sys/net/if.c b/sys/net/if.c index b6505dfe4a1a..3c25986f2dc3 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -34,6 +34,7 @@ #include "opt_inet6.h" #include "opt_inet.h" #include "opt_mac.h" +#include "opt_carp.h" #include #include @@ -78,6 +79,9 @@ #ifdef INET #include #endif +#ifdef DEV_CARP +#include +#endif void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); @@ -529,6 +533,12 @@ if_detach(struct ifnet *ifp) int found; EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); +#ifdef DEV_CARP + /* Maybe hook to the generalized departure handler above?!? */ + if (ifp->if_carp) + carp_ifdetach(ifp); +#endif + /* * Remove routes and flush queues. */ @@ -933,6 +943,10 @@ if_unroute(struct ifnet *ifp, int flag, int fam) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); if_qflush(&ifp->if_snd); +#ifdef DEV_CARP + if (ifp->if_carp) + carp_carpdev_state(ifp->if_carp); +#endif rt_ifmsg(ifp); } @@ -951,6 +965,10 @@ if_route(struct ifnet *ifp, int flag, int fam) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); +#ifdef DEV_CARP + if (ifp->if_carp) + carp_carpdev_state(ifp->if_carp); +#endif rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 1b973fec4a8a..2c4ff99f4a65 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -37,6 +37,7 @@ #include "opt_bdg.h" #include "opt_mac.h" #include "opt_netgraph.h" +#include "opt_carp.h" #include #include @@ -73,6 +74,10 @@ #include #endif +#ifdef DEV_CARP +#include +#endif + #ifdef IPX #include #include @@ -315,6 +320,12 @@ ether_output(struct ifnet *ifp, struct mbuf *m, } } +#ifdef DEV_CARP + if (ifp->if_carp && + (error = carp_output(ifp, m, dst, NULL))) + goto bad; +#endif + /* Handle ng_ether(4) processing, if any */ if (IFP2AC(ifp)->ac_netgraph != NULL) { if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { @@ -606,6 +617,18 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) if (!(BDG_ACTIVE(ifp)) && !((ether_type == ETHERTYPE_VLAN || m->m_flags & M_VLANTAG) && ifp->if_nvlans > 0)) { +#ifdef DEV_CARP + /* + * XXX: Okay, we need to call carp_forus() and - if it is for us + * jump over code that does the normal check + * "ac_enaddr == ether_dhost". The check sequence is a bit + * different from OpenBSD, so we jump over as few code as possible, + * to catch _all_ sanity checks. This needs evaluation, to see if + * the carp ether_dhost values break any of these checks! + */ + if (ifp->if_carp && carp_forus(ifp->if_carp, eh->ether_dhost)) + goto pre_stats; +#endif /* * Discard packet if upper layers shouldn't see it because it * was unicast to a different Ethernet address. If the driver @@ -628,6 +651,9 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) } } +#ifdef DEV_CARP +pre_stats: +#endif /* Discard packet if interface is not up */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); diff --git a/sys/net/if_media.h b/sys/net/if_media.h index 68a785da9f16..4f7389b3db9b 100644 --- a/sys/net/if_media.h +++ b/sys/net/if_media.h @@ -226,6 +226,11 @@ int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, #define IFM_ATM_NOSCRAMB 0x00000200 /* no scrambling */ #define IFM_ATM_UNASSIGNED 0x00000400 /* unassigned cells */ +/* + * CARP Common Address Redundancy Protocol + */ +#define IFM_CARP 0x000000c0 + /* * Shared media sub-types */ @@ -299,6 +304,7 @@ struct ifmedia_description { { IFM_FDDI, "FDDI" }, \ { IFM_IEEE80211, "IEEE 802.11 Wireless Ethernet" }, \ { IFM_ATM, "ATM" }, \ + { IFM_CARP, "Common Address Redundancy Protocol" }, \ { 0, NULL }, \ } diff --git a/sys/net/if_types.h b/sys/net/if_types.h index 7af5bec09881..56dca4506331 100644 --- a/sys/net/if_types.h +++ b/sys/net/if_types.h @@ -247,4 +247,5 @@ #define IFT_FAITH 0xf2 #define IFT_PFLOG 0xf6 #define IFT_PFSYNC 0xf7 +#define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ #endif /* !_NET_IF_TYPES_H_ */ diff --git a/sys/net/if_var.h b/sys/net/if_var.h index d1d9fa2c21e9..78eea2c6bd3f 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -68,6 +68,7 @@ struct rtentry; struct rt_addrinfo; struct socket; struct ether_header; +struct carp_if; #endif #include /* get TAILQ macros */ @@ -146,7 +147,7 @@ struct ifnet { */ struct knlist if_klist; /* events attached to this if */ int if_pcount; /* number of promiscuous listeners */ - void *if_carp; /* carp (tbd) interface pointer */ + struct carp_if *if_carp; /* carp interface structure */ struct bpf_if *if_bpf; /* packet filter structure */ u_short if_index; /* numeric abbreviation for this if */ short if_timer; /* time 'til if_watchdog called */ diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 2f1c0375258d..1bc1d060afaf 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -39,6 +39,7 @@ #include "opt_inet.h" #include "opt_bdg.h" #include "opt_mac.h" +#include "opt_carp.h" #include #include @@ -67,6 +68,10 @@ #include #include +#ifdef DEV_CARP +#include +#endif + #define SIN(s) ((struct sockaddr_in *)s) #define SDL(s) ((struct sockaddr_dl *)s) @@ -545,6 +550,7 @@ in_arpinput(m) struct sockaddr_dl *sdl; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; + u_int8_t *enaddr = NULL; int op, rif_len; int req_len; @@ -563,10 +569,18 @@ in_arpinput(m) * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). + * If the interface does not match, but the recieving interface + * is part of carp, we call carp_iamatch to see if this is a + * request for the virtual host ip. + * XXX: This is really ugly! */ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) - if ((do_bridge || (ia->ia_ifp == ifp)) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) + if ((do_bridge || (ia->ia_ifp == ifp) +#ifdef DEV_CARP + || (ifp->if_carp + && carp_iamatch(ifp->if_carp, ia, &isaddr, &enaddr)) +#endif + ) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) goto match; LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if ((do_bridge || (ia->ia_ifp == ifp)) && @@ -587,8 +601,10 @@ in_arpinput(m) if (!do_bridge || (ia = TAILQ_FIRST(&in_ifaddrhead)) == NULL) goto drop; match: + if (!enaddr) + enaddr = (u_int8_t *)IF_LLADDR(ifp); myaddr = ia->ia_addr.sin_addr; - if (!bcmp(ar_sha(ah), IF_LLADDR(ifp), ifp->if_addrlen)) + if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { log(LOG_ERR, @@ -711,7 +727,7 @@ reply: if (itaddr.s_addr == myaddr.s_addr) { /* I am the target */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); - (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln); + (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { la = arplookup(itaddr.s_addr, 0, SIN_PROXY); if (la == NULL) { @@ -738,7 +754,7 @@ reply: goto drop; } (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); - (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln); + (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); rtfree(rt); /* @@ -880,6 +896,19 @@ arp_ifinit(ifp, ifa) ifa->ifa_flags |= RTF_CLONING; } +void +arp_ifinit2(ifp, ifa, enaddr) + struct ifnet *ifp; + struct ifaddr *ifa; + u_char *enaddr; +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ifp, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, enaddr); + ifa->ifa_rtrequest = arp_rtrequest; + ifa->ifa_flags |= RTF_CLONING; +} + static void arp_init(void) { diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h index 9b60492de78e..14df15fcc17c 100644 --- a/sys/netinet/if_ether.h +++ b/sys/netinet/if_ether.h @@ -112,6 +112,7 @@ extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; int arpresolve(struct ifnet *ifp, struct rtentry *rt, struct mbuf *m, struct sockaddr *dst, u_char *desten); void arp_ifinit(struct ifnet *, struct ifaddr *); +void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); #endif #endif diff --git a/sys/netinet/in.h b/sys/netinet/in.h index 20995bdef9af..45c269ca89ae 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -230,6 +230,7 @@ __END_DECLS #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ /* 255: Reserved */ @@ -357,6 +358,7 @@ __END_DECLS #define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */ #define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */ #define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 9ae32c4db3ae..103682fd1f1b 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -35,6 +35,7 @@ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_pf.h" +#include "opt_carp.h" #include #include @@ -90,6 +91,10 @@ #include #endif +#ifdef DEV_CARP +#include +#endif + extern struct domain inetdomain; /* Spacer for loadable protocols. */ @@ -237,6 +242,14 @@ struct protosw inetsw[] = { &rip_usrreqs }, #endif /* DEV_PFSYNC */ +#ifdef DEV_CARP +{ SOCK_RAW, &inetdomain, IPPROTO_CARP, PR_ATOMIC|PR_ADDR, + carp_input, (pr_output_t*)rip_output, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif /* DEV_CARP */ /* Spacer n-times for loadable protocols. */ IPPROTOSPACER, IPPROTOSPACER, @@ -290,3 +303,6 @@ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); #ifdef PIM SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); #endif +#ifdef DEV_CARP +SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); +#endif diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c new file mode 100644 index 000000000000..ec54ef706223 --- /dev/null +++ b/sys/netinet/ip_carp.c @@ -0,0 +1,2032 @@ +/* $FreeBSD$ */ + +/* + * Copyright (c) 2002 Michael Shalayeff. All rights reserved. + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_carp.h" +#include "opt_bpf.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#include +#include +#include +#endif + +#ifdef INET6 +#include +#include +#include +#include +#include +#endif + +#include +#include + +#define CARP_IFNAME "carp" +static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); +SYSCTL_DECL(_net_inet_carp); + +struct carp_softc { + struct arpcom sc_ac; /* Interface clue */ + int if_flags; /* UP/DOWN */ + struct ifnet *sc_ifp; /* Parent */ + struct in_ifaddr *sc_ia; /* primary iface address */ + struct ip_moptions sc_imo; +#ifdef INET6 + struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ + struct ip6_moptions sc_im6o; +#endif /* INET6 */ + TAILQ_ENTRY(carp_softc) sc_list; + + enum { INIT = 0, BACKUP, MASTER } sc_state; + + int sc_flags_backup; + int sc_suppress; + + int sc_sendad_errors; +#define CARP_SENDAD_MAX_ERRORS 3 + int sc_sendad_success; +#define CARP_SENDAD_MIN_SUCCESS 3 + + int sc_vhid; + int sc_advskew; + int sc_naddrs; + int sc_naddrs6; + int sc_advbase; /* seconds */ + int sc_init_counter; + u_int64_t sc_counter; + + /* authentication */ +#define CARP_HMAC_PAD 64 + unsigned char sc_key[CARP_KEY_LEN]; + unsigned char sc_pad[CARP_HMAC_PAD]; + SHA1_CTX sc_sha1; + + struct callout sc_ad_tmo; /* advertisement timeout */ + struct callout sc_md_tmo; /* master down timeout */ + struct callout sc_md6_tmo; /* master down timeout */ + + LIST_ENTRY(carp_softc) sc_next; /* Interface clue */ +}; +#define sc_if sc_ac.ac_if + +int carp_suppress_preempt = 0; +int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */ +SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW, + &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets"); +SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW, + &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode"); +SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW, + &carp_opts[CARPCTL_LOG], 0, "log bad carp packets"); +SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW, + &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses"); + +struct carpstats carpstats; +SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW, + &carpstats, carpstats, + "CARP statistics (struct carpstats, netinet/ip_carp.h)"); + +struct carp_if { + TAILQ_HEAD(, carp_softc) vhif_vrs; + int vhif_nvrs; + + struct ifnet *vhif_ifp; + struct mtx vhif_mtx; +}; +/* lock per carp_if queue */ +#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp", \ + NULL, MTX_DEF) +#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif->vhif_mtx)) +#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED) +#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx) +#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx) + +#define CARP_LOG(sc, s) \ + if (carp_opts[CARPCTL_LOG]) { \ + if (sc != NULL) \ + log(LOG_INFO, "%s: ", (sc)->sc_if.if_xname); \ + else \ + log(LOG_INFO, "carp: "); \ + printf s; \ +/* addlog s; addlog("\n"); */ \ +} + +void carp_hmac_prepare(struct carp_softc *); +void carp_hmac_generate(struct carp_softc *, u_int32_t *, + unsigned char *); +int carp_hmac_verify(struct carp_softc *, u_int32_t *, + unsigned char *); +void carp_setroute(struct carp_softc *, int); +void carp_input_c(struct mbuf *, struct carp_softc *, + struct carp_header *, sa_family_t); +int carp_clone_create(struct if_clone *, int); +void carp_clone_destroy(struct ifnet *); +void carpdetach(struct carp_softc *); +int carp_prepare_ad(struct mbuf *, struct carp_softc *, + struct carp_header *); +void carp_send_ad_all(void); +void carp_send_ad(void *); +void carp_send_arp(struct carp_softc *); +void carp_master_down(void *); +int carp_ioctl(struct ifnet *, u_long, caddr_t); +static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +void carp_start(struct ifnet *); +void carp_setrun(struct carp_softc *, sa_family_t); +void carp_set_state(struct carp_softc *, int); +int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); +enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; + +int carp_set_addr(struct carp_softc *, struct sockaddr_in *); +int carp_del_addr(struct carp_softc *, struct sockaddr_in *); +#ifdef INET6 +void carp_send_na(struct carp_softc *); +int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *); +int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *); +#endif + +static LIST_HEAD(, carp_softc) carpif_list; +IFC_SIMPLE_DECLARE(carp, 0); + +static __inline u_int16_t +carp_cksum(struct mbuf *m, int len) +{ + return (in_cksum(m, len)); +} + +void +carp_hmac_prepare(struct carp_softc *sc) +{ + u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + u_int8_t vhid = sc->sc_vhid & 0xff; + struct ifaddr *ifa; + int i; +#ifdef INET6 + struct in6_addr in6; +#endif + + /* compute ipad from key */ + bzero(sc->sc_pad, sizeof(sc->sc_pad)); + bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); + for (i = 0; i < sizeof(sc->sc_pad); i++) + sc->sc_pad[i] ^= 0x36; + + /* precompute first part of inner hash */ + SHA1Init(&sc->sc_sha1); + SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); + SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); + SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); + SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); +#ifdef INET + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET) + SHA1Update(&sc->sc_sha1, + (void *)&ifatoia(ifa)->ia_addr.sin_addr.s_addr, + sizeof(struct in_addr)); + } +#endif /* INET */ +#ifdef INET6 + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET6) { + in6 = ifatoia6(ifa)->ia_addr.sin6_addr; + if (IN6_IS_ADDR_LINKLOCAL(&in6)) + in6.s6_addr16[1] = 0; + SHA1Update(&sc->sc_sha1, (void *)&in6, sizeof(in6)); + } + } +#endif /* INET6 */ + + /* convert ipad to opad */ + for (i = 0; i < sizeof(sc->sc_pad); i++) + sc->sc_pad[i] ^= 0x36 ^ 0x5c; +} + +void +carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], + unsigned char md[20]) +{ + SHA1_CTX sha1ctx; + + /* fetch first half of inner hash */ + bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); + + SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); + SHA1Final(md, &sha1ctx); + + /* outer hash */ + SHA1Init(&sha1ctx); + SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); + SHA1Update(&sha1ctx, md, 20); + SHA1Final(md, &sha1ctx); +} + +int +carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], + unsigned char md[20]) +{ + unsigned char md2[20]; + + carp_hmac_generate(sc, counter, md2); + + return (bcmp(md, md2, sizeof(md2))); +} + +void +carp_setroute(struct carp_softc *sc, int cmd) +{ + struct ifaddr *ifa; + int s; + + s = splnet(); + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET && sc->sc_ifp != NULL) { + int count = carp_addrcount( + (struct carp_if *)sc->sc_ifp->if_carp, + ifatoia(ifa), CARP_COUNT_MASTER); + + if ((cmd == RTM_ADD && count == 1) || + (cmd == RTM_DELETE && count == 0)) + rtinit(ifa, cmd, RTF_UP | RTF_HOST); + } +#ifdef INET6 + if (ifa->ifa_addr->sa_family == AF_INET6) { + if (cmd == RTM_ADD) + in6_ifaddloop(ifa); + else + in6_ifremloop(ifa); + } +#endif /* INET6 */ + } + splx(s); +} + +int +carp_clone_create(struct if_clone *ifc, int unit) +{ + + struct carp_softc *sc; + struct ifnet *ifp; + + MALLOC(sc, struct carp_softc *, sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); + + sc->sc_flags_backup = 0; + sc->sc_suppress = 0; + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_vhid = -1; /* required setting */ + sc->sc_advskew = 0; + sc->sc_init_counter = 1; + sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */ +#ifdef INET6 + sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; +#endif + + callout_init(&sc->sc_ad_tmo, 0); + callout_init(&sc->sc_md_tmo, 0); + callout_init(&sc->sc_md6_tmo, 0); + + ifp = &sc->sc_if; + ifp->if_softc = sc; + if_initname(ifp, CARP_IFNAME, unit); + ifp->if_mtu = ETHERMTU; + ifp->if_flags = 0; + ifp->if_ioctl = carp_ioctl; + ifp->if_output = carp_looutput; + ifp->if_start = carp_start; + ifp->if_type = IFT_CARP; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = 0; + if_attach(ifp); + LIST_INSERT_HEAD(&carpif_list, sc, sc_next); + bpfattach(&sc->sc_if, DLT_LOOP, sizeof(u_int32_t)); + return (0); +} + +void +carp_clone_destroy(struct ifnet *ifp) +{ + struct carp_softc *sc = ifp->if_softc; + struct carp_if *cif; + struct ip_moptions *imo = &sc->sc_imo; +#ifdef INET6 + struct ip6_moptions *im6o = &sc->sc_im6o; +#endif + +/* carpdetach(sc); */ + + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + + if (imo->imo_num_memberships) { + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + imo->imo_multicast_ifp = NULL; + } +#ifdef INET6 + while (!LIST_EMPTY(&im6o->im6o_memberships)) { + struct in6_multi_mship *imm = + LIST_FIRST(&im6o->im6o_memberships); + LIST_REMOVE(imm, i6mm_chain); + in6_leavegroup(imm); + } + im6o->im6o_multicast_ifp = NULL; +#endif + + /* Remove ourself from parents if_carp queue */ + if (sc->sc_ifp && (cif = sc->sc_ifp->if_carp)) { + CARP_LOCK(cif); + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + sc->sc_ifp->if_carp = NULL; + CARP_LOCK_DESTROY(cif); + FREE(cif, M_CARP); + } else { + CARP_UNLOCK(cif); + } + } + + bpfdetach(ifp); + if_detach(ifp); + LIST_REMOVE(sc, sc_next); + free(sc, M_CARP); +} + +/* + * process input packet. + * we have rearranged checks order compared to the rfc, + * but it seems more efficient this way or not possible otherwise. + */ +void +carp_input(struct mbuf *m, int hlen) +{ + struct carp_softc *sc = NULL; + struct ip *ip = mtod(m, struct ip *); + struct carp_header *ch; + int iplen, len; + + carpstats.carps_ipackets++; + + if (!carp_opts[CARPCTL_ALLOW]) { + m_freem(m); + return; + } + + /* check if received on a valid carp interface */ + if (m->m_pkthdr.rcvif->if_carp == NULL) { + carpstats.carps_badif++; + CARP_LOG(sc, ("packet received on non-carp interface: %s", + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return; + } + + /* verify that the IP TTL is 255. */ + if (ip->ip_ttl != CARP_DFLTTL) { + carpstats.carps_badttl++; + CARP_LOG(sc, ("received ttl %d != 255i on %s", ip->ip_ttl, + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return; + } + + iplen = ip->ip_hl << 2; + + if (m->m_pkthdr.len < iplen + sizeof(*ch)) { + carpstats.carps_badlen++; + CARP_LOG(sc, ("received len %d < sizeof(struct carp_header)", + m->m_len - sizeof(struct ip))); + m_freem(m); + return; + } + + if (iplen + sizeof(*ch) < m->m_len) { + if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { + carpstats.carps_hdrops++; + /* CARP_LOG ? */ + return; + } + ip = mtod(m, struct ip *); + } + ch = (struct carp_header *)((char *)ip + iplen); + + /* + * verify that the received packet length is + * equal to the CARP header + */ + len = iplen + sizeof(*ch); + if (len > m->m_pkthdr.len) { + carpstats.carps_badlen++; + CARP_LOG(sc, ("packet too short %d on %s", m->m_pkthdr.len, + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return; + } + + if ((m = m_pullup(m, len)) == NULL) { + carpstats.carps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + ch = (struct carp_header *)((char *)ip + iplen); + + /* verify the CARP checksum */ + m->m_data += iplen; + if (carp_cksum(m, len - iplen)) { + carpstats.carps_badsum++; + CARP_LOG(sc, ("checksum failed on %s", + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return; + } + m->m_data -= iplen; + + carp_input_c(m, sc, ch, AF_INET); +} + +#ifdef INET6 +int +carp6_input(struct mbuf **mp, int *offp, int proto) +{ + struct mbuf *m = *mp; + struct carp_softc *sc = NULL; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct carp_header *ch; + u_int len; + + carpstats.carps_ipackets6++; + + if (!carp_opts[CARPCTL_ALLOW]) { + m_freem(m); + return (IPPROTO_DONE); + } + + /* check if received on a valid carp interface */ + if (m->m_pkthdr.rcvif->if_carp == NULL) { + carpstats.carps_badif++; + CARP_LOG(sc, ("packet received on non-carp interface: %s", + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return (IPPROTO_DONE); + } + + /* verify that the IP TTL is 255 */ + if (ip6->ip6_hlim != CARP_DFLTTL) { + carpstats.carps_badttl++; + CARP_LOG(sc, ("received ttl %d != 255 on %s", ip6->ip6_hlim, + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return (IPPROTO_DONE); + } + + /* verify that we have a complete carp packet */ + len = m->m_len; + IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); + if (ch == NULL) { + carpstats.carps_badlen++; + CARP_LOG(sc, ("packet size %u too small on %s", len, + m->m_pkthdr.rcvif->if_xname)); + return (IPPROTO_DONE); + } + + + /* verify the CARP checksum */ + m->m_data += *offp; + if (carp_cksum(m, sizeof(*ch))) { + carpstats.carps_badsum++; + CARP_LOG(sc, ("checksum failed, on %s", + m->m_pkthdr.rcvif->if_xname)); + m_freem(m); + return (IPPROTO_DONE); + } + m->m_data -= *offp; + + carp_input_c(m, sc, ch, AF_INET6); + return (IPPROTO_DONE); +} +#endif /* INET6 */ + +void +carp_input_c(struct mbuf *m, struct carp_softc *sc, + struct carp_header *ch, sa_family_t af) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + u_int64_t tmp_counter; + struct timeval sc_tv, ch_tv; + + /* verify that the VHID is valid on the receiving interface */ + CARP_LOCK(ifp->if_carp); + TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) + if (sc->sc_vhid == ch->carp_vhid) + break; + CARP_UNLOCK(ifp->if_carp); + if (!sc || (sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) != + (IFF_UP|IFF_RUNNING)) { + carpstats.carps_badvhid++; + m_freem(m); + return; + } + + getmicrotime(&sc->sc_if.if_lastchange); + sc->sc_if.if_ipackets++; + sc->sc_if.if_ibytes += m->m_pkthdr.len; + + if (sc->sc_if.if_bpf) { + /* + * We need to prepend the address family as + * a four byte field. Cons up a dummy header + * to pacify bpf. This is safe because bpf + * will only read from the mbuf (i.e., it won't + * try to free it or keep a pointer to it). + */ + struct mbuf m0; + struct ip *ip = mtod(m, struct ip *); + u_int32_t maf = htonl(af); + + m0.m_next = m; + m0.m_len = sizeof(maf); + m0.m_data = (char *)&maf; + /* BPF wants net byte order */ + ip->ip_len = htonl(ip->ip_len); + ip->ip_off = htonl(ip->ip_off); + BPF_MTAP(&sc->sc_if, &m0); + } + + /* verify the CARP version. */ + if (ch->carp_version != CARP_VERSION) { + carpstats.carps_badver++; + sc->sc_if.if_ierrors++; + CARP_LOG(sc, ("invalid version %d", ch->carp_version)); + m_freem(m); + return; + } + + /* verify the hash */ + if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { + carpstats.carps_badauth++; + sc->sc_if.if_ierrors++; + CARP_LOG(sc, ("incorrect hash")); + m_freem(m); + return; + } + + tmp_counter = ntohl(ch->carp_counter[0]); + tmp_counter = tmp_counter<<32; + tmp_counter += ntohl(ch->carp_counter[1]); + + /* XXX Replay protection goes here */ + + sc->sc_init_counter = 0; + sc->sc_counter = tmp_counter; + + sc_tv.tv_sec = sc->sc_advbase; + if (carp_suppress_preempt && sc->sc_advskew < 240) + sc_tv.tv_usec = 240 * 1000000 / 256; + else + sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256; + ch_tv.tv_sec = ch->carp_advbase; + ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; + + switch (sc->sc_state) { + case INIT: + break; + case MASTER: + /* + * If we receive an advertisement from a master who's going to + * be more frequent than us, go into BACKUP state. + */ + if (timevalcmp(&sc_tv, &ch_tv, >) || + timevalcmp(&sc_tv, &ch_tv, ==)) { + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_setroute(sc, RTM_DELETE); + } + break; + case BACKUP: + /* + * If we're pre-empting masters who advertise slower than us, + * and this one claims to be slower, treat him as down. + */ + if (carp_opts[CARPCTL_PREEMPT] && + timevalcmp(&sc_tv, &ch_tv, <)) { + carp_master_down(sc); + break; + } + + /* + * If the master is going to advertise at such a low frequency + * that he's guaranteed to time out, we'd might as well just + * treat him as timed out now. + */ + sc_tv.tv_sec = sc->sc_advbase * 3; + if (timevalcmp(&sc_tv, &ch_tv, <)) { + carp_master_down(sc); + break; + } + + /* + * Otherwise, we reset the counter and wait for the next + * advertisement. + */ + carp_setrun(sc, af); + break; + } + + m_freem(m); + return; +} + +void +carpdetach(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + + while ((ifa = TAILQ_FIRST(&sc->sc_if.if_addrlist)) != NULL) + if (ifa->ifa_addr->sa_family == AF_INET) { + struct in_ifaddr *ia = ifatoia(ifa); + + carp_del_addr(sc, &ia->ia_addr); + + /* ripped screaming from in_control(SIOCDIFADDR) */ + in_ifscrub(&sc->sc_if, ia); + TAILQ_REMOVE(&sc->sc_if.if_addrlist, ifa, ifa_link); + TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + IFAFREE((&ia->ia_ifa)); + } +} + +/* Detach an interface from the carp. */ +void +carp_ifdetach(struct ifnet *ifp) +{ + struct carp_softc *sc; + struct carp_if *cif = (struct carp_if *)ifp->if_carp; + + CARP_LOCK(cif); + TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) + carpdetach(sc); + CARP_UNLOCK(cif); +} + +int +carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) +{ + struct m_tag *mtag; + struct ifnet *ifp = &sc->sc_if; + + if (sc->sc_init_counter) { + /* this could also be seconds since unix epoch */ + sc->sc_counter = arc4random(); + sc->sc_counter = sc->sc_counter << 32; + sc->sc_counter += arc4random(); + } else + sc->sc_counter++; + + ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); + ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); + + carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); + + /* Tag packet for carp_output */ + mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + sc->sc_if.if_oerrors++; + return (ENOMEM); + } + bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + return (0); +} + +void +carp_send_ad_all(void) +{ + struct ifnet *ifp; + struct carp_if *cif; + struct carp_softc *vh; + + TAILQ_FOREACH(ifp, &ifnet, if_list) { + if (ifp->if_carp == NULL) + continue; + + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((vh->sc_ac.ac_if.if_flags & (IFF_UP|IFF_RUNNING)) && + vh->sc_state == MASTER) + carp_send_ad(vh); + } + CARP_UNLOCK(cif); + } +} + +void +carp_send_ad(void *v) +{ + struct carp_header ch; + struct timeval tv; + struct carp_softc *sc = v; + struct carp_header *ch_ptr; + struct mbuf *m; + int len, advbase, advskew; + + /* bow out if we've lost our UPness or RUNNINGuiness */ + if ((sc->sc_if.if_flags & + (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) { + advbase = 255; + advskew = 255; + } else { + advbase = sc->sc_advbase; + if (!carp_suppress_preempt || sc->sc_advskew > 240) + advskew = sc->sc_advskew; + else + advskew = 240; + tv.tv_sec = advbase; + tv.tv_usec = advskew * 1000000 / 256; + } + + ch.carp_version = CARP_VERSION; + ch.carp_type = CARP_ADVERTISEMENT; + ch.carp_vhid = sc->sc_vhid; + ch.carp_advbase = advbase; + ch.carp_advskew = advskew; + ch.carp_authlen = 7; /* XXX DEFINE */ + ch.carp_pad1 = 0; /* must be zero */ + ch.carp_cksum = 0; + +#ifdef INET + if (sc->sc_ia) { + struct ip *ip; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + sc->sc_ac.ac_if.if_oerrors++; + carpstats.carps_onomem++; + /* XXX maybe less ? */ + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + return; + } + len = sizeof(*ip) + sizeof(ch); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + MH_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; + ip->ip_len = len; + ip->ip_id = ip_newid(); + ip->ip_off = IP_DF; + ip->ip_ttl = CARP_DFLTTL; + ip->ip_p = IPPROTO_CARP; + ip->ip_sum = 0; + ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; + ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); + + ch_ptr = (struct carp_header *)(&ip[1]); + bcopy(&ch, ch_ptr, sizeof(ch)); + if (carp_prepare_ad(m, sc, ch_ptr)) + return; + + m->m_data += sizeof(*ip); + ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); + m->m_data -= sizeof(*ip); + + getmicrotime(&sc->sc_if.if_lastchange); + sc->sc_ac.ac_if.if_opackets++; + sc->sc_ac.ac_if.if_obytes += len; + carpstats.carps_opackets++; + + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { + sc->sc_if.if_oerrors++; + if (sc->sc_sendad_errors < INT_MAX) + sc->sc_sendad_errors++; + if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { + carp_suppress_preempt++; + if (carp_suppress_preempt == 1) + carp_send_ad_all(); + } + sc->sc_sendad_success = 0; + } else { + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { + if (++sc->sc_sendad_success >= + CARP_SENDAD_MIN_SUCCESS) { + carp_suppress_preempt--; + sc->sc_sendad_errors = 0; + } + } else + sc->sc_sendad_errors = 0; + } + } +#endif /* INET */ +#ifdef INET6 + if (sc->sc_ia6) { + struct ip6_hdr *ip6; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + sc->sc_ac.ac_if.if_oerrors++; + carpstats.carps_onomem++; + /* XXX maybe less ? */ + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + return; + } + len = sizeof(*ip6) + sizeof(ch); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + MH_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip6 = mtod(m, struct ip6_hdr *); + bzero(ip6, sizeof(*ip6)); + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_hlim = CARP_DFLTTL; + ip6->ip6_nxt = IPPROTO_CARP; + bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, + sizeof(struct in6_addr)); + /* set the multicast destination */ + + ip6->ip6_dst.s6_addr8[0] = 0xff; + ip6->ip6_dst.s6_addr8[1] = 0x02; + ip6->ip6_dst.s6_addr8[15] = 0x12; + + ch_ptr = (struct carp_header *)(&ip6[1]); + bcopy(&ch, ch_ptr, sizeof(ch)); + if (carp_prepare_ad(m, sc, ch_ptr)) + return; + + m->m_data += sizeof(*ip6); + ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); + m->m_data -= sizeof(*ip6); + + getmicrotime(&sc->sc_if.if_lastchange); + sc->sc_if.if_opackets++; + sc->sc_if.if_obytes += len; + carpstats.carps_opackets6++; + + if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { + sc->sc_if.if_oerrors++; + if (sc->sc_sendad_errors < INT_MAX) + sc->sc_sendad_errors++; + if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { + carp_suppress_preempt++; + if (carp_suppress_preempt == 1) + carp_send_ad_all(); + } + sc->sc_sendad_success = 0; + } else { + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { + if (++sc->sc_sendad_success >= + CARP_SENDAD_MIN_SUCCESS) { + carp_suppress_preempt--; + sc->sc_sendad_errors = 0; + } + } else + sc->sc_sendad_errors = 0; + } + } +#endif /* INET6 */ + + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + +} + +/* + * Broadcast a gratuitous ARP request containing + * the virtual router MAC address for each IP address + * associated with the virtual router. + */ +void +carp_send_arp(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + +/* arprequest(sc->sc_ifp, &in, &in, sc->sc_ac.ac_enaddr); */ + arp_ifinit2(sc->sc_ifp, ifa, sc->sc_ac.ac_enaddr); + + DELAY(1000); /* XXX */ + } +} + +#ifdef INET6 +void +carp_send_na(struct carp_softc *sc) +{ + struct ifaddr *ifa; + struct in6_addr *in6; + static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; + + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + + in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; + nd6_na_output(sc->sc_ifp, &mcast, in6, + ND_NA_FLAG_OVERRIDE, 1, NULL); + DELAY(1000); /* XXX */ + } +} +#endif /* INET6 */ + +int +carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) +{ + struct carp_softc *vh; + struct ifaddr *ifa; + int count = 0; + + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((type == CARP_COUNT_RUNNING && + (vh->sc_ac.ac_if.if_flags & (IFF_UP|IFF_RUNNING)) == + (IFF_UP|IFF_RUNNING)) || + (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { + TAILQ_FOREACH(ifa, &vh->sc_ac.ac_if.if_addrlist, + ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET && + ia->ia_addr.sin_addr.s_addr == + ifatoia(ifa)->ia_addr.sin_addr.s_addr) + count++; + } + } + } + return (count); +} + +int +carp_iamatch(void *v, struct in_ifaddr *ia, + struct in_addr *isaddr, u_int8_t **enaddr) +{ + struct carp_if *cif = v; + struct carp_softc *vh; + int index, count = 0; + struct ifaddr *ifa; + + CARP_LOCK(cif); + + if (carp_opts[CARPCTL_ARPBALANCE]) { + /* + * XXX proof of concept implementation. + * We use the source ip to decide which virtual host should + * handle the request. If we're master of that virtual host, + * then we respond, otherwise, just drop the arp packet on + * the floor. + */ + count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); + if (count == 0) { + /* should never reach this */ + CARP_UNLOCK(cif); + return (0); + } + + /* this should be a hash, like pf_hash() */ + index = isaddr->s_addr % count; + count = 0; + + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((vh->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) == + (IFF_UP|IFF_RUNNING)) { + TAILQ_FOREACH(ifa, &vh->sc_if.if_addrlist, + ifa_list) { + if (ifa->ifa_addr->sa_family == + AF_INET && + ia->ia_addr.sin_addr.s_addr == + ifatoia(ifa)->ia_addr.sin_addr.s_addr) { + if (count == index) { + if (vh->sc_state == + MASTER) { + *enaddr = vh->sc_ac.ac_enaddr; + CARP_UNLOCK(cif); + return (1); + } else { + CARP_UNLOCK(cif); + return (0); + } + } + count++; + } + } + } + } + } else { + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((vh->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) == + (IFF_UP|IFF_RUNNING) && ia->ia_ifp == + &vh->sc_if) { + *enaddr = vh->sc_ac.ac_enaddr; + CARP_UNLOCK(cif); + return (1); + } + } + } + CARP_UNLOCK(cif); + return (0); +} + +#ifdef INET6 +struct ifaddr * +carp_iamatch6(void *v, struct in6_addr *taddr) +{ + struct carp_if *cif = v; + struct carp_softc *vh; + struct ifaddr *ifa; + + CARP_LOCK(cif); + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + TAILQ_FOREACH(ifa, &vh->sc_if.if_addrlist, ifa_list) { + if (IN6_ARE_ADDR_EQUAL(taddr, + &ifatoia6(ifa)->ia_addr.sin6_addr) && + ((vh->sc_if.if_flags & + (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))) { + CARP_UNLOCK(cif); + return (ifa); + } + } + } + CARP_UNLOCK(cif); + + return (NULL); +} + +void * +carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr) +{ + struct m_tag *mtag; + struct carp_if *cif = v; + struct carp_softc *sc; + struct ifaddr *ifa; + + CARP_LOCK(cif); + TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { + TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) { + if (IN6_ARE_ADDR_EQUAL(taddr, + &ifatoia6(ifa)->ia_addr.sin6_addr) && + ((sc->sc_if.if_flags & + (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))) { + struct ifnet *ifp = &sc->sc_if; + mtag = m_tag_get(PACKET_TAG_CARP, + sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) { + /* better a bit than nothing */ + CARP_UNLOCK(cif); + return (sc->sc_ac.ac_enaddr); + } + bcopy(&ifp, (caddr_t)(mtag + 1), + sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + CARP_UNLOCK(cif); + return (sc->sc_ac.ac_enaddr); + } + } + } + CARP_UNLOCK(cif); + + return (NULL); +} +#endif + +struct ifnet * +carp_forus(void *v, void *dhost) +{ + struct carp_if *cif = v; + struct carp_softc *vh; + u_int8_t *ena = dhost; + + if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) + return (NULL); + + CARP_LOCK(cif); + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) + if ((vh->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) == + (IFF_UP|IFF_RUNNING) && vh->sc_state == MASTER && + !bcmp(dhost, vh->sc_ac.ac_enaddr, ETHER_ADDR_LEN)) { + CARP_UNLOCK(cif); + return (&vh->sc_if); + } + + CARP_UNLOCK(cif); + return (NULL); +} + +void +carp_master_down(void *v) +{ + struct carp_softc *sc = v; + + switch (sc->sc_state) { + case INIT: + printf("%s: master_down event in INIT state\n", + sc->sc_if.if_xname); + break; + case MASTER: + break; + case BACKUP: + carp_set_state(sc, MASTER); + carp_send_ad(sc); + carp_send_arp(sc); +#ifdef INET6 + carp_send_na(sc); +#endif /* INET6 */ + carp_setrun(sc, 0); + carp_setroute(sc, RTM_ADD); + break; + } +} + +/* + * When in backup state, af indicates whether to reset the master down timer + * for v4 or v6. If it's set to zero, reset the ones which are already pending. + */ +void +carp_setrun(struct carp_softc *sc, sa_family_t af) +{ + struct timeval tv; + + if (sc->sc_if.if_flags & IFF_UP && + sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6)) + sc->sc_if.if_flags |= IFF_RUNNING; + else { + sc->sc_if.if_flags &= ~IFF_RUNNING; + carp_setroute(sc, RTM_DELETE); + return; + } + + switch (sc->sc_state) { + case INIT: + if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) { + carp_send_ad(sc); + carp_send_arp(sc); +#ifdef INET6 + carp_send_na(sc); +#endif /* INET6 */ + carp_set_state(sc, MASTER); + carp_setroute(sc, RTM_ADD); + } else { + carp_set_state(sc, BACKUP); + carp_setroute(sc, RTM_DELETE); + carp_setrun(sc, 0); + } + break; + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + tv.tv_sec = 3 * sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + switch (af) { +#ifdef INET + case AF_INET: + callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + carp_master_down, sc); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + carp_master_down, sc); + break; +#endif /* INET6 */ + default: + if (sc->sc_naddrs) + callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + carp_master_down, sc); + if (sc->sc_naddrs6) + callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + carp_master_down, sc); + break; + } + break; + case MASTER: + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + break; + } +} + +int +carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) +{ + struct ifnet *ifp; + struct carp_if *cif; + struct in_ifaddr *ia, *ia_if; + struct ip_moptions *imo = &sc->sc_imo; + struct in_addr addr; + u_long iaddr = htonl(sin->sin_addr.s_addr); + int own, error; + + if (sin->sin_addr.s_addr == 0) { + if (!(sc->sc_if.if_flags & IFF_UP)) + carp_set_state(sc, INIT); + if (sc->sc_naddrs) + sc->sc_if.if_flags |= IFF_UP; + carp_setrun(sc, 0); + return (0); + } + + /* we have to do it by hands to check we won't match on us */ + ia_if = NULL; own = 0; + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + /* and, yeah, we need a multicast-capable iface too */ + if (ia->ia_ifp != &sc->sc_if && + (ia->ia_ifp->if_flags & IFF_MULTICAST) && + (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { + if (!ia_if) + ia_if = ia; + if (sin->sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + own++; + } + } + + if (!ia_if) + return (EADDRNOTAVAIL); + + ia = ia_if; + ifp = ia->ia_ifp; + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || + (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) + return (EADDRNOTAVAIL); + + if (imo->imo_num_memberships == 0) { + addr.s_addr = htonl(INADDR_CARP_GROUP); + if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL) + return (ENOBUFS); + imo->imo_num_memberships++; + imo->imo_multicast_ifp = ifp; + imo->imo_multicast_ttl = CARP_DFLTTL; + imo->imo_multicast_loop = 0; + } + + if (!ifp->if_carp) { + + MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP, + M_WAITOK|M_ZERO); + if (!cif) { + error = ENOBUFS; + goto cleanup; + } + if ((error = ifpromisc(ifp, 1))) { + FREE(cif, M_CARP); + goto cleanup; + } + + CARP_LOCK_INIT(cif); + CARP_LOCK(cif); + cif->vhif_ifp = ifp; + TAILQ_INIT(&cif->vhif_vrs); + ifp->if_carp = cif; + + } else { + struct carp_softc *vr; + + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && vr->sc_vhid == sc->sc_vhid) { + CARP_UNLOCK(cif); + error = EINVAL; + goto cleanup; + } + } + sc->sc_ia = ia; + sc->sc_ifp = ifp; + + { /* XXX prevent endless loop if already in queue */ + struct carp_softc *vr, *after = NULL; + int myself = 0; + cif = (struct carp_if *)ifp->if_carp; + + /* XXX: cif should not change, right? So we still hold the lock */ + CARP_LOCK_ASSERT(cif); + + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { + if (vr == sc) + myself = 1; + if (vr->sc_vhid < sc->sc_vhid) + after = vr; + } + + if (!myself) { + /* We're trying to keep things in order */ + if (after == NULL) { + TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); + } else { + TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); + } + cif->vhif_nvrs++; + } + } + + CARP_UNLOCK(cif); + + sc->sc_naddrs++; + sc->sc_if.if_flags |= IFF_UP; + if (own) + sc->sc_advskew = 0; + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + + return (0); + +cleanup: + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + return (error); +} + +int +carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) +{ + int error = 0; + + if (!--sc->sc_naddrs) { + struct carp_if *cif = (struct carp_if *)sc->sc_ifp->if_carp; + struct ip_moptions *imo = &sc->sc_imo; + + callout_stop(&sc->sc_ad_tmo); + sc->sc_if.if_flags &= ~(IFF_UP|IFF_RUNNING); + sc->sc_vhid = -1; + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + imo->imo_multicast_ifp = NULL; + CARP_LOCK(cif); + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + sc->sc_ifp->if_carp = NULL; + CARP_LOCK_DESTROY(cif); + FREE(cif, M_IFADDR); + } else { + CARP_UNLOCK(cif); + } + } + + return (error); +} + +#ifdef INET6 +int +carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +{ + struct ifnet *ifp; + struct carp_if *cif; + struct in6_ifaddr *ia, *ia_if; + struct ip6_moptions *im6o = &sc->sc_im6o; + struct in6_multi_mship *imm; + struct sockaddr_in6 addr; + int own, error; + + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + if (!(sc->sc_if.if_flags & IFF_UP)) + carp_set_state(sc, INIT); + if (sc->sc_naddrs6) + sc->sc_if.if_flags |= IFF_UP; + carp_setrun(sc, 0); + return (0); + } + + /* we have to do it by hands to check we won't match on us */ + ia_if = NULL; own = 0; + for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + int i; + + for (i = 0; i < 4; i++) { + if ((sin6->sin6_addr.s6_addr32[i] & + ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != + (ia->ia_addr.sin6_addr.s6_addr32[i] & + ia->ia_prefixmask.sin6_addr.s6_addr32[i])) + break; + } + /* and, yeah, we need a multicast-capable iface too */ + if (ia->ia_ifp != &sc->sc_ac.ac_if && + (ia->ia_ifp->if_flags & IFF_MULTICAST) && + (i == 4)) { + if (!ia_if) + ia_if = ia; + if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &ia->ia_addr.sin6_addr)) + own++; + } + } + + if (!ia_if) + return (EADDRNOTAVAIL); + ia = ia_if; + ifp = ia->ia_ifp; + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || + (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) + return (EADDRNOTAVAIL); + + if (!sc->sc_naddrs6) { + im6o->im6o_multicast_ifp = ifp; + + /* join CARP multicast address */ + bzero(&addr, sizeof(addr)); + addr.sin6_family = AF_INET6; + addr.sin6_len = sizeof(addr); + addr.sin6_addr.s6_addr16[0] = htons(0xff02); + addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + addr.sin6_addr.s6_addr8[15] = 0x12; + if ((imm = in6_joingroup(ifp, &addr.sin6_addr, &error)) == NULL) + goto cleanup; + LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); + + /* join solicited multicast address */ + bzero(&addr.sin6_addr, sizeof(addr.sin6_addr)); + addr.sin6_addr.s6_addr16[0] = htons(0xff02); + addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + addr.sin6_addr.s6_addr32[1] = 0; + addr.sin6_addr.s6_addr32[2] = htonl(1); + addr.sin6_addr.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; + addr.sin6_addr.s6_addr8[12] = 0xff; + if ((imm = in6_joingroup(ifp, &addr.sin6_addr, &error)) == NULL) + goto cleanup; + LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); + } + + if (!ifp->if_carp) { + MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP, + M_WAITOK|M_ZERO); + if (!cif) { + error = ENOBUFS; + goto cleanup; + } + if ((error = ifpromisc(ifp, 1))) { + FREE(cif, M_CARP); + goto cleanup; + } + + CARP_LOCK_INIT(cif); + CARP_LOCK(cif); + cif->vhif_ifp = ifp; + TAILQ_INIT(&cif->vhif_vrs); + ifp->if_carp = cif; + + } else { + struct carp_softc *vr; + + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && vr->sc_vhid == sc->sc_vhid) { + CARP_UNLOCK(cif); + error = EINVAL; + goto cleanup; + } + } + sc->sc_ia6 = ia; + sc->sc_ifp = ifp; + + { /* XXX prevent endless loop if already in queue */ + struct carp_softc *vr, *after = NULL; + int myself = 0; + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK_ASSERT(cif); + + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { + if (vr == sc) + myself = 1; + if (vr->sc_vhid < sc->sc_vhid) + after = vr; + } + + if (!myself) { + /* We're trying to keep things in order */ + if (after == NULL) { + TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); + } else { + TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); + } + cif->vhif_nvrs++; + } + } + + CARP_UNLOCK(cif); + sc->sc_naddrs6++; + sc->sc_ac.ac_if.if_flags |= IFF_UP; + if (own) + sc->sc_advskew = 0; + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + + return (0); + +cleanup: + /* clean up multicast memberships */ + if (!sc->sc_naddrs6) { + while (!LIST_EMPTY(&im6o->im6o_memberships)) { + imm = LIST_FIRST(&im6o->im6o_memberships); + LIST_REMOVE(imm, i6mm_chain); + in6_leavegroup(imm); + } + } + return (error); +} + +int +carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +{ + int error = 0; + + if (!--sc->sc_naddrs6) { + struct carp_if *cif = (struct carp_if *)sc->sc_ifp->if_carp; + struct ip6_moptions *im6o = &sc->sc_im6o; + + callout_stop(&sc->sc_ad_tmo); + sc->sc_ac.ac_if.if_flags &= ~(IFF_UP|IFF_RUNNING); + sc->sc_vhid = -1; + CARP_LOCK(cif); + while (!LIST_EMPTY(&im6o->im6o_memberships)) { + struct in6_multi_mship *imm = + LIST_FIRST(&im6o->im6o_memberships); + + LIST_REMOVE(imm, i6mm_chain); + in6_leavegroup(imm); + } + im6o->im6o_multicast_ifp = NULL; + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + CARP_LOCK_DESTROY(cif); + sc->sc_ifp->if_carp = NULL; + FREE(cif, M_IFADDR); + } else + CARP_UNLOCK(cif); + } + + return (error); +} +#endif /* INET6 */ + +int +carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + struct carp_softc *sc = ifp->if_softc, *vr; + struct carpreq carpr; + struct ifaddr *ifa; + struct ifreq *ifr; + struct ifaliasreq *ifra; + int error = 0; + + ifa = (struct ifaddr *)addr; + ifra = (struct ifaliasreq *)addr; + ifr = (struct ifreq *)addr; + + switch (cmd) { + case SIOCSIFADDR: + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_if.if_flags |= IFF_UP; + bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, + sizeof(struct sockaddr)); + error = carp_set_addr(sc, satosin(ifa->ifa_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + sc->sc_if.if_flags |= IFF_UP; + error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCAIFADDR: + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_if.if_flags |= IFF_UP; + bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, + sizeof(struct sockaddr)); + error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + sc->sc_if.if_flags |= IFF_UP; + error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCDIFADDR: + sc->if_flags &= ~IFF_UP; + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCSIFFLAGS: + if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { + sc->if_flags &= ~IFF_UP; + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + if (sc->sc_state == MASTER) + carp_send_ad(sc); + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) { + sc->sc_if.if_flags |= IFF_UP; + carp_setrun(sc, 0); + } + break; + + case SIOCSVH: + if ((error = suser(curthread)) != 0) + break; + if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + break; + error = 1; + if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { + switch (carpr.carpr_state) { + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_setroute(sc, RTM_DELETE); + break; + case MASTER: + carp_master_down(sc); + break; + default: + break; + } + } + if (carpr.carpr_vhid > 0) { + if (carpr.carpr_vhid > 255) { + error = EINVAL; + break; + } + if (sc->sc_ifp) { + struct carp_if *cif; + cif = (struct carp_if *)sc->sc_ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && + vr->sc_vhid == carpr.carpr_vhid) { + CARP_UNLOCK(cif); + return EINVAL; + } + CARP_UNLOCK(cif); + } + sc->sc_vhid = carpr.carpr_vhid; + sc->sc_ac.ac_enaddr[0] = 0; + sc->sc_ac.ac_enaddr[1] = 0; + sc->sc_ac.ac_enaddr[2] = 0x5e; + sc->sc_ac.ac_enaddr[3] = 0; + sc->sc_ac.ac_enaddr[4] = 1; + sc->sc_ac.ac_enaddr[5] = sc->sc_vhid; + error--; + } + if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { + if (carpr.carpr_advskew >= 255) { + error = EINVAL; + break; + } + if (carpr.carpr_advbase > 255) { + error = EINVAL; + break; + } + sc->sc_advbase = carpr.carpr_advbase; + sc->sc_advskew = carpr.carpr_advskew; + error--; + } + bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); + if (error > 0) + error = EINVAL; + else { + error = 0; + carp_setrun(sc, 0); + } + break; + + case SIOCGVH: + bzero(&carpr, sizeof(carpr)); + carpr.carpr_state = sc->sc_state; + carpr.carpr_vhid = sc->sc_vhid; + carpr.carpr_advbase = sc->sc_advbase; + carpr.carpr_advskew = sc->sc_advskew; + if (suser(curthread) == 0) + bcopy(sc->sc_key, carpr.carpr_key, + sizeof(carpr.carpr_key)); + error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + break; + + default: + error = EINVAL; + } + + carp_hmac_prepare(sc); + return (error); +} + +/* + * XXX: this is looutput. We should eventually use it from there. + */ +static int +carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ + M_ASSERTPKTHDR(m); /* check if we have the packet header */ + + if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + m_freem(m); + return (rt->rt_flags & RTF_BLACKHOLE ? 0 : + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + } + + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; +#if 1 /* XXX */ + switch (dst->sa_family) { + case AF_INET: + case AF_INET6: + case AF_IPX: + case AF_APPLETALK: + break; + default: + printf("carp_looutput: af=%d unexpected\n", dst->sa_family); + m_freem(m); + return (EAFNOSUPPORT); + } +#endif + return(if_simloop(ifp, m, dst->sa_family, 0)); +} + +/* + * Start output on carp interface. This function should never be called. + */ +void +carp_start(struct ifnet *ifp) +{ +#ifdef DEBUG + printf("%s: start called\n", ifp->if_xname); +#endif +} + +int +carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, + struct rtentry *rt) +{ + struct m_tag *mtag; + struct carp_softc *sc; + struct ifnet *carp_ifp; + + if (!sa) + return (0); + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + break; +#endif /* INET6 */ + default: + return (0); + } + + mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); + if (mtag == NULL) + return (0); + + bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); + sc = carp_ifp->if_softc; + + /* Set the source MAC address to Virtual Router MAC Address */ + switch (ifp->if_type) { + case IFT_ETHER: { + struct ether_header *eh; + + eh = mtod(m, struct ether_header *); + eh->ether_shost[0] = 0; + eh->ether_shost[1] = 0; + eh->ether_shost[2] = 0x5e; + eh->ether_shost[3] = 0; + eh->ether_shost[4] = 1; + eh->ether_shost[5] = sc->sc_vhid; + } + break; + case IFT_FDDI: { + struct fddi_header *fh; + + fh = mtod(m, struct fddi_header *); + fh->fddi_shost[0] = 0; + fh->fddi_shost[1] = 0; + fh->fddi_shost[2] = 0x5e; + fh->fddi_shost[3] = 0; + fh->fddi_shost[4] = 1; + fh->fddi_shost[5] = sc->sc_vhid; + } + break; + case IFT_ISO88025: { + struct iso88025_header *th; + th = mtod(m, struct iso88025_header *); + th->iso88025_shost[0] = 3; + th->iso88025_shost[1] = 0; + th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1); + th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); + th->iso88025_shost[4] = 0; + th->iso88025_shost[5] = 0; + } + break; + default: + printf("%s: carp is not supported for this interface type\n", + ifp->if_xname); + return (EOPNOTSUPP); + } + + return (0); +} + +void +carp_set_state(struct carp_softc *sc, int state) +{ + if (sc->sc_state == state) + return; + + sc->sc_state = state; + switch (state) { + case BACKUP: + sc->sc_ac.ac_if.if_link_state = LINK_STATE_DOWN; + break; + case MASTER: + sc->sc_ac.ac_if.if_link_state = LINK_STATE_UP; + break; + default: + sc->sc_ac.ac_if.if_link_state = LINK_STATE_UNKNOWN; + break; + } + rt_ifmsg(&sc->sc_ac.ac_if); +} + +void +carp_carpdev_state(void *v) +{ + struct carp_if *cif = v; + struct carp_softc *sc; + + CARP_LOCK(cif); + TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { + if (sc->sc_ifp->if_link_state == LINK_STATE_DOWN || + !(sc->sc_ifp->if_flags & IFF_UP)) { + sc->sc_flags_backup = sc->sc_if.if_flags; + sc->sc_if.if_flags &= ~(IFF_UP|IFF_RUNNING); + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + if (!sc->sc_suppress) { + carp_suppress_preempt++; + if (carp_suppress_preempt == 1) + carp_send_ad_all(); + } + sc->sc_suppress = 1; + } else { + sc->sc_if.if_flags |= sc->sc_flags_backup; + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + if (sc->sc_suppress) + carp_suppress_preempt--; + sc->sc_suppress = 0; + } + } + CARP_UNLOCK(cif); +} + +static int +carp_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + LIST_INIT(&carpif_list); + if_clone_attach(&carp_cloner); + printf("carp: attached\n"); + break; + + case MOD_UNLOAD: + if_clone_detach(&carp_cloner); + while (!LIST_EMPTY(&carpif_list)) + carp_clone_destroy( + &LIST_FIRST(&carpif_list)->sc_if); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static moduledata_t carp_mod = { + "carp", + carp_modevent, + 0 +}; + +DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_carp.h b/sys/netinet/ip_carp.h new file mode 100644 index 000000000000..a050a88e3991 --- /dev/null +++ b/sys/netinet/ip_carp.h @@ -0,0 +1,163 @@ +/* $FreeBSD$ */ +/* $OpenBSD: ip_carp.h,v 1.8 2004/07/29 22:12:15 mcbride Exp $ */ + +/* + * Copyright (c) 2002 Michael Shalayeff. All rights reserved. + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_CARP_H +#define _IP_CARP_H + +/* + * The CARP header layout is as follows: + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Version| Type | VirtualHostID | AdvSkew | Auth Len | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Reserved | AdvBase | Checksum | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Counter (1) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Counter (2) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (1) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (2) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (3) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (4) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (5) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ + +struct carp_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif + u_int8_t carp_vhid; /* virtual host id */ + u_int8_t carp_advskew; /* advertisement skew */ + u_int8_t carp_authlen; /* size of counter+md, 32bit chunks */ + u_int8_t carp_pad1; /* reserved */ + u_int8_t carp_advbase; /* advertisement interval */ + u_int16_t carp_cksum; + u_int32_t carp_counter[2]; + unsigned char carp_md[20]; /* SHA1 HMAC */ +} __packed; + +#define CARP_DFLTTL 255 + +/* carp_version */ +#define CARP_VERSION 2 + +/* carp_type */ +#define CARP_ADVERTISEMENT 0x01 + +#define CARP_KEY_LEN 20 /* a sha1 hash of a passphrase */ + +/* carp_advbase */ +#define CARP_DFLTINTV 1 + +/* + * Statistics. + */ +struct carpstats { + uint64_t carps_ipackets; /* total input packets, IPv4 */ + uint64_t carps_ipackets6; /* total input packets, IPv6 */ + uint64_t carps_badif; /* wrong interface */ + uint64_t carps_badttl; /* TTL is not CARP_DFLTTL */ + uint64_t carps_hdrops; /* packets shorter than hdr */ + uint64_t carps_badsum; /* bad checksum */ + uint64_t carps_badver; /* bad (incl unsupp) version */ + uint64_t carps_badlen; /* data length does not match */ + uint64_t carps_badauth; /* bad authentication */ + uint64_t carps_badvhid; /* bad VHID */ + uint64_t carps_badaddrs; /* bad address list */ + + uint64_t carps_opackets; /* total output packets, IPv4 */ + uint64_t carps_opackets6; /* total output packets, IPv6 */ + uint64_t carps_onomem; /* no memory for an mbuf */ + uint64_t carps_ostates; /* total state updates sent */ + + uint64_t carps_preempt; /* if enabled, preemptions */ +}; + +/* + * Configuration structure for SIOCSVH SIOCGVH + */ +struct carpreq { + int carpr_state; +#define CARP_STATES "INIT", "BACKUP", "MASTER" +#define CARP_MAXSTATE 2 + int carpr_vhid; + int carpr_advskew; + int carpr_advbase; + unsigned char carpr_key[CARP_KEY_LEN]; +}; +#define SIOCSVH _IOWR('i', 245, struct ifreq) +#define SIOCGVH _IOWR('i', 246, struct ifreq) + +/* + * Names for CARP sysctl objects + */ +#define CARPCTL_ALLOW 1 /* accept incoming CARP packets */ +#define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */ +#define CARPCTL_LOG 3 /* log bad packets */ +#define CARPCTL_STATS 4 /* statistics (read-only) */ +#define CARPCTL_ARPBALANCE 5 /* balance arp responses */ +#define CARPCTL_MAXID 6 + +#define CARPCTL_NAMES { \ + { 0, 0 }, \ + { "allow", CTLTYPE_INT }, \ + { "preempt", CTLTYPE_INT }, \ + { "log", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "arpbalance", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +void carp_ifdetach (struct ifnet *); +void carp_carpdev_state(void *); +void carp_input (struct mbuf *, int); +int carp6_input (struct mbuf **, int *, int); +int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +int carp_iamatch (void *, struct in_ifaddr *, struct in_addr *, + u_int8_t **); +struct ifaddr *carp_iamatch6(void *, struct in6_addr *); +void *carp_macmatch6(void *, struct mbuf *, const struct in6_addr *); +struct ifnet *carp_forus (void *, void *); +#endif +#endif /* _IP_CARP_H */ diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 3021285bfde1..ecf79aef7041 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -35,6 +35,7 @@ #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_carp.h" #include #include @@ -66,6 +67,9 @@ #include #include #include +#ifdef DEV_CARP +#include +#endif #include @@ -509,10 +513,17 @@ passin: * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. + * + * XXX - This is the case for carp vhost IPs as well so we + * insert a workaround. If the packet got here, we already + * checked with carp_iamatch() and carp_forus(). */ checkif = ip_checkinterface && (ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && +#ifdef DEV_CARP + !m->m_pkthdr.rcvif->if_carp && +#endif (dchg == 0); /* diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 163dbfe8b245..7bd5e6464ada 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -206,7 +206,7 @@ in6_ifloop_request(int cmd, struct ifaddr *ifa) * rely on the cloning mechanism from the corresponding interface route * any more. */ -static void +void in6_ifaddloop(struct ifaddr *ifa) { struct rtentry *rt; @@ -226,7 +226,7 @@ in6_ifaddloop(struct ifaddr *ifa) * Remove loopback rtentry of ownaddr generated by in6_ifaddloop(), * if it exists. */ -static void +void in6_ifremloop(struct ifaddr *ifa) { struct in6_ifaddr *ia; @@ -1551,6 +1551,39 @@ in6_ifinit(ifp, ia, sin6, newhost) return (error); } +struct in6_multi_mship * +in6_joingroup(ifp, addr, errorp) + struct ifnet *ifp; + struct in6_addr *addr; + int *errorp; +{ + struct in6_multi_mship *imm; + + imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT); + if (!imm) { + *errorp = ENOBUFS; + return NULL; + } + imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp); + if (!imm->i6mm_maddr) { + /* *errorp is alrady set */ + free(imm, M_IPMADDR); + return NULL; + } + return imm; +} + +int +in6_leavegroup(imm) + struct in6_multi_mship *imm; +{ + + if (imm->i6mm_maddr) + in6_delmulti(imm->i6mm_maddr); + free(imm, M_IPMADDR); + return 0; +} + /* * Find an IPv6 interface link-local address specific to an interface. */ diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c index fe12e4bf1f34..8837a91ee375 100644 --- a/sys/netinet6/in6_ifattach.c +++ b/sys/netinet6/in6_ifattach.c @@ -671,6 +671,7 @@ in6_ifattach(ifp, altifp) #endif case IFT_PFLOG: case IFT_PFSYNC: + case IFT_CARP: return; } diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 129c5f3a57f1..790030383669 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -64,6 +64,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_carp.h" #include #include @@ -121,6 +122,10 @@ #endif #endif /* IPSEC */ +#ifdef DEV_CARP +#include +#endif + #ifdef FAST_IPSEC #include #define IPSEC @@ -241,6 +246,14 @@ struct ip6protosw inet6sw[] = { 0, 0, 0, 0, &rip6_usrreqs }, +#ifdef DEV_CARP +{ SOCK_RAW, &inet6domain, IPPROTO_CARP, PR_ATOMIC|PR_ADDR, + carp6_input, rip6_output, 0, rip6_ctloutput, + 0, + 0, 0, 0, 0, + &rip6_usrreqs +}, +#endif /* DEV_CARP */ /* raw wildcard */ { SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR, rip6_input, rip6_output, 0, rip6_ctloutput, diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h index c89a9b59b05c..05ec3d591816 100644 --- a/sys/netinet6/in6_var.h +++ b/sys/netinet6/in6_var.h @@ -578,6 +578,8 @@ do { \ struct in6_multi *in6_addmulti __P((struct in6_addr *, struct ifnet *, int *)); void in6_delmulti __P((struct in6_multi *)); +struct in6_multi_mship *in6_joingroup(struct ifnet *, struct in6_addr *, int *); +int in6_leavegroup(struct in6_multi_mship *); int in6_mask2len __P((struct in6_addr *, u_char *)); int in6_control __P((struct socket *, u_long, caddr_t, struct ifnet *, struct thread *)); @@ -604,6 +606,8 @@ int in6_prefix_ioctl __P((struct socket *, u_long, caddr_t, int in6_prefix_add_ifid __P((int, struct in6_ifaddr *)); void in6_prefix_remove_ifid __P((int, struct in6_ifaddr *)); void in6_purgeprefix __P((struct ifnet *)); +void in6_ifremloop(struct ifaddr *); +void in6_ifaddloop(struct ifaddr *); int in6_is_addr_deprecated __P((struct sockaddr_in6 *)); struct inpcb; diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index 2c639b0761e1..e274966d4b0e 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -2024,6 +2024,9 @@ nd6_need_cache(ifp) #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: +#endif +#ifdef IFT_CARP + case IFT_CARP: #endif case IFT_GIF: /* XXX need more cases? */ return (1); diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index 3abcf36faf7c..e428bd42aef6 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -32,6 +32,8 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_carp.h" #include #include @@ -59,6 +61,10 @@ #include #include +#ifdef DEV_CARP +#include +#endif + #include #define SDL(s) ((struct sockaddr_dl *)s) @@ -94,7 +100,7 @@ nd6_ns_input(m, off, icmp6len) struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; @@ -193,7 +199,14 @@ nd6_ns_input(m, off, icmp6len) * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ +#ifdef DEV_CARP + if (ifp->if_carp) + ifa = carp_iamatch6(ifp->if_carp, &taddr6); + if (!ifa) + ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); +#else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); +#endif /* (2) check. */ if (!ifa) { @@ -888,9 +901,16 @@ nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ - if (sdl0 == NULL) + if (sdl0 == NULL) { +#ifdef DEV_CARP + if (ifp->if_carp) + mac = carp_macmatch6(ifp->if_carp, m, taddr6); + if (mac == NULL) + mac = nd6_ifptomac(ifp); +#else mac = nd6_ifptomac(ifp); - else if (sdl0->sa_family == AF_LINK) { +#endif + } else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) @@ -942,6 +962,9 @@ nd6_ifptomac(ifp) #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: +#endif +#ifdef IFT_CARP + case IFT_CARP: #endif case IFT_ISO88025: return ((caddr_t)(ifp + 1)); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index c954f92088af..1621fa716252 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -664,6 +664,7 @@ struct mbuf *m_uiotombuf(struct uio *, int, int); #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_PF_TRANSLATE_LOCALHOST 26 /* PF translate localhost */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ +#define PACKET_TAG_CARP 28 /* CARP info */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index ea679f81af7a..6590848e904b 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef INET6 #include #endif /* INET6 */ @@ -525,6 +526,50 @@ udp_stats(u_long off __unused, const char *name, int af1 __unused) #undef p1a } +/* + * Dump CARP statistics structure. + */ +void +carp_stats(u_long off, const char *name, int af1 __unused) +{ + struct carpstats carpstat, zerostat; + size_t len = sizeof(struct carpstats); + + if (zflag) + memset(&zerostat, 0, len); + if (sysctlbyname("net.inet.carp.stats", &carpstat, &len, + zflag ? &zerostat : NULL, zflag ? len : 0) < 0) { + warn("sysctl: net.inet.carp.stats"); + return; + } + + printf("%s:\n", name); + +#define p(f, m) if (carpstat.f || sflag <= 1) \ + printf(m, (unsigned long long)carpstat.f, plural((int)carpstat.f)) +#define p2(f, m) if (carpstat.f || sflag <= 1) \ + printf(m, (unsigned long long)carpstat.f) + + p(carps_ipackets, "\t%llu packet%s received (IPv4)\n"); + p(carps_ipackets6, "\t%llu packet%s received (IPv6)\n"); + p(carps_badttl, "\t\t%llu packet%s discarded for wrong TTL\n"); + p(carps_hdrops, "\t\t%llu packet%s shorter than header\n"); + p(carps_badsum, "\t\t%llu discarded for bad checksum%s\n"); + p(carps_badver, "\t\t%llu discarded packet%s with a bad version\n"); + p2(carps_badlen, "\t\t%llu discarded because packet too short\n"); + p2(carps_badauth, "\t\t%llu discarded for bad authentication\n"); + p2(carps_badvhid, "\t\t%llu discarded for bad vhid\n"); + p2(carps_badaddrs, "\t\t%llu discarded because of a bad address list\n"); + p(carps_opackets, "\t%llu packet%s sent (IPv4)\n"); + p(carps_opackets6, "\t%llu packet%s sent (IPv6)\n"); + p2(carps_onomem, "\t\t%llu send failed due to mbuf memory error\n"); +#if notyet + p(carps_ostates, "\t\t%s state update%s sent\n"); +#endif +#undef p +#undef p2 +} + /* * Dump IP statistics structure. */ diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index e5cd2fdfa6a2..fc32e82acfa6 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -136,6 +136,8 @@ static struct nlist nl[] = { { "_mbuf_lowm" }, #define N_CLLO 32 { "_clust_lowm" }, +#define N_CARPSTAT 33 + { "_carpstats" }, { "" }, }; @@ -171,6 +173,8 @@ struct protox { bdg_stats, NULL, "bdg", 1 /* bridging... */ }, { -1, -1, 1, protopr, pim_stats, NULL, "pim", IPPROTO_PIM }, + { -1, N_CARPSTAT, 1, 0, + carp_stats, NULL, "carp", 0}, { -1, -1, 0, NULL, NULL, NULL, NULL, 0 } }; diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h index e2b3f291a6e5..3cafa26de4d2 100644 --- a/usr.bin/netstat/netstat.h +++ b/usr.bin/netstat/netstat.h @@ -71,6 +71,7 @@ void ip_stats(u_long, const char *, int); void icmp_stats(u_long, const char *, int); void igmp_stats(u_long, const char *, int); void pim_stats(u_long, const char *, int); +void carp_stats (u_long, const char *, int); #ifdef IPSEC void ipsec_stats(u_long, const char *, int); #endif