From 08b68b0e4c6b132127919cfbaf7275c727ca7843 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 16 Dec 2011 12:16:56 +0000 Subject: [PATCH] A major overhaul of the CARP implementation. The ip_carp.c was started from scratch, copying needed functionality from the old implementation on demand, with a thorough review of all code. The main change is that interface layer has been removed from the CARP. Now redundant addresses are configured exactly on the interfaces they run on. The CARP configuration itself is, as before, configured and read via SIOCSVH/SIOCGVH ioctls. A new prefix created with SIOCAIFADDR or SIOCAIFADDR_IN6 may now be configured to a particular virtual host id, which makes the prefix redundant. ifconfig(8) semantics have been changed too: now one doesn't need to clone carpXX interface, he/she should directly configure a vhid on an Ethernet interface. To supply vhid data from the kernel to an application the getifaddrs(3) function has been changed to pass ifam_data with each address. [1] The new implementation definitely closes all PRs related to carp(4) being an interface, and may close several others. It also allows running a single redundant IP per interface. Big thanks to Bjoern Zeeb for his help with inet6 part of patch, for idea on using ifam_data and for several rounds of reviewing! 
PR: kern/117000, kern/126945, kern/126714, kern/120130, kern/117448 Reviewed by: bz Submitted by: bz [1] --- UPDATING | 11 + lib/libc/net/getifaddrs.c | 2 +- sbin/ifconfig/af_inet.c | 5 +- sbin/ifconfig/af_inet6.c | 2 + sbin/ifconfig/ifcarp.c | 194 +-- sbin/ifconfig/ifconfig.8 | 46 +- sbin/ifconfig/ifconfig.c | 15 + sbin/ifconfig/ifconfig.h | 3 + share/man/man4/carp.4 | 246 ++-- sys/net/if.c | 22 +- sys/net/if.h | 5 +- sys/net/if_ethersubr.c | 2 +- sys/net/if_types.h | 1 - sys/net/if_var.h | 2 + sys/net/rtsock.c | 16 +- sys/netinet/if_ether.c | 32 +- sys/netinet/if_ether.h | 1 + sys/netinet/in.c | 63 +- sys/netinet/in_var.h | 3 + sys/netinet/ip_carp.c | 2438 +++++++++++++++-------------------- sys/netinet/ip_carp.h | 37 +- sys/netinet6/in6.c | 24 +- sys/netinet6/in6_ifattach.c | 1 - sys/netinet6/in6_var.h | 1 + sys/netinet6/nd6.c | 3 - sys/netinet6/nd6_nbr.c | 14 +- sys/sys/param.h | 2 +- 27 files changed, 1466 insertions(+), 1725 deletions(-) diff --git a/UPDATING b/UPDATING index 44f902d85a98..575a037f9317 100644 --- a/UPDATING +++ b/UPDATING @@ -22,6 +22,17 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 10.x IS SLOW: machines to maximize performance. (To disable malloc debugging, run ln -s aj /etc/malloc.conf.) +20111215: + The carp(4) facility has been changed significantly. Configuration + of the CARP protocol via ifconfig(8) has changed, as well as format + of CARP events submitted to devd(8) has changed. See manual pages + for more information. The arpbalance feature of carp(4) is currently + not supported anymore. + + Size of struct in_aliasreq, struct in6_aliasreq has changed. User + utilities using SIOCAIFADDR, SIOCAIFADDR_IN6, e.g. ifconfig(8), + need to be recompiled. + 20111122: The acpi_wmi(4) status device /dev/wmistat has been renamed to /dev/wmistat0. 
diff --git a/lib/libc/net/getifaddrs.c b/lib/libc/net/getifaddrs.c index 41ef3f464ab6..aada929c1e42 100644 --- a/lib/libc/net/getifaddrs.c +++ b/lib/libc/net/getifaddrs.c @@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$"); #define HAVE_IFM_DATA #endif -#if _BSDI_VERSION >= 199802 +#if (_BSDI_VERSION >= 199802) || (__FreeBSD_version >= 1000003) /* ifam_data is very specific to recent versions of bsdi */ #define HAVE_IFAM_DATA #endif diff --git a/sbin/ifconfig/af_inet.c b/sbin/ifconfig/af_inet.c index edb9b80f408d..bcd17c4f4957 100644 --- a/sbin/ifconfig/af_inet.c +++ b/sbin/ifconfig/af_inet.c @@ -84,8 +84,11 @@ in_status(int s __unused, const struct ifaddrs *ifa) if (ifa->ifa_flags & IFF_BROADCAST) { sin = (struct sockaddr_in *)ifa->ifa_broadaddr; if (sin != NULL && sin->sin_addr.s_addr != 0) - printf("broadcast %s", inet_ntoa(sin->sin_addr)); + printf("broadcast %s ", inet_ntoa(sin->sin_addr)); } + + print_vhid(ifa, " "); + putchar('\n'); } diff --git a/sbin/ifconfig/af_inet6.c b/sbin/ifconfig/af_inet6.c index e39c1c8cfc20..0731238bf9bc 100644 --- a/sbin/ifconfig/af_inet6.c +++ b/sbin/ifconfig/af_inet6.c @@ -307,6 +307,8 @@ in6_status(int s __unused, const struct ifaddrs *ifa) printf("infty "); } + print_vhid(ifa, " "); + putchar('\n'); } diff --git a/sbin/ifconfig/ifcarp.c b/sbin/ifconfig/ifcarp.c index 2306717e63b8..2c58fcb1e931 100644 --- a/sbin/ifconfig/ifcarp.c +++ b/sbin/ifconfig/ifcarp.c @@ -35,10 +35,11 @@ #include #include -#include #include +#include +#include +#include #include -#include #include #include @@ -52,127 +53,153 @@ static const char *carp_states[] = { CARP_STATES }; -void carp_status(int s); -void setcarp_advbase(const char *,int, int, const struct afswtch *rafp); -void setcarp_advskew(const char *, int, int, const struct afswtch *rafp); -void setcarp_passwd(const char *, int, int, const struct afswtch *rafp); -void setcarp_vhid(const char *, int, int, const struct afswtch *rafp); +static void carp_status(int s); +static void setcarp_vhid(const char *, 
int, int, const struct afswtch *rafp); +static void setcarp_callback(int, void *); +static void setcarp_advbase(const char *,int, int, const struct afswtch *rafp); +static void setcarp_advskew(const char *, int, int, const struct afswtch *rafp); +static void setcarp_passwd(const char *, int, int, const struct afswtch *rafp); -void +static int carpr_vhid = -1; +static int carpr_advskew = -1; +static int carpr_advbase = -1; +static int carpr_state = -1; +static unsigned char const *carpr_key; + +static void carp_status(int s) { - const char *state; - struct carpreq carpr; + struct carpreq carpr[CARP_MAXVHID]; + int i; - memset((char *)&carpr, 0, sizeof(struct carpreq)); + bzero(carpr, sizeof(struct carpreq) * CARP_MAXVHID); + carpr[0].carpr_count = CARP_MAXVHID; ifr.ifr_data = (caddr_t)&carpr; if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) return; - if (carpr.carpr_vhid > 0) { - if (carpr.carpr_state > CARP_MAXSTATE) - state = ""; + for (i = 0; i < carpr[0].carpr_count; i++) { + printf("\tcarp: %s vhid %d advbase %d advskew %d", + carp_states[carpr[i].carpr_state], carpr[i].carpr_vhid, + carpr[i].carpr_advbase, carpr[i].carpr_advskew); + if (printkeys && carpr[i].carpr_key[0] != '\0') + printf(" key \"%s\"\n", carpr[i].carpr_key); else - state = carp_states[carpr.carpr_state]; - - printf("\tcarp: %s vhid %d advbase %d advskew %d\n", - state, carpr.carpr_vhid, carpr.carpr_advbase, - carpr.carpr_advskew); + printf("\n"); } - - return; - } -void -setcarp_passwd(const char *val, int d, int s, const struct afswtch *afp) -{ - struct carpreq carpr; - - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; - - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); - - memset(carpr.carpr_key, 0, sizeof(carpr.carpr_key)); - /* XXX Should hash the password into the key here, perhaps? 
*/ - strlcpy(carpr.carpr_key, val, CARP_KEY_LEN); - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); - - return; -} - -void +static void setcarp_vhid(const char *val, int d, int s, const struct afswtch *afp) { - int vhid; + + carpr_vhid = atoi(val); + + if (carpr_vhid <= 0 || carpr_vhid > CARP_MAXVHID) + errx(1, "vhid must be greater than 0 and less than %u", + CARP_MAXVHID); + + switch (afp->af_af) { +#ifdef INET + case AF_INET: + { + struct in_aliasreq *ifra; + + ifra = (struct in_aliasreq *)afp->af_addreq; + ifra->ifra_vhid = carpr_vhid; + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_aliasreq *ifra; + + ifra = (struct in6_aliasreq *)afp->af_addreq; + ifra->ifra_vhid = carpr_vhid; + break; + } +#endif + default: + errx(1, "%s doesn't support carp(4)", afp->af_name); + } + + callback_register(setcarp_callback, NULL); +} + +static void +setcarp_callback(int s, void *arg __unused) +{ struct carpreq carpr; - vhid = atoi(val); - - if (vhid <= 0) - errx(1, "vhid must be greater than 0"); - - memset((char *)&carpr, 0, sizeof(struct carpreq)); + bzero(&carpr, sizeof(struct carpreq)); + carpr.carpr_vhid = carpr_vhid; + carpr.carpr_count = 1; ifr.ifr_data = (caddr_t)&carpr; - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1 && errno != ENOENT) err(1, "SIOCGVH"); - carpr.carpr_vhid = vhid; + if (carpr_key != NULL) + /* XXX Should hash the password into the key here? 
*/ + strlcpy(carpr.carpr_key, carpr_key, CARP_KEY_LEN); + if (carpr_advskew > -1) + carpr.carpr_advskew = carpr_advskew; + if (carpr_advbase > -1) + carpr.carpr_advbase = carpr_advbase; + if (carpr_state > -1) + carpr.carpr_state = carpr_state; if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) err(1, "SIOCSVH"); - - return; } -void +static void +setcarp_passwd(const char *val, int d, int s, const struct afswtch *afp) +{ + + if (carpr_vhid == -1) + errx(1, "passwd requires vhid"); + + carpr_key = val; +} + +static void setcarp_advskew(const char *val, int d, int s, const struct afswtch *afp) { - int advskew; - struct carpreq carpr; - advskew = atoi(val); + if (carpr_vhid == -1) + errx(1, "advskew requires vhid"); - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; - - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); - - carpr.carpr_advskew = advskew; - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); - - return; + carpr_advskew = atoi(val); } -void +static void setcarp_advbase(const char *val, int d, int s, const struct afswtch *afp) { - int advbase; - struct carpreq carpr; - advbase = atoi(val); + if (carpr_vhid == -1) + errx(1, "advbase requires vhid"); - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; + carpr_advbase = atoi(val); +} - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); +static void +setcarp_state(const char *val, int d, int s, const struct afswtch *afp) +{ + int i; - carpr.carpr_advbase = advbase; + if (carpr_vhid == -1) + errx(1, "state requires vhid"); - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); + for (i = 0; i <= CARP_MAXSTATE; i++) + if (strcasecmp(carp_states[i], val) == 0) { + carpr_state = i; + return; + } - return; + errx(1, "unknown state"); } static struct cmd carp_cmds[] = { @@ -180,6 +207,7 @@ static struct cmd carp_cmds[] = { DEF_CMD_ARG("advskew", setcarp_advskew), DEF_CMD_ARG("pass", 
setcarp_passwd), DEF_CMD_ARG("vhid", setcarp_vhid), + DEF_CMD_ARG("state", setcarp_state), }; static struct afswtch af_carp = { .af_name = "af_carp", diff --git a/sbin/ifconfig/ifconfig.8 b/sbin/ifconfig/ifconfig.8 index c6b8ea3e4f74..afee726cc4b8 100644 --- a/sbin/ifconfig/ifconfig.8 +++ b/sbin/ifconfig/ifconfig.8 @@ -28,7 +28,7 @@ .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 .\" $FreeBSD$ .\" -.Dd November 12, 2011 +.Dd December 16, 2011 .Dt IFCONFIG 8 .Os .Sh NAME @@ -2445,16 +2445,36 @@ The argument is useless and hence deprecated. .El .Pp -The following parameters are specific to +The following parameters are used to configure .Xr carp 4 -interfaces: +protocol on an interface: .Bl -tag -width indent +.It Cm vhid Ar n +Set the virtual host ID. +This is a required setting to initiate +.Xr carp 4 . +If the virtual host ID doesn't exist yet, it is created and attached to the +interface, otherwise configuration of an existing vhid is adjusted. +If the +.Cm vhid +keyword is supplied along with an +.Dq inet6 +or +.Dq inet +address, then this address is configured to be run under control of the +specified vhid. +Whenever a last address that refers to a particular vhid is removed from an +interface, the vhid is automatically removed from interface and destroyed. +Any other configuration parameters for the +.Xr carp 4 +protocol should be supplied along with the +.Cm vhid +keyword. +Acceptable values for vhid are 1 to 255. .It Cm advbase Ar seconds Specifies the base of the advertisement interval in seconds. The acceptable values are 1 to 255. The default value is 1. -.\" The default value is -.\" .Dv CARP_DFLTINTV . .It Cm advskew Ar interval Specifies the skew to add to the base advertisement interval to make one host advertise slower than another host. @@ -2464,10 +2484,8 @@ The default value is 0. .It Cm pass Ar phrase Set the authentication key to .Ar phrase . -.It Cm vhid Ar n -Set the virtual host ID. -This is a required setting. -Acceptable values are 1 to 255. 
+.It Cm state Ar MASTER|BACKUP +Forcibly change state of a given vhid. .El .Pp The @@ -2530,8 +2548,9 @@ The .Fl k flag causes keying information for the interface, if available, to be printed. -For example, the values of 802.11 WEP keys will be printed, if accessible to -the current user. +For example, the values of 802.11 WEP keys and +.Xr carp 4 +passphrases will be printed, if accessible to the current user. This information is not printed by default, as it may be considered sensitive. .Pp @@ -2593,6 +2612,11 @@ as a synonym for the canonical form of the option .Fl alias : .Dl # ifconfig em0 inet6 2001:db8:bdbd::123/48 delete .Pp +Configure a single CARP redundant address on igb0, and then switch it +to be master: +.Dl # ifconfig igb0 vhid 1 10.0.0.1/24 pass foobar +.Dl # ifconfig igb0 vhid 1 state master +.Pp Configure the interface .Li xl0 , to use 100baseTX, full duplex Ethernet media options: diff --git a/sbin/ifconfig/ifconfig.c b/sbin/ifconfig/ifconfig.c index af280ce24020..0e3c3a30bae9 100644 --- a/sbin/ifconfig/ifconfig.c +++ b/sbin/ifconfig/ifconfig.c @@ -1077,6 +1077,21 @@ printb(const char *s, unsigned v, const char *bits) } } +void +print_vhid(const struct ifaddrs *ifa, const char *s) +{ + struct if_data *ifd; + + if (ifa->ifa_data == NULL) + return; + + ifd = ifa->ifa_data; + if (ifd->ifi_vhid == 0) + return; + + printf("vhid %d ", ifd->ifi_vhid); +} + void ifmaybeload(const char *name) { diff --git a/sbin/ifconfig/ifconfig.h b/sbin/ifconfig/ifconfig.h index d6f534993e77..ea21db5f52b9 100644 --- a/sbin/ifconfig/ifconfig.h +++ b/sbin/ifconfig/ifconfig.h @@ -148,3 +148,6 @@ void clone_setdefcallback(const char *, clone_callback_func *); * operations on ifmedia can avoid cmd line ordering confusion. 
*/ struct ifmediareq *ifmedia_getstate(int s); + +void print_vhid(const struct ifaddrs *, const char *); + diff --git a/share/man/man4/carp.4 b/share/man/man4/carp.4 index 4064c6c684e5..7c214ff9b7e4 100644 --- a/share/man/man4/carp.4 +++ b/share/man/man4/carp.4 @@ -1,6 +1,7 @@ .\" $OpenBSD: carp.4,v 1.16 2004/12/07 23:41:35 jmc Exp $ .\" .\" Copyright (c) 2003, Ryan McBride. All rights reserved. +.\" Copyright (c) 2011, Gleb Smirnoff .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -25,7 +26,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 15, 2011 +.Dd December 16, 2011 .Dt CARP 4 .Os .Sh NAME @@ -34,33 +35,17 @@ .Sh SYNOPSIS .Cd "device carp" .Sh DESCRIPTION -The -.Nm -interface is a pseudo-device that implements and controls the -CARP protocol. -CARP allows multiple hosts on the same local network to share a set of IP addresses. +The CARP allows multiple hosts on the same local network to share a set of +IPv4 and/or IPv6 addresses. Its primary purpose is to ensure that these -addresses are always available, but in some configurations -.Nm -can also provide load balancing functionality. -.Pp -A -.Nm -interface can be created at runtime using the -.Nm ifconfig Li carp Ns Ar N Cm create -command or by configuring -it via -.Va cloned_interfaces -in the -.Pa /etc/rc.conf -file. +addresses are always available. .Pp To use .Nm , -the administrator needs to configure at minimum a common virtual host ID (VHID) -and virtual host IP address on each machine which is to take part in the virtual -group. -Additional parameters can also be set on a per-interface basis: +the administrator needs to configure at minimum a common virtual host ID +(vhid) and attach at least one IP address to this vhid on each machine which +is to take part in the virtual group. 
+Additional parameters can also be set on a per-vhid basis: .Cm advbase and .Cm advskew , @@ -93,9 +78,20 @@ or through the .Dv SIOCSVH .Xr ioctl 2 . .Pp +CARP virtual hosts can be configured on multicast capable interfaces: Ethernet, +layer 2 VLAN, FDDI and Token Ring. +An arbitrary number of virtual host IDs can be configured on an interface. +An arbitrary number of IPv4 or IPv6 addresses can be attached to a particular +vhid. +It is important that all hosts participating in a vhid have the same list +of prefixes configured on the vhid, since all prefixes are included in the +cryptographic checksum supplied in each advertisement. +Multiple vhids running on one interface participate in master/backup +elections independently. +.Pp Additionally, there are a number of global parameters which can be set using .Xr sysctl 8 : -.Bl -tag -width ".Va net.inet.carp.arpbalance" +.Bl -tag -width ".Va net.inet.carp.preempt" .It Va net.inet.carp.allow Accept incoming .Nm @@ -125,9 +121,6 @@ Values above 1 enable logging of bad .Nm packets. Default value is 1. -.It Va net.inet.carp.arpbalance -Balance local traffic using ARP (see below). -Disabled by default. .It Va net.inet.carp.suppress_preempt A read only value showing the status of preemption suppression. Preemption can be suppressed if link on an interface is down @@ -138,36 +131,36 @@ Value of 0 means that preemption is not suppressed, since no problems are detected. Every problem increments suppression counter. .El -.Sh ARP level load balancing -The -.Nm -has limited abilities for load balancing the incoming connections -between hosts in Ethernet network. -For load balancing operation, one needs several CARP interfaces that -are configured to the same IP address, but to a different VHIDs. -Once an ARP request is received, the CARP protocol will use a hashing -function against the source IP address in the ARP request to determine -which VHID should this request belong to. 
-If the corresponding CARP interface is in master state, the ARP request -will be replied, otherwise it will be ignored. -See the -.Sx EXAMPLES -section for a practical example of load balancing. -.Pp -The ARP load balancing has some limitations. -First, ARP balancing only works on the local network segment. -It cannot balance traffic that crosses a router, because the -router itself will always be balanced to the same virtual host. -Second, ARP load balancing can lead to asymmetric routing -of incoming and outgoing traffic, and thus combining it with -.Xr pfsync 4 -is dangerous, because this creates a race condition between -balanced routers and a host they are serving. -Imagine an incoming packet creating state on the first router, being -forwarded to its destination, and destination replying faster -than the state information is packed and synced with the second router. -If the reply would be load balanced to second router, it will be -dropped due to no state. +.\".Sh ARP level load balancing +.\"The +.\".Nm +.\"has limited abilities for load balancing the incoming connections +.\"between hosts in Ethernet network. +.\"For load balancing operation, one needs several CARP interfaces that +.\"are configured to the same IP address, but to a different vhids. +.\"Once an ARP request is received, the CARP protocol will use a hashing +.\"function against the source IP address in the ARP request to determine +.\"which vhid should this request belong to. +.\"If the corresponding CARP interface is in master state, the ARP request +.\"will be replied, otherwise it will be ignored. +.\"See the +.\".Sx EXAMPLES +.\"section for a practical example of load balancing. +.\".Pp +.\"The ARP load balancing has some limitations. +.\"First, ARP balancing only works on the local network segment. +.\"It cannot balance traffic that crosses a router, because the +.\"router itself will always be balanced to the same virtual host. 
+.\"Second, ARP load balancing can lead to asymmetric routing +.\"of incoming and outgoing traffic, and thus combining it with +.\".Xr pfsync 4 +.\"is dangerous, because this creates a race condition between +.\"balanced routers and a host they are serving. +.\"Imagine an incoming packet creating state on the first router, being +.\"forwarded to its destination, and destination replying faster +.\"than the state information is packed and synced with the second router. +.\"If the reply would be load balanced to second router, it will be +.\"dropped due to no state. .Sh STATE CHANGE NOTIFICATIONS Sometimes it is useful to get notified about .Nm @@ -175,13 +168,10 @@ status change events. This can be accomplished by using .Xr devd 8 hooks. -Master/slave events are signalled as -.Nm -interface -.Dv LINK_UP -or -.Dv LINK_DOWN -event. +Master/slave events are signalled under system +.Dv CARP . +Subsystem specifies vhid and name of interface, where event occurred. +Type of the message displays new state of vhid. Please see .Xr devd.conf 5 and @@ -197,23 +187,19 @@ Enable it on both host A and B: .Pp .Dl sysctl net.inet.carp.preempt=1 .Pp -Assume that host A is the preferred master and 192.168.1.x/24 is -configured on one physical interface and 192.168.2.y/24 on another. +Assume that host A is the preferred master and we are running the +192.168.1.0/24 prefix on em0 and 192.168.2.0/24 on em1. 
This is the setup for host A: .Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.1/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.2.1/24 +ifconfig em0 vhid 1 pass mekmitasdigoat 192.168.1.1/24 +ifconfig em1 vhid 2 pass mekmitasdigoat 192.168.2.1/24 .Ed .Pp The setup for host B is identical, but it has a higher .Cm advskew : .Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.1/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.2.1/24 +ifconfig em0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.1/24 +ifconfig em1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.2.1/24 .Ed .Pp Because of the preempt option, when one of the physical interfaces of @@ -224,67 +210,60 @@ is adjusted to 240 on all its interfaces. This will cause host B to preempt on both interfaces instead of just the failed one. -.Pp -In order to set up an ARP balanced virtual host, it is necessary to configure -one virtual host for each physical host which would respond to ARP requests -and thus handle the traffic. -In the following example, two virtual hosts are configured on two hosts to -provide balancing and failover for the IP address 192.168.1.10. -.Pp -First the -.Nm -interfaces on host A are configured. -The -.Cm advskew -of 100 on the second virtual host means that its advertisements will be sent -out slightly less frequently. -.Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.10/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.1.10/24 -.Ed -.Pp -The configuration for host B is identical, except the -.Cm advskew -is on virtual host 1 rather than virtual host 2. 
-.Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.10/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.1.10/24 -.Ed -.Pp -Finally, the ARP balancing feature must be enabled on both hosts: -.Pp -.Dl sysctl net.inet.carp.arpbalance=1 -.Pp -When the hosts receive an ARP request for 192.168.1.10, the source IP address -of the request is used to compute which virtual host should answer the request. -The host which is master of the selected virtual host will reply to the -request, the other(s) will ignore it. -.Pp -This way, locally connected systems will receive different ARP replies and -subsequent IP traffic will be balanced among the hosts. -If one of the hosts fails, the other will take over the virtual MAC address, -and begin answering ARP requests on its behalf. +.\".Pp +.\"In order to set up an ARP balanced virtual host, it is necessary to configure +.\"one virtual host for each physical host which would respond to ARP requests +.\"and thus handle the traffic. +.\"In the following example, two virtual hosts are configured on two hosts to +.\"provide balancing and failover for the IP address 192.168.1.10. +.\".Pp +.\"First the +.\".Nm +.\"interfaces on host A are configured. +.\"The +.\".Cm advskew +.\"of 100 on the second virtual host means that its advertisements will be sent +.\"out slightly less frequently. +.\".Bd -literal -offset indent +.\"ifconfig carp0 create +.\"ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.10/24 +.\"ifconfig carp1 create +.\"ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.1.10/24 +.\".Ed +.\".Pp +.\"The configuration for host B is identical, except the +.\".Cm advskew +.\"is on virtual host 1 rather than virtual host 2. 
+.\".Bd -literal -offset indent +.\"ifconfig carp0 create +.\"ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.10/24 +.\"ifconfig carp1 create +.\"ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.1.10/24 +.\".Ed +.\".Pp +.\"Finally, the ARP balancing feature must be enabled on both hosts: +.\".Pp +.\".Dl sysctl net.inet.carp.arpbalance=1 +.\".Pp +.\"When the hosts receive an ARP request for 192.168.1.10, the source IP address +.\"of the request is used to compute which virtual host should answer the request. +.\"The host which is master of the selected virtual host will reply to the +.\"request, the other(s) will ignore it. +.\".Pp +.\"This way, locally connected systems will receive different ARP replies and +.\"subsequent IP traffic will be balanced among the hosts. +.\"If one of the hosts fails, the other will take over the virtual MAC address, +.\"and begin answering ARP requests on its behalf. .Pp Processing of .Nm -status change events can be set up by using the following devd.conf rules: +status change events can be set up by using the following devd.conf rule: .Bd -literal -offset indent notify 0 { - match "system" "IFNET"; - match "type" "LINK_UP"; - match "subsystem" "carp*"; - action "/root/carpcontrol.sh $type $subsystem"; -}; - -notify 0 { - match "system" "IFNET"; - match "type" "LINK_DOWN"; - match "subsystem" "carp*"; - action "/root/carpcontrol.sh $type $subsystem"; + match "system" "CARP"; + match "subsystem" "[0-9]+@"; + match "type" "(MASTER|BACKUP)"; + action "/root/carpcontrol.sh $subsystem $type"; }; .Ed .Sh SEE ALSO @@ -303,3 +282,8 @@ The .Nm device was imported into .Fx 5.4 . +In +.Fx 10 +the +.Nm +was significantly rewritten, and is no longer a pseudo-interface. diff --git a/sys/net/if.c b/sys/net/if.c index 8c6c24cf9187..437734354af3 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -130,17 +130,19 @@ void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. 
*/ void (*carp_linkstate_p)(struct ifnet *ifp); #if defined(INET) || defined(INET6) -struct ifnet *(*carp_forus_p)(struct ifnet *ifp, u_char *dhost); +int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *sa, struct rtentry *rt); + struct sockaddr *sa); +int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +int (*carp_attach_p)(struct ifaddr *, int); +void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET -int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); -caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, +caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif @@ -2506,6 +2508,16 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); +#if defined(INET) || defined(INET6) + case SIOCSVH: + case SIOCGVH: + if (carp_ioctl_p == NULL) + error = EPROTONOSUPPORT; + else + error = (*carp_ioctl_p)(ifr, cmd, td); + CURVNET_RESTORE(); + return (error); +#endif } ifp = ifunit_ref(ifr->ifr_name); diff --git a/sys/net/if.h b/sys/net/if.h index 4f2dc6f62452..08420f4c3b1a 100644 --- a/sys/net/if.h +++ b/sys/net/if.h @@ -85,7 +85,7 @@ struct if_data { u_char ifi_addrlen; /* media address length */ u_char ifi_hdrlen; /* media header length */ u_char ifi_link_state; /* current link state */ - u_char ifi_spare_char1; /* spare byte */ + u_char ifi_vhid; /* carp vhid */ u_char ifi_spare_char2; /* spare byte */ u_char ifi_datalen; /* length of this data struct */ u_long ifi_mtu; /* maximum transmission unit */ @@ -267,6 +267,8 @@ struct ifa_msghdr { int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ int 
ifam_metric; /* value of ifa_metric */ + struct if_data ifam_data;/* statistics and other data about if or + * address */ }; /* @@ -357,6 +359,7 @@ struct ifaliasreq { struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; + int ifra_vhid; }; struct ifmediareq { diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 9eae68389156..253371a683d1 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -397,7 +397,7 @@ ether_output(struct ifnet *ifp, struct mbuf *m, #if defined(INET) || defined(INET6) if (ifp->if_carp && - (error = (*carp_output_p)(ifp, m, dst, NULL))) + (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif diff --git a/sys/net/if_types.h b/sys/net/if_types.h index c2effacd7d0c..fe6ab5ea16b3 100644 --- a/sys/net/if_types.h +++ b/sys/net/if_types.h @@ -250,6 +250,5 @@ #define IFT_ENC 0xf4 #define IFT_PFLOG 0xf6 #define IFT_PFSYNC 0xf7 -#define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ #define IFT_IPXIP 0xf9 /* IPX over IP tunneling; no longer used. 
*/ #endif /* !_NET_IF_TYPES_H_ */ diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 1cb84400172c..ea609448b18c 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -69,6 +69,7 @@ struct rt_addrinfo; struct socket; struct ether_header; struct carp_if; +struct carp_softc; struct ifvlantrunk; struct route; struct vnet; @@ -729,6 +730,7 @@ struct ifaddr { struct sockaddr *ifa_netmask; /* used to determine subnet */ struct if_data if_data; /* not all members are meaningful */ struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index b2313c0a90e1..352082403330 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -63,6 +63,7 @@ #include #include +#include #ifdef INET6 #include #endif @@ -83,7 +84,7 @@ struct if_data32 { uint8_t ifi_addrlen; uint8_t ifi_hdrlen; uint8_t ifi_link_state; - uint8_t ifi_spare_char1; + uint8_t ifi_vhid; uint8_t ifi_spare_char2; uint8_t ifi_datalen; uint32_t ifi_mtu; @@ -122,6 +123,9 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* These are external hooks for CARP. */ +int (*carp_get_vhid_p)(struct ifaddr *); + /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. 
@@ -1508,6 +1512,7 @@ copy_ifdata32(struct if_data *src, struct if_data32 *dst) CP(*src, *dst, ifi_addrlen); CP(*src, *dst, ifi_hdrlen); CP(*src, *dst, ifi_link_state); + CP(*src, *dst, ifi_vhid); dst->ifi_datalen = sizeof(struct if_data32); CP(*src, *dst, ifi_mtu); CP(*src, *dst, ifi_metric); @@ -1559,6 +1564,9 @@ sysctl_iflist(int af, struct walkarg *w) ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); + if (carp_get_vhid_p != NULL) + ifm32->ifm_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); ifm32->ifm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len); @@ -1569,6 +1577,9 @@ sysctl_iflist(int af, struct walkarg *w) ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_data = ifp->if_data; + if (carp_get_vhid_p != NULL) + ifm->ifm_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); ifm->ifm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len); #ifdef COMPAT_FREEBSD32 @@ -1595,6 +1606,9 @@ sysctl_iflist(int af, struct walkarg *w) ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_addrs = info.rti_addrs; + if (carp_get_vhid_p != NULL) + ifam->ifam_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) goto done; diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 60fa944a78b3..546714f68398 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -139,8 +139,6 @@ static const struct netisr_handler arp_nh = { }; #ifdef AF_INET -void arp_ifscrub(struct ifnet *ifp, uint32_t addr); - /* * called by in_ifscrub to remove entry from the table when * the interface goes away @@ -516,7 +514,7 @@ in_arpinput(struct mbuf *m) int op, flags; int req_len; int bridged = 0, is_bridge = 0; - int carp_match = 0; + int carped; struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; @@ -561,24 +559,14 @@ in_arpinput(struct mbuf *m) * 
For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). - * If the interface does not match, but the recieving interface - * is part of carp, we call carp_iamatch to see if this is a - * request for the virtual host ip. - * XXX: This is really ugly! */ IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - goto match; - } - if (ifp->if_carp != NULL && - (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { - carp_match = 1; + itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && + (ia->ia_ifa.ifa_carp == NULL || + (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); goto match; @@ -643,6 +631,7 @@ in_arpinput(struct mbuf *m) match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); + carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) @@ -659,9 +648,9 @@ match: * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. 
*/ - if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { - log(LOG_ERR, - "arp: %*D is using my IP address %s on %s!\n", + if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && + myaddr.s_addr != 0) { + log(LOG_ERR, "arp: %*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; @@ -682,7 +671,7 @@ match: IF_AFDATA_UNLOCK(ifp); if (la != NULL) { /* the following is not an error when doing bridging */ - if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) { + if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) log(LOG_WARNING, "arp: %s is on %s " "but got reply from %*D on %s\n", @@ -879,6 +868,9 @@ arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { struct llentry *lle; + if (ifa->ifa_carp != NULL) + return; + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h index 1b69436df3ad..4b3ebd7cd2b1 100644 --- a/sys/netinet/if_ether.h +++ b/sys/netinet/if_ether.h @@ -117,6 +117,7 @@ int arpresolve(struct ifnet *ifp, struct rtentry *rt, struct llentry **lle); void arp_ifinit(struct ifnet *, struct ifaddr *); void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); +void arp_ifscrub(struct ifnet *, uint32_t); #include typedef void (*llevent_arp_update_fn)(void *, struct llentry *); diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 51bb470248cf..7e9b1ed91a59 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -56,10 +56,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include +#include #include #include #include @@ -69,17 +71,15 @@ static void in_len2mask(struct in_addr *, int); static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); -static int in_addprefix(struct in_ifaddr *, int); -static int 
in_scrubprefix(struct in_ifaddr *, u_int); static void in_socktrim(struct sockaddr_in *); -static int in_ifinit(struct ifnet *, - struct in_ifaddr *, struct sockaddr_in *, int, int); +static int in_ifinit(struct ifnet *, struct in_ifaddr *, + struct sockaddr_in *, int, int, int); static void in_purgemaddrs(struct ifnet *); -static VNET_DEFINE(int, sameprefixcarponly); -#define V_sameprefixcarponly VNET(sameprefixcarponly) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, - &VNET_NAME(sameprefixcarponly), 0, +static VNET_DEFINE(int, nosameprefix); +#define V_nosameprefix VNET(nosameprefix) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_RW, + &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); @@ -517,7 +517,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCSIFADDR: error = in_ifinit(ifp, ia, - (struct sockaddr_in *) &ifr->ifr_addr, 1, 0); + (struct sockaddr_in *) &ifr->ifr_addr, 1, 0, 0); if (error != 0 && iaIsNew) break; if (error == 0) { @@ -570,7 +570,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } if (hostIsNew || maskIsNew) error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0, - maskIsNew); + maskIsNew, ifra->ifra_vhid); if (error != 0 && iaIsNew) break; @@ -609,6 +609,9 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, panic("in_control: unsupported ioctl"); } + if (ia->ia_ifa.ifa_carp) + (*carp_detach_p)(&ia->ia_ifa); + IF_ADDR_LOCK(ifp); /* Re-check that ia is still part of the list. 
*/ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { @@ -843,7 +846,7 @@ in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia, u_int flags) */ static int in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, - int scrub, int masksupplied) + int scrub, int masksupplied, int vhid) { register u_long i = ntohl(sin->sin_addr.s_addr); int flags = RTF_UP, error = 0; @@ -859,6 +862,15 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, ia, ia_hash); IN_IFADDR_WUNLOCK(); + if (vhid > 0) { + if (carp_attach_p != NULL) + error = (*carp_attach_p)(&ia->ia_ifa, vhid); + else + error = EPROTONOSUPPORT; + } + if (error) + return (error); + /* * Give the interface a chance to initialize * if this is its first address, @@ -884,11 +896,6 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, } ia->ia_subnet = i & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); - /* - * XXX: carp(4) does not have interface route - */ - if (ifp->if_type == IFT_CARP) - return (0); /* * Add route for the network. */ @@ -907,7 +914,7 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, return (0); flags |= RTF_HOST; } - if ((error = in_addprefix(ia, flags)) != 0) + if (!vhid && (error = in_addprefix(ia, flags)) != 0) return (error); if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY) @@ -920,7 +927,7 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, /* * add a loopback route to self */ - if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) { + if (V_useloopback && !vhid && !(ifp->if_flags & IFF_LOOPBACK)) { struct route ia_ro; bzero(&ia_ro, sizeof(ia_ro)); @@ -992,7 +999,7 @@ static void in_addralias_rtmsg(int cmd, struct in_addr *prefix, /* * Check if we have a route for the given prefix already or add one accordingly. 
*/ -static int +int in_addprefix(struct in_ifaddr *target, int flags) { struct in_ifaddr *ia; @@ -1038,9 +1045,7 @@ in_addprefix(struct in_ifaddr *target, int flags) } else break; #endif - if (V_sameprefixcarponly && - target->ia_ifp->if_type != IFT_CARP && - ia->ia_ifp->if_type != IFT_CARP) { + if (V_nosameprefix) { IN_IFADDR_RUNLOCK(); return (EEXIST); } else { @@ -1061,14 +1066,12 @@ in_addprefix(struct in_ifaddr *target, int flags) return (error); } -extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr); - /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ -static int +int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct in_ifaddr *ia; @@ -1156,13 +1159,8 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. - * - * XXX: a special case for carp(4) interface - this should - * be more generally specified as an interface that - * doesn't support such action. 
*/ - if ((ia->ia_flags & IFA_ROUTE) == 0 - && (ia->ia_ifp->if_type != IFT_CARP)) { + if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, @@ -1310,9 +1308,6 @@ in_purgemaddrs(struct ifnet *ifp) IN_MULTI_UNLOCK(); } -#include -#include - struct in_llentry { struct llentry base; struct sockaddr_in l3_addr4; diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h index 3a5e32824a5a..dea839392820 100644 --- a/sys/netinet/in_var.h +++ b/sys/netinet/in_var.h @@ -77,6 +77,7 @@ struct in_aliasreq { struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; + int ifra_vhid; }; /* * Given a pointer to an in_ifaddr (ifaddr), @@ -442,6 +443,8 @@ int in_leavegroup_locked(struct in_multi *, int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); void in_rtqdrain(void); +int in_addprefix(struct in_ifaddr *, int); +int in_scrubprefix(struct in_ifaddr *, u_int); void ip_input(struct mbuf *); int in_ifadown(struct ifaddr *ifa, int); void in_ifscrub(struct ifnet *, struct in_ifaddr *, u_int); diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index 73024f78f555..2875537228d5 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -1,6 +1,8 @@ -/* - * Copyright (c) 2002 Michael Shalayeff. All rights reserved. - * Copyright (c) 2003 Ryan McBride. All rights reserved. +/*- + * Copyright (c) 2002 Michael Shalayeff. + * Copyright (c) 2003 Ryan McBride. + * Copyright (c) 2011 Gleb Smirnoff + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,38 +33,30 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" -#include #include #include -#include +#include +#include #include #include #include #include #include -#include #include #include #include +#include +#include #include #include -#include -#include -#include -#include -#include - -#include - -#include #include #include -#include #include -#include #include +#include #include +#include #include #include @@ -71,12 +65,9 @@ __FBSDID("$FreeBSD$"); #include #include #include - #include #endif - #ifdef INET -#include #include #include #endif @@ -85,65 +76,117 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include -#include #include #endif #include -#define CARP_IFNAME "carp" -static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); -SYSCTL_DECL(_net_inet_carp); +static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { - struct ifnet *sc_ifp; /* Interface clue */ - struct ifnet *sc_carpdev; /* Pointer to parent interface */ - struct in_ifaddr *sc_ia; /* primary iface address */ + struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ + struct ifaddr **sc_ifas; /* Our ifaddrs. */ + struct sockaddr_dl sc_addr; /* Our link level address. */ + struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET - struct ip_moptions sc_imo; + struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 - struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ - struct ip6_moptions sc_im6o; -#endif /* INET6 */ - TAILQ_ENTRY(carp_softc) sc_list; + struct callout sc_md6_tmo; /* XXX: Master down timeout. 
*/ +#endif + struct mtx sc_mtx; + int sc_vhid; + int sc_advskew; + int sc_advbase; + + int sc_naddrs; + int sc_naddrs6; + int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; - - int sc_flags_backup; - int sc_suppress; - - int sc_sendad_errors; + int sc_suppress; + int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 - int sc_sendad_success; + int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 - int sc_vhid; - int sc_advskew; - int sc_naddrs; - int sc_naddrs6; - int sc_advbase; /* seconds */ - int sc_init_counter; - u_int64_t sc_counter; + int sc_init_counter; + uint64_t sc_counter; /* authentication */ -#define CARP_HMAC_PAD 64 +#define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; - struct callout sc_ad_tmo; /* advertisement timeout */ - struct callout sc_md_tmo; /* master down timeout */ - struct callout sc_md6_tmo; /* master down timeout */ - - LIST_ENTRY(carp_softc) sc_next; /* Interface clue */ + TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ + LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; -#define SC2IFP(sc) ((sc)->sc_ifp) + +struct carp_if { +#ifdef INET + int cif_naddrs; +#endif +#ifdef INET6 + int cif_naddrs6; +#endif + TAILQ_HEAD(, carp_softc) cif_vrs; +#ifdef INET + struct ip_moptions cif_imo; +#endif +#ifdef INET6 + struct ip6_moptions cif_im6o; +#endif + struct ifnet *cif_ifp; + struct mtx cif_mtx; +}; + +#define CARP_INET 0 +#define CARP_INET6 1 +static int proto_reg[] = {-1, -1}; + +/* + * Brief design of carp(4). + * + * Any carp-capable ifnet may have a list of carp softcs hanging off + * its ifp->if_carp pointer. Each softc represents one unique virtual + * host id, or vhid. The softc has a back pointer to the ifnet. All + * softcs are joined in a global list, which has quite limited use. + * + * Any interface address that takes part in CARP negotiation has a + * pointer to the softc of its vhid, ifa->ifa_carp. 
That could be either + * an AF_INET or an AF_INET6 address. + * + * Although one can get the softc's back pointer to the ifnet and traverse + * through its ifp->if_addrhead queue to find all interface addresses + * involved in CARP, we keep a growable array of ifaddr pointers. This + * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that + * call into the network stack, thus avoiding lock order reversals (LORs). + * + * Locking: + * + * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), + * callout-driven events and ioctl()s. + * + * To traverse the list of softcs on an ifnet we use CIF_LOCK(); to + * traverse the global list we use the mutex carp_mtx. + * + * Known issues with locking: + * + * - There is no protection against races between two ioctl() requests, + * either SIOCSVH or SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all + * interface ioctl()s should be serialized right in net/if.c. + * - When sending an ad, we put the pointer to the softc in an mtag, and no + * reference counting is done on the softc. + * - On module unload we may race (?) with a packet processing thread + * dereferencing our function pointers. 
+ */ int carp_suppress_preempt = 0; -int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */ +int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, }; SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW, &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets"); @@ -151,8 +194,6 @@ SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW, &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW, &carp_opts[CARPCTL_LOG], 0, "log bad carp packets"); -SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW, - &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses"); SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD, &carp_suppress_preempt, 0, "Preemption is suppressed"); @@ -161,36 +202,22 @@ SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW, &carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); -struct carp_if { - TAILQ_HEAD(, carp_softc) vhif_vrs; - int vhif_nvrs; - - struct ifnet *vhif_ifp; - struct mtx vhif_mtx; -}; - -#define CARP_INET 0 -#define CARP_INET6 1 -static int proto_reg[] = {-1, -1}; - -/* Get carp_if from softc. Valid after carp_set_addr{,6}. 
*/ -#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp) - -/* lock per carp_if queue */ -#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \ +#define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) -#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx) -#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED) -#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx) -#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx) - -#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED) +#define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) +#define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) +#define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) +#define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) +#define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ + NULL, MTX_DEF) +#define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) +#define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) +#define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) +#define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CARP_LOG(...) do { \ if (carp_opts[CARPCTL_LOG] > 0) \ - log(LOG_INFO, __VA_ARGS__); \ + log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) 
do { \ @@ -198,58 +225,43 @@ static int proto_reg[] = {-1, -1}; log(LOG_DEBUG, __VA_ARGS__); \ } while (0) -static void carp_hmac_prepare(struct carp_softc *); -static void carp_hmac_generate(struct carp_softc *, u_int32_t *, - unsigned char *); -static int carp_hmac_verify(struct carp_softc *, u_int32_t *, - unsigned char *); -static void carp_setroute(struct carp_softc *, int); +#define IFNET_FOREACH_IFA(ifp, ifa) \ + IF_ADDR_LOCK_ASSERT(ifp); \ + TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ + if ((ifa)->ifa_carp != NULL) + +#define CARP_FOREACH_IFA(sc, ifa) \ + CARP_LOCK_ASSERT(sc); \ + for (int _i = 0; \ + _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ + ((ifa) = sc->sc_ifas[_i]) != NULL; \ + ++_i) + +#define IFNET_FOREACH_CARP(ifp, sc) \ + CIF_LOCK_ASSERT(ifp->if_carp); \ + TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) + static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); -static int carp_clone_create(struct if_clone *, int, caddr_t); -static void carp_clone_destroy(struct ifnet *); -static void carpdetach(struct carp_softc *, int); -static int carp_prepare_ad(struct mbuf *, struct carp_softc *, - struct carp_header *); -static void carp_send_ad_all(void); -static void carp_send_ad(void *); -static void carp_send_ad_locked(struct carp_softc *); -#ifdef INET -static void carp_send_arp(struct carp_softc *); -#endif +static struct carp_softc + *carp_alloc(struct ifnet *); +static void carp_destroy(struct carp_softc *); +static struct carp_if + *carp_alloc_if(struct ifnet *); +static void carp_free_if(struct carp_if *); +static void carp_set_state(struct carp_softc *, int); +static void carp_sc_state(struct carp_softc *); +static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *); -static int carp_ioctl(struct ifnet *, u_long, caddr_t); -static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *); 
-static void carp_start(struct ifnet *); -static void carp_setrun(struct carp_softc *, sa_family_t); -static void carp_set_state(struct carp_softc *, int); -#ifdef INET -static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); -#endif -enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; +static void carp_send_ad(void *); +static void carp_send_ad_locked(struct carp_softc *); +static void carp_addroute(struct carp_softc *); +static void carp_delroute(struct carp_softc *); -#ifdef INET -static void carp_multicast_cleanup(struct carp_softc *, int dofree); -static int carp_set_addr(struct carp_softc *, struct sockaddr_in *); -static int carp_del_addr(struct carp_softc *, struct sockaddr_in *); -#endif -static void carp_carpdev_state_locked(struct carp_if *); -static void carp_sc_state_locked(struct carp_softc *); -#ifdef INET6 -static void carp_send_na(struct carp_softc *); -static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *); -static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *); -static void carp_multicast6_cleanup(struct carp_softc *, int dofree); -#endif - -static LIST_HEAD(, carp_softc) carpif_list; +static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; -IFC_SIMPLE_DECLARE(carp, 0); -static eventhandler_tag if_detach_event_tag; - -static __inline u_int16_t +static __inline uint16_t carp_cksum(struct mbuf *m, int len) { return (in_cksum(m, len)); @@ -258,8 +270,8 @@ carp_cksum(struct mbuf *m, int len) static void carp_hmac_prepare(struct carp_softc *sc) { - u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; - u_int8_t vhid = sc->sc_vhid & 0xff; + uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET @@ -269,18 +281,15 @@ carp_hmac_prepare(struct carp_softc *sc) struct in6_addr last6, cur6, in6; #endif - if (sc->sc_carpdev) - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); - /* XXX: possible race here */ - - /* compute 
ipad from key */ + /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; - /* precompute first part of inner hash */ + /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); @@ -292,8 +301,7 @@ carp_hmac_prepare(struct carp_softc *sc) found = 0; last = cur; cur.s_addr = 0xffffffff; - IF_ADDR_LOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && @@ -302,7 +310,6 @@ carp_hmac_prepare(struct carp_softc *sc) found++; } } - IF_ADDR_UNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); @@ -313,8 +320,7 @@ carp_hmac_prepare(struct carp_softc *sc) found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); - IF_ADDR_LOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; @@ -325,7 +331,6 @@ carp_hmac_prepare(struct carp_softc *sc) found++; } } - IF_ADDR_UNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); @@ -334,17 +339,16 @@ carp_hmac_prepare(struct carp_softc *sc) /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; - - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); } static void -carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; + CARP_LOCK_ASSERT(sc); + /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); 
@@ -359,203 +363,18 @@ carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], } static int -carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } -static void -carp_setroute(struct carp_softc *sc, int cmd) -{ - struct ifaddr *ifa; - int s; - - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); - - s = splnet(); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { -#ifdef INET - if (ifa->ifa_addr->sa_family == AF_INET && - sc->sc_carpdev != NULL) { - int count = carp_addrcount( - (struct carp_if *)sc->sc_carpdev->if_carp, - ifatoia(ifa), CARP_COUNT_MASTER); - - if ((cmd == RTM_ADD && count == 1) || - (cmd == RTM_DELETE && count == 0)) - rtinit(ifa, cmd, RTF_UP | RTF_HOST); - } -#endif - } - splx(s); -} - -static int -carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - - struct carp_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); - ifp = SC2IFP(sc) = if_alloc(IFT_ETHER); - if (ifp == NULL) { - free(sc, M_CARP); - return (ENOSPC); - } - - sc->sc_flags_backup = 0; - sc->sc_suppress = 0; - sc->sc_advbase = CARP_DFLTINTV; - sc->sc_vhid = -1; /* required setting */ - sc->sc_advskew = 0; - sc->sc_init_counter = 1; - sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? 
*/ -#ifdef INET - sc->sc_imo.imo_membership = (struct in_multi **)malloc( - (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_imo.imo_mfilters = NULL; - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; - sc->sc_imo.imo_multicast_vif = -1; -#endif -#ifdef INET6 - sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc( - (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_im6o.im6o_mfilters = NULL; - sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; - sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; -#endif - - callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE); - - ifp->if_softc = sc; - if_initname(ifp, CARP_IFNAME, unit); - ifp->if_mtu = ETHERMTU; - ifp->if_flags = IFF_LOOPBACK; - ifp->if_ioctl = carp_ioctl; - ifp->if_output = carp_looutput; - ifp->if_start = carp_start; - ifp->if_type = IFT_CARP; - ifp->if_snd.ifq_maxlen = ifqmaxlen; - ifp->if_hdrlen = 0; - if_attach(ifp); - bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t)); - mtx_lock(&carp_mtx); - LIST_INSERT_HEAD(&carpif_list, sc, sc_next); - mtx_unlock(&carp_mtx); - return (0); -} - -static void -carp_clone_destroy(struct ifnet *ifp) -{ - struct carp_softc *sc = ifp->if_softc; - - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carpdetach(sc, 1); /* Returns unlocked. */ - - mtx_lock(&carp_mtx); - LIST_REMOVE(sc, sc_next); - mtx_unlock(&carp_mtx); - bpfdetach(ifp); - if_detach(ifp); - if_free(ifp); -#ifdef INET - free(sc->sc_imo.imo_membership, M_CARP); -#endif -#ifdef INET6 - free(sc->sc_im6o.im6o_membership, M_CARP); -#endif - free(sc, M_CARP); -} - -/* - * This function can be called on CARP interface destroy path, - * and in case of the removal of the underlying interface as - * well. We differentiate these two cases: in case of destruction - * of the underlying interface, we do not cleanup our multicast - * memberships, since they are already freed. 
But we purge pointers - * to multicast structures, since they are no longer valid, to - * avoid panic in future calls to carpdetach(). Also, we do not - * release the lock on return, because the function will be - * called once more, for another CARP instance on the same - * interface. - */ -static void -carpdetach(struct carp_softc *sc, int unlock) -{ - struct carp_if *cif; - - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - - if (sc->sc_suppress) - carp_suppress_preempt--; - sc->sc_suppress = 0; - - if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) - carp_suppress_preempt--; - sc->sc_sendad_errors = 0; - - carp_set_state(sc, INIT); - SC2IFP(sc)->if_flags &= ~IFF_UP; - carp_setrun(sc, 0); -#ifdef INET - carp_multicast_cleanup(sc, unlock); -#endif -#ifdef INET6 - carp_multicast6_cleanup(sc, unlock); -#endif - - if (sc->sc_carpdev != NULL) { - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - CARP_LOCK_ASSERT(cif); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - ifpromisc(sc->sc_carpdev, 0); - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else if (unlock) - CARP_UNLOCK(cif); - sc->sc_carpdev = NULL; - } -} - -/* Detach an interface from the carp. */ -static void -carp_ifdetach(void *arg __unused, struct ifnet *ifp) -{ - struct carp_if *cif = (struct carp_if *)ifp->if_carp; - struct carp_softc *sc, *nextsc; - - if (cif == NULL) - return; - - /* - * XXX: At the end of for() cycle the lock will be destroyed. - */ - CARP_LOCK(cif); - for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) { - nextsc = TAILQ_NEXT(sc, sc_list); - carpdetach(sc, 0); - } -} - /* * process input packet. 
* we have rearranged checks order compared to the rfc, @@ -576,20 +395,10 @@ carp_input(struct mbuf *m, int hlen) return; } - /* check if received on a valid carp interface */ - if (m->m_pkthdr.rcvif->if_carp == NULL) { - CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp_input: packet received on non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); - m_freem(m); - return; - } - /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n", + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); @@ -600,9 +409,8 @@ carp_input(struct mbuf *m, int hlen) if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: received len %zd < " - "sizeof(struct carp_header) on %s\n", - m->m_len - sizeof(struct ip), + CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " + "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return; @@ -611,7 +419,7 @@ carp_input(struct mbuf *m, int hlen) if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); - CARP_DEBUG("carp_input: pullup failed\n"); + CARP_DEBUG("%s: pullup failed\n", __func__); return; } ip = mtod(m, struct ip *); @@ -625,7 +433,7 @@ carp_input(struct mbuf *m, int hlen) len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: packet too short %d on %s\n", + CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); @@ -643,7 +451,7 @@ carp_input(struct mbuf *m, int hlen) m->m_data += iplen; if (carp_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp_input: checksum failed on %s\n", + CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return; 
@@ -673,9 +481,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto) /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp6_input: packet received on non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: packet received on non-carp interface: %s\n", + __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -683,9 +490,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto) /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n", - ip6->ip6_hlim, - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, + ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -695,7 +501,7 @@ carp6_input(struct mbuf **mp, int *offp, int proto) IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp6_input: packet size %u too small\n", len); + CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } @@ -704,7 +510,7 @@ carp6_input(struct mbuf **mp, int *offp, int proto) m->m_data += *offp; if (carp_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp6_input: checksum failed, on %s\n", + CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); @@ -720,62 +526,46 @@ static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ifaddr *ifa; struct carp_softc *sc; - u_int64_t tmp_counter; + uint64_t tmp_counter; struct timeval sc_tv, ch_tv; /* verify that the VHID is valid on the receiving interface */ - CARP_LOCK(ifp->if_carp); - TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) - if (sc->sc_vhid == 
ch->carp_vhid) + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == af && + ifa->ifa_carp->sc_vhid == ch->carp_vhid) { + ifa_ref(ifa); break; + } + IF_ADDR_UNLOCK(ifp); - if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { + if (ifa == NULL) { CARPSTATS_INC(carps_badvhid); - CARP_UNLOCK(ifp->if_carp); m_freem(m); return; } - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_ipackets++; - SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; - - if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { - uint32_t af1 = af; -#ifdef INET - struct ip *ip = mtod(m, struct ip *); - - /* BPF wants net byte order */ - if (af == AF_INET) { - ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); - ip->ip_off = htons(ip->ip_off); - } -#endif - bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); - } - /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s; invalid version %d\n", - SC2IFP(sc)->if_xname, + CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); + ifa_free(ifa); m_freem(m); return; } - /* verify the hash */ + sc = ifa->ifa_carp; + CARP_LOCK(sc); + ifa_free(ifa); + if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname); - m_freem(m); - return; + CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, + sc->sc_vhid, ifp->if_xname); + goto out; } tmp_counter = ntohl(ch->carp_counter[0]); @@ -806,12 +596,13 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); - CARP_LOG("%s: MASTER -> BACKUP " - "(more frequent advertisement received)\n", - SC2IFP(sc)->if_xname); + CARP_LOG("VHID %u@%s: MASTER -> BACKUP " 
+ "(more frequent advertisement received)\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); + carp_delroute(sc); } break; case BACKUP: @@ -821,9 +612,10 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) */ if (carp_opts[CARPCTL_PREEMPT] && timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " + CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(preempting a slower master)\n", - SC2IFP(sc)->if_xname); + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } @@ -835,9 +627,10 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " + CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(master timed out)\n", - SC2IFP(sc)->if_xname); + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } @@ -850,17 +643,15 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) break; } - CARP_UNLOCK(ifp->if_carp); - +out: + CARP_UNLOCK(sc); m_freem(m); - return; } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; - struct ifnet *ifp = SC2IFP(sc); if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ @@ -876,35 +667,38 @@ carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ - mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { + if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), + M_NOWAIT)) == NULL) { m_freem(m); - SC2IFP(sc)->if_oerrors++; + CARPSTATS_INC(carps_onomem); return (ENOMEM); } - bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); + bcopy(&sc, (caddr_t)(mtag + 1), sizeof(struct carp_softc *)); m_tag_prepend(m, mtag); return (0); } 
static void -carp_send_ad_all(void) +carp_send_ad_all(struct carp_softc *badsc) { struct carp_softc *sc; + /* + * Avoid LOR and recursive call to carp_send_ad_locked(). + */ + CARP_UNLOCK(badsc); + mtx_lock(&carp_mtx); - LIST_FOREACH(sc, &carpif_list, sc_next) { - if (sc->sc_carpdev == NULL) - continue; - CARP_SCLOCK(sc); - if ((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) && - sc->sc_state == MASTER) + LIST_FOREACH(sc, &carp_list, sc_next) + if (sc != badsc && sc->sc_state == MASTER) { + CARP_LOCK(sc); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); - } + CARP_UNLOCK(sc); + } mtx_unlock(&carp_mtx); + + CARP_LOCK(badsc); } static void @@ -912,9 +706,9 @@ carp_send_ad(void *v) { struct carp_softc *sc = v; - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); + CARP_UNLOCK(sc); } static void @@ -922,48 +716,42 @@ carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; + struct sockaddr sa; + struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; - int len, advbase, advskew; + int len, advskew; - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); - /* bow out if we've lost our UPness or RUNNINGuiness */ - if (!((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { - advbase = 255; - advskew = 255; - } else { - advbase = sc->sc_advbase; - if (!carp_suppress_preempt || sc->sc_advskew > 240) - advskew = sc->sc_advskew; - else - advskew = 240; - tv.tv_sec = advbase; - tv.tv_usec = advskew * 1000000 / 256; - } + if (!carp_suppress_preempt || sc->sc_advskew > 240) + advskew = sc->sc_advskew; + else + advskew = 240; + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; - ch.carp_advbase = advbase; + ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero 
*/ ch.carp_cksum = 0; + /* XXXGL: OpenBSD picks first ifaddr with needed family. */ + #ifdef INET - if (sc->sc_ia) { + if (sc->sc_naddrs) { struct ip *ip; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_NOWAIT, MT_HEADER); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); /* XXX maybe less ? */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); return; } len = sizeof(*ip) + sizeof(ch); @@ -982,7 +770,16 @@ carp_send_ad_locked(struct carp_softc *sc) ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; - ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; + + bzero(&sa, sizeof(sa)); + sa.sa_family = AF_INET; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + ip->ip_src.s_addr = + ifatoia(ifa)->ia_addr.sin_addr.s_addr; + ifa_free(ifa); + } else + ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); @@ -994,22 +791,16 @@ carp_send_ad_locked(struct carp_softc *sc) ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets); - if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { - SC2IFP(sc)->if_oerrors++; + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, + &sc->sc_carpdev->if_carp->cif_imo, NULL)) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_sendad_success = 0; } else { @@ -1025,17 +816,15 @@ carp_send_ad_locked(struct carp_softc *sc) } #endif /* INET */ #ifdef INET6 - if (sc->sc_ia6) { + if (sc->sc_naddrs6) { 
struct ip6_hdr *ip6; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_NOWAIT, MT_HEADER); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); /* XXX maybe less ? */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); return; } len = sizeof(*ip6) + sizeof(ch); @@ -1049,14 +838,23 @@ carp_send_ad_locked(struct carp_softc *sc) ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; - bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, - sizeof(struct in6_addr)); - /* set the multicast destination */ + bzero(&sa, sizeof(sa)); + /* set the source address */ + sa.sa_family = AF_INET6; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + bcopy(IFA_IN6(ifa), &ip6->ip6_src, + sizeof(struct in6_addr)); + ifa_free(ifa); + } else + /* This should never happen with IPv6. */ + bzero(&ip6->ip6_src, sizeof(struct in6_addr)); + + /* Set the multicast destination. 
*/ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { - SC2IFP(sc)->if_oerrors++; m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); return; @@ -1071,22 +869,16 @@ carp_send_ad_locked(struct carp_softc *sc) ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets6); - if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { - SC2IFP(sc)->if_oerrors++; + if (ip6_output(m, NULL, NULL, 0, + &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_sendad_success = 0; } else { @@ -1102,10 +894,55 @@ carp_send_ad_locked(struct carp_softc *sc) } #endif /* INET6 */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); +} +static void +carp_addroute(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + CARP_FOREACH_IFA(sc, ifa) + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + in_addprefix(ifatoia(ifa), RTF_UP); + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia(ifa)->ia_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + in6_ifaddloop(ifa); + break; +#endif + } +} + +static void +carp_delroute(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + CARP_FOREACH_IFA(sc, ifa) + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifa_del_loopback_route(ifa, + (struct sockaddr 
*)&ifatoia(ifa)->ia_addr); + in_scrubprefix(ifatoia(ifa), LLE_STATIC); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_del_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + in6_ifremloop(ifa); + break; +#endif + } } #ifdef INET @@ -1119,16 +956,22 @@ carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) + if (ifa->ifa_addr->sa_family == AF_INET) + arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr)); +} - if (ifa->ifa_addr->sa_family != AF_INET) - continue; +int +carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) +{ + struct carp_softc *sc = ifa->ifa_carp; -/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */ - arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp)); - - DELAY(1000); /* XXX */ + if (sc->sc_state == MASTER) { + *enaddr = LLADDR(&sc->sc_addr); + return (1); } + + return (0); } #endif @@ -1136,226 +979,95 @@ carp_send_arp(struct carp_softc *sc) static void carp_send_na(struct carp_softc *sc) { + static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; - static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; - - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; - in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; + in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } -#endif /* INET6 */ -#ifdef INET -static int -carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) -{ - struct carp_softc *vh; - struct ifaddr *ifa; - int count = 0; - - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((type == CARP_COUNT_RUNNING && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) || - (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { - 
IF_ADDR_LOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) - count++; - } - IF_ADDR_UNLOCK(SC2IFP(vh)); - } - } - return (count); -} - -int -carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia, - struct in_addr *isaddr, u_int8_t **enaddr) -{ - struct carp_if *cif; - struct carp_softc *vh; - int index, count = 0; - struct ifaddr *ifa; - - cif = ifp->if_carp; - CARP_LOCK(cif); - - if (carp_opts[CARPCTL_ARPBALANCE]) { - /* - * XXX proof of concept implementation. - * We use the source ip to decide which virtual host should - * handle the request. If we're master of that virtual host, - * then we respond, otherwise, just drop the arp packet on - * the floor. - */ - count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); - if (count == 0) { - /* should never reach this */ - CARP_UNLOCK(cif); - return (0); - } - - /* this should be a hash, like pf_hash() */ - index = ntohl(isaddr->s_addr) % count; - count = 0; - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) { - IF_ADDR_LOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == - AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) { - if (count == index) { - if (vh->sc_state == - MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - IF_ADDR_UNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (1); - } else { - IF_ADDR_UNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (0); - } - } - count++; - } - } - IF_ADDR_UNLOCK(SC2IFP(vh)); - } - } - } else { - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - ia->ia_ifp == SC2IFP(vh) && - vh->sc_state == MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - CARP_UNLOCK(cif); - return (1); - } - } - } 
- CARP_UNLOCK(cif); - return (0); -} -#endif - -#ifdef INET6 struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { - struct carp_if *cif; - struct carp_softc *vh; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - IF_ADDR_LOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER) { - ifa_ref(ifa); - IF_ADDR_UNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (ifa); - } + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == AF_INET6 && + ifa->ifa_carp->sc_state == MASTER && + IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + return (ifa); } - IF_ADDR_UNLOCK(SC2IFP(vh)); - } - CARP_UNLOCK(cif); - + IF_ADDR_UNLOCK(ifp); + return (NULL); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { - struct m_tag *mtag; - struct carp_if *cif; - struct carp_softc *sc; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { - IF_ADDR_LOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) { - struct ifnet *ifp = SC2IFP(sc); - mtag = m_tag_get(PACKET_TAG_CARP, - sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { - /* better a bit than nothing */ - IF_ADDR_UNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } - bcopy(&ifp, (caddr_t)(mtag + 1), - sizeof(struct ifnet *)); - m_tag_prepend(m, mtag); + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { + struct 
carp_softc *sc = ifa->ifa_carp; + struct m_tag *mtag; - IF_ADDR_UNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } + IF_ADDR_UNLOCK(ifp); + + mtag = m_tag_get(PACKET_TAG_CARP, + sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) + /* Better a bit than nothing. */ + return (LLADDR(&sc->sc_addr)); + + bcopy(&ifp, (caddr_t)(mtag + 1), + sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + return (LLADDR(&sc->sc_addr)); } - IF_ADDR_UNLOCK(SC2IFP(sc)); - } - CARP_UNLOCK(cif); + IF_ADDR_UNLOCK(ifp); return (NULL); } -#endif +#endif /* INET6 */ -struct ifnet * +int carp_forus(struct ifnet *ifp, u_char *dhost) { - struct carp_if *cif; - struct carp_softc *vh; - u_int8_t *ena = dhost; + struct carp_softc *sc; + uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) - return (NULL); + return (0); - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER && - !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) { - CARP_UNLOCK(cif); - return (SC2IFP(vh)); + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), + ETHER_ADDR_LEN)) { + CARP_UNLOCK(sc); + CIF_UNLOCK(ifp->if_carp); + return (1); } + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); - CARP_UNLOCK(cif); - return (NULL); + return (0); } static void @@ -1363,24 +1075,25 @@ carp_master_down(void *v) { struct carp_softc *sc = v; - CARP_SCLOCK(sc); - carp_master_down_locked(sc); - CARP_SCUNLOCK(sc); + CARP_LOCK_ASSERT(sc); + + if (sc->sc_state == BACKUP) { + CARP_LOG("VHID %u@%s: BACKUP -> MASTER (preempting)\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); + carp_master_down_locked(sc); + } + + CARP_UNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc) { - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); + + 
CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { - case INIT: - printf("%s: master_down event in INIT state\n", - SC2IFP(sc)->if_xname); - break; - case MASTER: - break; case BACKUP: carp_set_state(sc, MASTER); carp_send_ad_locked(sc); @@ -1389,9 +1102,18 @@ carp_master_down_locked(struct carp_softc *sc) #endif #ifdef INET6 carp_send_na(sc); -#endif /* INET6 */ +#endif carp_setrun(sc, 0); - carp_setroute(sc, RTM_ADD); + carp_addroute(sc); + break; + case INIT: + case MASTER: +#ifdef INVARIANTS + panic("carp: VHID %u@%s: master_down event in %s state\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname, + sc->sc_state ? "MASTER" : "INIT"); +#endif break; } } @@ -1405,28 +1127,19 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; - if (sc->sc_carpdev == NULL) { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_set_state(sc, INIT); - return; - } else - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); - if (SC2IFP(sc)->if_flags & IFF_UP && - sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) && - sc->sc_carpdev->if_link_state == LINK_STATE_UP) - SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; - else { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_setroute(sc, RTM_DELETE); + if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || + sc->sc_carpdev->if_link_state != LINK_STATE_UP || + (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) return; - } switch (sc->sc_state) { case INIT: - CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname); + CARP_LOG("VHID %u@%s: INIT -> BACKUP\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); - carp_setroute(sc, RTM_DELETE); carp_setrun(sc, 0); break; case BACKUP: @@ -1439,20 +1152,24 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET6 */ +#endif default: +#ifdef INET if 
(sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); +#endif +#ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); +#endif break; } break; @@ -1465,691 +1182,154 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) } } -#ifdef INET -static void -carp_multicast_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip_moptions *imo = &sc->sc_imo; - u_int16_t n = imo->imo_num_memberships; - - /* Clean up our own multicast memberships */ - while (n-- > 0) { - if (imo->imo_membership[n] != NULL) { - if (dofree) - in_delmulti(imo->imo_membership[n]); - imo->imo_membership[n] = NULL; - } - } - KASSERT(imo->imo_mfilters == NULL, - ("%s: imo_mfilters != NULL", __func__)); - imo->imo_num_memberships = 0; - imo->imo_multicast_ifp = NULL; -} -#endif - -#ifdef INET6 -static void -carp_multicast6_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip6_moptions *im6o = &sc->sc_im6o; - u_int16_t n = im6o->im6o_num_memberships; - - while (n-- > 0) { - if (im6o->im6o_membership[n] != NULL) { - if (dofree) - in6_mc_leave(im6o->im6o_membership[n], NULL); - im6o->im6o_membership[n] = NULL; - } - } - KASSERT(im6o->im6o_mfilters == NULL, - ("%s: im6o_mfilters != NULL", __func__)); - im6o->im6o_num_memberships = 0; - im6o->im6o_multicast_ifp = NULL; -} -#endif - -#ifdef INET +/* + * Setup multicast structures. 
+ */ static int -carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) +carp_multicast_setup(struct carp_softc *sc, sa_family_t sa) { - struct ifnet *ifp; - struct carp_if *cif; - struct in_ifaddr *ia, *ia_if; - struct ip_moptions *imo = &sc->sc_imo; - struct in_addr addr; - u_long iaddr = htonl(sin->sin_addr.s_addr); - int own, error; + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + int error = 0; - if (sin->sin_addr.s_addr == 0) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if (sc->sc_naddrs) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } + switch (sa) { +#ifdef INET + case AF_INET: + { + struct ip_moptions *imo = &cif->cif_imo; + struct in_addr addr; - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { - if (!ia_if) - ia_if = ia; - if (sin->sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) - own++; - } - } + if (imo->imo_membership) + return (0); - if (!ia_if) { - IN_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } + imo->imo_membership = (struct in_multi **)malloc( + (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, + M_WAITOK); + imo->imo_mfilters = NULL; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_multicast_vif = -1; - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - - ifp = ia->ia_ifp; - - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } - - if (imo->imo_num_memberships == 0) { addr.s_addr = htonl(INADDR_CARP_GROUP); - if 
((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == - NULL) { - ifa_free(&ia->ia_ifa); - return (ENOBUFS); + if ((error = in_joingroup(ifp, &addr, NULL, + &imo->imo_membership[0])) != 0) { + free(imo->imo_membership, M_CARP); + break; } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; - } - - if (!ifp->if_carp) { - - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; - } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; - } - - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; - - } else { - struct carp_softc *vr; - - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EEXIST; - goto cleanup; - } - } - sc->sc_ia = ia; - sc->sc_carpdev = ifp; - - { /* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; - - /* XXX: cif should not change, right? So we still hold the lock */ - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } - - if (!myself) { - /* We're trying to keep things in order */ - if (after == NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); - } - cif->vhif_nvrs++; - } - } - - sc->sc_naddrs++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); - - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. 
*/ - - return (0); - -cleanup: - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - ifa_free(&ia->ia_ifa); - return (error); -} - -static int -carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) -{ - int error = 0; - - if (!--sc->sc_naddrs) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - struct ip_moptions *imo = &sc->sc_imo; - - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_multicast_ifp = NULL; - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else { - CARP_UNLOCK(cif); - } - } - - return (error); -} + break; + } #endif - #ifdef INET6 -static int -carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) -{ - struct ifnet *ifp; - struct carp_if *cif; - struct in6_ifaddr *ia, *ia_if; - struct ip6_moptions *im6o = &sc->sc_im6o; - struct in6_addr in6; - int own, error; - - error = 0; - - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if (sc->sc_naddrs6) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } - - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN6_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - int i; - - for (i = 0; i < 4; i++) { - if ((sin6->sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != - (ia->ia_addr.sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i])) - break; - } - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (i == 4)) { - if (!ia_if) - ia_if = ia; - 
if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, - &ia->ia_addr.sin6_addr)) - own++; - } - } - - if (!ia_if) { - IN6_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN6_IFADDR_RUNLOCK(); - ifp = ia->ia_ifp; - - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } - - if (!sc->sc_naddrs6) { + case AF_INET6: + { + struct ip6_moptions *im6o = &cif->cif_im6o; + struct in6_addr in6; struct in6_multi *in6m; + if (im6o->im6o_membership) + return (0); + + im6o->im6o_membership = (struct in6_multi **)malloc( + (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, + M_ZERO|M_WAITOK); + im6o->im6o_mfilters = NULL; + im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; - /* join CARP multicast address */ + /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; + } in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; + } im6o->im6o_membership[0] = in6m; im6o->im6o_num_memberships++; - /* join solicited multicast address */ + /* Join solicited multicast address. 
*/ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); - in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; + in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } im6o->im6o_membership[1] = in6m; im6o->im6o_num_memberships++; - } - - if (!ifp->if_carp) { - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; - } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; - } - - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; - - } else { - struct carp_softc *vr; - - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EINVAL; - goto cleanup; - } - } - sc->sc_ia6 = ia; - sc->sc_carpdev = ifp; - - { /* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } - - if (!myself) { - /* We're trying to keep things in order */ - if (after == NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); - } - cif->vhif_nvrs++; - } - } - - sc->sc_naddrs6++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - 
sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); - - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */ - - return (0); - -cleanup: - if (!sc->sc_naddrs6) - carp_multicast6_cleanup(sc, 1); - ifa_free(&ia->ia_ifa); - return (error); -} - -static int -carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) -{ - int error = 0; - - if (!--sc->sc_naddrs6) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - carp_multicast6_cleanup(sc, 1); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - CARP_LOCK_DESTROY(cif); - sc->sc_carpdev->if_carp = NULL; - free(cif, M_CARP); - } else - CARP_UNLOCK(cif); - } - - return (error); -} -#endif /* INET6 */ - -static int -carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) -{ - struct carp_softc *sc = ifp->if_softc, *vr; - struct carpreq carpr; - struct ifaddr *ifa; - struct ifreq *ifr; - struct ifaliasreq *ifra; - int locked = 0, error = 0; - - ifa = (struct ifaddr *)addr; - ifra = (struct ifaliasreq *)addr; - ifr = (struct ifreq *)addr; - - switch (cmd) { - case SIOCSIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - SC2IFP(sc)->if_flags |= IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(ifa->ifa_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } break; - - case SIOCAIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - SC2IFP(sc)->if_flags |= IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); - 
break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCDIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFFLAGS: - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); - } - if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - if (sc->sc_state == MASTER) - carp_send_ad_locked(sc); - carp_set_state(sc, INIT); - carp_setrun(sc, 0); - } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) { - SC2IFP(sc)->if_flags |= IFF_UP; - carp_setrun(sc, 0); - } - break; - - case SIOCSVH: - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error) - break; - if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) - break; - error = 1; - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); - } - if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { - switch (carpr.carpr_state) { - case BACKUP: - callout_stop(&sc->sc_ad_tmo); - carp_set_state(sc, BACKUP); - carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); - break; - case MASTER: - carp_master_down_locked(sc); - break; - default: - break; - } - } - if (carpr.carpr_vhid > 0) { - if (carpr.carpr_vhid > 255) { - error = EINVAL; - break; - } - if (sc->sc_carpdev) { - struct carp_if *cif; - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && - vr->sc_vhid == carpr.carpr_vhid) { - error = EEXIST; - break; - } - if (error == EEXIST) - break; - } - sc->sc_vhid = 
carpr.carpr_vhid; - IF_LLADDR(sc->sc_ifp)[0] = 0; - IF_LLADDR(sc->sc_ifp)[1] = 0; - IF_LLADDR(sc->sc_ifp)[2] = 0x5e; - IF_LLADDR(sc->sc_ifp)[3] = 0; - IF_LLADDR(sc->sc_ifp)[4] = 1; - IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid; - error--; - } - if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { - if (carpr.carpr_advskew >= 255) { - error = EINVAL; - break; - } - if (carpr.carpr_advbase > 255) { - error = EINVAL; - break; - } - sc->sc_advbase = carpr.carpr_advbase; - sc->sc_advskew = carpr.carpr_advskew; - error--; - } - bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); - if (error > 0) - error = EINVAL; - else { - error = 0; - carp_setrun(sc, 0); - } - break; - - case SIOCGVH: - /* XXX: lockless read */ - bzero(&carpr, sizeof(carpr)); - carpr.carpr_state = sc->sc_state; - carpr.carpr_vhid = sc->sc_vhid; - carpr.carpr_advbase = sc->sc_advbase; - carpr.carpr_advskew = sc->sc_advskew; - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error == 0) - bcopy(sc->sc_key, carpr.carpr_key, - sizeof(carpr.carpr_key)); - error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); - break; - - default: - error = EINVAL; - } - - if (locked) - CARP_SCUNLOCK(sc); - - carp_hmac_prepare(sc); - - return (error); -} - -/* - * XXX: this is looutput. We should eventually use it from there. - */ -static int -carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct route *ro) -{ - u_int32_t af; - struct rtentry *rt = NULL; - - M_ASSERTPKTHDR(m); /* check if we have the packet header */ - - if (ro != NULL) - rt = ro->ro_rt; - if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - m_freem(m); - return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); - } - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - /* BPF writes need to be handled specially. 
*/ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - -#if 1 /* XXX */ - switch (dst->sa_family) { - case AF_INET: - case AF_INET6: - case AF_IPX: - case AF_APPLETALK: - break; - default: - printf("carp_looutput: af=%d unexpected\n", dst->sa_family); - m_freem(m); - return (EAFNOSUPPORT); - } + } #endif - return(if_simloop(ifp, m, dst->sa_family, 0)); + } + + return (error); } /* - * Start output on carp interface. This function should never be called. + * Free multicast structures. */ static void -carp_start(struct ifnet *ifp) +carp_multicast_cleanup(struct carp_softc *sc, sa_family_t sa) { -#ifdef DEBUG - printf("%s: start called\n", ifp->if_xname); + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + + switch (sa) { +#ifdef INET + case AF_INET: + if (sc->sc_naddrs == 0) { + struct ip_moptions *imo = &cif->cif_imo; + + in_leavegroup(imo->imo_membership[0], NULL); + KASSERT(imo->imo_mfilters == NULL, + ("%s: imo_mfilters != NULL", __func__)); + free(imo->imo_membership, M_CARP); + imo->imo_membership = NULL; + + } + break; #endif +#ifdef INET6 + case AF_INET6: + if (sc->sc_naddrs6 == 0) { + struct ip6_moptions *im6o = &cif->cif_im6o; + + in6_mc_leave(im6o->im6o_membership[0], NULL); + in6_mc_leave(im6o->im6o_membership[1], NULL); + KASSERT(im6o->im6o_mfilters == NULL, + ("%s: im6o_mfilters != NULL", __func__)); + free(im6o->im6o_membership, M_CARP); + im6o->im6o_membership = NULL; + } + break; +#endif + } } int -carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, - struct rtentry *rt) +carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; - struct ifnet *carp_ifp; if (!sa) return (0); @@ -2158,11 +1338,11 @@ carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, #ifdef INET case AF_INET: break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: break; -#endif /* INET6 */ +#endif default: return 
(0); } @@ -2171,10 +1351,9 @@ carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, if (mtag == NULL) return (0); - bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); - sc = carp_ifp->if_softc; + bcopy(mtag + 1, &sc, sizeof(struct carp_softc *)); - /* Set the source MAC address to Virtual Router MAC Address */ + /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: { @@ -2213,96 +1392,544 @@ carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, } break; default: - printf("%s: carp is not supported for this interface type\n", - ifp->if_xname); + printf("%s: carp is not supported for the %d interface type\n", + ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } -static void -carp_set_state(struct carp_softc *sc, int state) +static struct carp_softc* +carp_alloc(struct ifnet *ifp) { - int link_state; + struct carp_softc *sc; + struct carp_if *cif; - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); - - if (sc->sc_state == state) - return; - - sc->sc_state = state; - switch (state) { - case BACKUP: - link_state = LINK_STATE_DOWN; - break; - case MASTER: - link_state = LINK_STATE_UP; - break; - default: - link_state = LINK_STATE_UNKNOWN; - break; + if ((cif = ifp->if_carp) == NULL) { + cif = carp_alloc_if(ifp); + if (cif == NULL) + return (NULL); } - if_link_state_change(SC2IFP(sc), link_state); + + sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); + + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_vhid = -1; /* required setting */ + sc->sc_init_counter = 1; + sc->sc_state = INIT; + + sc->sc_ifasiz = sizeof(struct ifaddr *); + sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); + sc->sc_carpdev = ifp; + + CARP_LOCK_INIT(sc); +#ifdef INET + callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif +#ifdef INET6 + callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif + callout_init_mtx(&sc->sc_ad_tmo, 
&sc->sc_mtx, CALLOUT_RETURNUNLOCKED); + + CIF_LOCK(cif); + TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); + CIF_UNLOCK(cif); + + mtx_lock(&carp_mtx); + LIST_INSERT_HEAD(&carp_list, sc, sc_next); + mtx_unlock(&carp_mtx); + + return (sc); } -void -carp_carpdev_state(struct ifnet *ifp) +static int +carp_grow_ifas(struct carp_softc *sc) +{ + struct ifaddr **new; + + CARP_LOCK_ASSERT(sc); + + new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO); + if (new == NULL) + return (ENOMEM); + bcopy(sc->sc_ifas, new, sc->sc_ifasiz); + free(sc->sc_ifas, M_CARP); + sc->sc_ifas = new; + sc->sc_ifasiz *= 2; + + return (0); +} + +static void +carp_destroy(struct carp_softc *sc) +{ + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + + CIF_LOCK(cif); + TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); + if (TAILQ_EMPTY(&cif->cif_vrs)) + carp_free_if(cif); + else + CIF_UNLOCK(cif); + + mtx_lock(&carp_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&carp_mtx); + + CARP_LOCK(sc); + callout_drain(&sc->sc_ad_tmo); +#ifdef INET + callout_drain(&sc->sc_md_tmo); +#endif +#ifdef INET6 + callout_drain(&sc->sc_md6_tmo); +#endif + CARP_LOCK_DESTROY(sc); + + free(sc->sc_ifas, M_CARP); + free(sc, M_CARP); +} + +static struct carp_if* +carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; - cif = ifp->if_carp; - CARP_LOCK(cif); - carp_carpdev_state_locked(cif); - CARP_UNLOCK(cif); + cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); + + if (ifpromisc(ifp, 1) != 0) + goto cleanup; + + CIF_LOCK_INIT(cif); + cif->cif_ifp = ifp; + TAILQ_INIT(&cif->cif_vrs); + + IF_ADDR_LOCK(ifp); + ifp->if_carp = cif; + if_ref(ifp); + IF_ADDR_UNLOCK(ifp); + + return (cif); + +cleanup: + free(cif, M_CARP); + + return (NULL); } static void -carp_carpdev_state_locked(struct carp_if *cif) +carp_free_if(struct carp_if *cif) +{ + struct ifnet *ifp = cif->cif_ifp; + + CIF_LOCK_ASSERT(cif); + KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", + __func__)); + + IF_ADDR_LOCK(ifp); + 
ifp->if_carp = NULL; + if_rele(ifp); + IF_ADDR_UNLOCK(ifp); + + CIF_LOCK_DESTROY(cif); + + ifpromisc(ifp, 0); + + free(cif, M_CARP); +} + +static void +carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) +{ + + CARP_LOCK(sc); + carpr->carpr_state = sc->sc_state; + carpr->carpr_vhid = sc->sc_vhid; + carpr->carpr_advbase = sc->sc_advbase; + carpr->carpr_advskew = sc->sc_advskew; + if (priv) + bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); + else + bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); + CARP_UNLOCK(sc); +} + +int +carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) +{ + struct carpreq carpr; + struct ifnet *ifp; + struct carp_softc *sc = NULL; + int error = 0, locked = 0; + + if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + return (error); + + ifp = ifunit_ref(ifr->ifr_name); + if (ifp == NULL) + return (ENXIO); + + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + case IFT_FDDI: + case IFT_ISO88025: + break; + default: + error = EOPNOTSUPP; + goto out; + } + + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + goto out; + } + + switch (cmd) { + case SIOCSVH: + if ((error = priv_check(td, PRIV_NETINET_CARP))) + break; + if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || + carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { + error = EINVAL; + break; + } + + if (ifp->if_carp) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); + } + if (sc == NULL) { + sc = carp_alloc(ifp); + if (sc == NULL) { + error = EINVAL; /* XXX: ifpromisc failed */ + break; + } + + CARP_LOCK(sc); + sc->sc_vhid = carpr.carpr_vhid; + LLADDR(&sc->sc_addr)[0] = 0; + LLADDR(&sc->sc_addr)[1] = 0; + LLADDR(&sc->sc_addr)[2] = 0x5e; + LLADDR(&sc->sc_addr)[3] = 0; + LLADDR(&sc->sc_addr)[4] = 1; + LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; + } else + CARP_LOCK(sc); + locked = 1; + if (carpr.carpr_advbase > 0) { + if 
(carpr.carpr_advbase > 255 || + carpr.carpr_advbase < CARP_DFLTINTV) { + error = EINVAL; + break; + } + sc->sc_advbase = carpr.carpr_advbase; + } + if (carpr.carpr_advskew > 0) { + if (carpr.carpr_advskew >= 255) { + error = EINVAL; + break; + } + sc->sc_advskew = carpr.carpr_advskew; + } + if (carpr.carpr_key[0] != '\0') { + bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); + carp_hmac_prepare(sc); + } + if (sc->sc_state != INIT && + carpr.carpr_state != sc->sc_state) { + switch (carpr.carpr_state) { + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_delroute(sc); + break; + case MASTER: + carp_master_down_locked(sc); + break; + default: + break; + } + } + break; + + case SIOCGVH: + { + int priveleged; + + if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { + error = EINVAL; + break; + } + if (carpr.carpr_count < 1) { + error = EMSGSIZE; + break; + } + if (ifp->if_carp == NULL) { + error = ENOENT; + break; + } + + priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); + if (carpr.carpr_vhid != 0) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); + if (sc == NULL) { + error = ENOENT; + break; + } + carp_carprcp(&carpr, sc, priveleged); + error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + } else { + int i, count; + + count = 0; + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + count++; + + if (count > carpr.carpr_count) { + CIF_UNLOCK(ifp->if_carp); + error = EMSGSIZE; + break; + } + + i = 0; + IFNET_FOREACH_CARP(ifp, sc) { + carp_carprcp(&carpr, sc, priveleged); + carpr.carpr_count = count; + error = copyout(&carpr, ifr->ifr_data + + (i * sizeof(carpr)), sizeof(carpr)); + if (error) { + CIF_UNLOCK(ifp->if_carp); + break; + } + i++; + } + CIF_UNLOCK(ifp->if_carp); + } + break; + } + default: + error = EINVAL; + } + +out: + if (locked) + CARP_UNLOCK(sc); + if_rele(ifp); + + return (error); +} + 
+static int +carp_get_vhid(struct ifaddr *ifa) +{ + + if (ifa == NULL || ifa->ifa_carp == NULL) + return (0); + + return (ifa->ifa_carp->sc_vhid); +} + +int +carp_attach(struct ifaddr *ifa, int vhid) +{ + struct ifnet *ifp = ifa->ifa_ifp; + struct carp_softc *sc; + int index, error; + + if (ifp->if_carp == NULL) + return (ENOPROTOOPT); + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: +#endif +#ifdef INET6 + case AF_INET6: +#endif + break; + default: + return (EPROTOTYPE); + } + + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == vhid) + break; + CIF_UNLOCK(ifp->if_carp); + if (sc == NULL) + return (ENOENT); + + if (ifa->ifa_carp) { + if (ifa->ifa_carp->sc_vhid != vhid) + carp_detach(ifa); + else + return (0); + } + + error = carp_multicast_setup(sc, ifa->ifa_addr->sa_family); + if (error) + return (error); + + CARP_LOCK(sc); + index = sc->sc_naddrs + sc->sc_naddrs6 + 1; + if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) + if ((error = carp_grow_ifas(sc)) != 0) { + carp_multicast_cleanup(sc, + ifa->ifa_addr->sa_family); + CARP_UNLOCK(sc); + return (error); + } + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_naddrs++; + break; +#endif +#ifdef INET6 + case AF_INET6: + sc->sc_naddrs6++; + break; +#endif + } + + ifa_ref(ifa); + sc->sc_ifas[index - 1] = ifa; + ifa->ifa_carp = sc; + + carp_hmac_prepare(sc); + carp_sc_state(sc); + + CARP_UNLOCK(sc); + + return (0); +} + +void +carp_detach(struct ifaddr *ifa) +{ + struct carp_softc *sc = ifa->ifa_carp; + int i, index; + + KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); + + CARP_LOCK(sc); + + /* Shift array. 
*/ + index = sc->sc_naddrs + sc->sc_naddrs6; + for (i = 0; i < index; i++) + if (sc->sc_ifas[i] == ifa) + break; + KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); + for (; i < index - 1; i++) + sc->sc_ifas[i] = sc->sc_ifas[i+1]; + sc->sc_ifas[index - 1] = NULL; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_naddrs--; + break; +#endif +#ifdef INET6 + case AF_INET6: + sc->sc_naddrs6--; + break; +#endif + } + + carp_multicast_cleanup(sc, ifa->ifa_addr->sa_family); + + ifa->ifa_carp = NULL; + ifa_free(ifa); + + carp_hmac_prepare(sc); + carp_sc_state(sc); + + if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) { + CARP_UNLOCK(sc); + carp_destroy(sc); + } else + CARP_UNLOCK(sc); +} + +static void +carp_set_state(struct carp_softc *sc, int state) +{ + + CARP_LOCK_ASSERT(sc); + + if (sc->sc_state != state) { + const char *carp_states[] = { CARP_STATES }; + char subsys[IFNAMSIZ+5]; + + sc->sc_state = state; + + snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, + sc->sc_carpdev->if_xname); + devctl_notify("CARP", subsys, carp_states[state], NULL); + } +} + +static void +carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) - carp_sc_state_locked(sc); + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + carp_sc_state(sc); + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); } static void -carp_sc_state_locked(struct carp_softc *sc) +carp_sc_state(struct carp_softc *sc) { - CARP_SCLOCK_ASSERT(sc); + + CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { - sc->sc_flags_backup = SC2IFP(sc)->if_flags; - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->sc_ad_tmo); +#ifdef INET callout_stop(&sc->sc_md_tmo); +#endif +#ifdef INET6 callout_stop(&sc->sc_md6_tmo); +#endif carp_set_state(sc, INIT); carp_setrun(sc, 0); if (!sc->sc_suppress) { carp_suppress_preempt++; - 
if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_suppress = 1; } else { - SC2IFP(sc)->if_flags |= sc->sc_flags_backup; carp_set_state(sc, INIT); carp_setrun(sc, 0); if (sc->sc_suppress) carp_suppress_preempt--; sc->sc_suppress = 0; } - - return; } + #ifdef INET extern struct domain inetdomain; static struct protosw in_carp_protosw = { @@ -2335,10 +1962,6 @@ static void carp_mod_cleanup(void) { - if (if_detach_event_tag == NULL) - return; - EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); - if_clone_detach(&carp_cloner); #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); @@ -2356,6 +1979,10 @@ carp_mod_cleanup(void) carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif + carp_ioctl_p = NULL; + carp_attach_p = NULL; + carp_detach_p = NULL; + carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; @@ -2367,22 +1994,21 @@ carp_mod_load(void) { int err; - if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, - carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY); - if (if_detach_event_tag == NULL) - return (ENOMEM); mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); - LIST_INIT(&carpif_list); - if_clone_attach(&carp_cloner); - carp_linkstate_p = carp_carpdev_state; + LIST_INIT(&carp_list); + carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; + carp_linkstate_p = carp_linkstate; + carp_ioctl_p = carp_ioctl; + carp_attach_p = carp_attach; + carp_detach_p = carp_detach; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); - if (proto_reg[CARP_INET6] != 0) { + if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); @@ -2398,7 +2024,7 @@ 
carp_mod_load(void) #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); - if (proto_reg[CARP_INET] != 0) { + if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); @@ -2411,7 +2037,7 @@ carp_mod_load(void) return (err); } #endif - return 0; + return (0); } static int @@ -2422,17 +2048,13 @@ carp_modevent(module_t mod, int type, void *data) return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: - /* - * XXX: For now, disallow module unloading by default due to - * a race condition where a thread may dereference one of the - * function pointer hooks after the module has been - * unloaded, during processing of a packet, causing a panic. - */ -#ifdef CARPMOD_CAN_UNLOAD - carp_mod_cleanup(); -#else - return (EBUSY); -#endif + mtx_lock(&carp_mtx); + if (LIST_EMPTY(&carp_list)) + carp_mod_cleanup(); + else { + mtx_unlock(&carp_mtx); + return (EBUSY); + } break; default: diff --git a/sys/netinet/ip_carp.h b/sys/netinet/ip_carp.h index 2f2b4f283a96..d8b82a8d00fc 100644 --- a/sys/netinet/ip_carp.h +++ b/sys/netinet/ip_carp.h @@ -126,10 +126,12 @@ struct carpstats { * Configuration structure for SIOCSVH SIOCGVH */ struct carpreq { + int carpr_count; + int carpr_vhid; +#define CARP_MAXVHID 255 int carpr_state; #define CARP_STATES "INIT", "BACKUP", "MASTER" #define CARP_MAXSTATE 2 - int carpr_vhid; int carpr_advskew; int carpr_advbase; unsigned char carpr_key[CARP_KEY_LEN]; @@ -144,8 +146,7 @@ struct carpreq { #define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */ #define CARPCTL_LOG 3 /* log bad packets */ #define CARPCTL_STATS 4 /* statistics (read-only) */ -#define CARPCTL_ARPBALANCE 5 /* balance arp responses */ -#define CARPCTL_MAXID 6 +#define CARPCTL_MAXID 5 #define CARPCTL_NAMES { \ { 0, 0 }, \ @@ -153,33 +154,37 @@ struct carpreq { { "preempt", CTLTYPE_INT }, \ { "log", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ - { "arpbalance", 
CTLTYPE_INT }, \ } #ifdef _KERNEL -void carp_carpdev_state(struct ifnet *); -void carp_input (struct mbuf *, int); -int carp6_input (struct mbuf **, int *, int); -int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int carp_ioctl(struct ifreq *, u_long, struct thread *); +int carp_attach(struct ifaddr *, int); +void carp_detach(struct ifaddr *); +void carp_carpdev_state(struct ifnet *); +void carp_input (struct mbuf *, int); +int carp6_input (struct mbuf **, int *, int); +int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *); +int carp_iamatch(struct ifaddr *, uint8_t **); struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *); caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *); -struct ifnet *carp_forus (struct ifnet *, u_char *); +int carp_forus(struct ifnet *, u_char *); /* These are external networking stack hooks for CARP */ /* net/if.c */ +extern int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +extern int (*carp_attach_p)(struct ifaddr *, int); +extern void (*carp_detach_p)(struct ifaddr *); extern void (*carp_linkstate_p)(struct ifnet *); /* net/if_bridge.c net/if_ethersubr.c */ -extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *); +extern int (*carp_forus_p)(struct ifnet *, u_char *); /* net/if_ethersubr.c */ extern int (*carp_output_p)(struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *); + struct sockaddr *); +/* net/rtsock.c */ +extern int (*carp_get_vhid_p)(struct ifaddr *); #ifdef INET /* netinet/if_ether.c */ -extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, - struct in_addr *, u_int8_t **); +extern int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 /* netinet6/nd6_nbr.c */ diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 8514f73ffe4f..ef22f03d7171 100644 --- a/sys/netinet6/in6.c +++ 
b/sys/netinet6/in6.c @@ -95,6 +95,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -272,6 +273,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; + int carp_attached = 0; int error; switch (cmd) { @@ -652,6 +654,18 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, break; } + if (ifra->ifra_vhid > 0) { + if (carp_attach_p != NULL) + error = (*carp_attach_p)(&ia->ia_ifa, + ifra->ifra_vhid); + else + error = EPROTONOSUPPORT; + if (error) + goto out; + else + carp_attached = 1; + } + /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but @@ -695,9 +709,14 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, * nd6_prelist_add will install the corresponding * interface route. */ - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) + if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { + if (carp_attached) + (*carp_detach_p)(&ia->ia_ifa); goto out; + } if (pr == NULL) { + if (carp_attached) + (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; @@ -1301,6 +1320,9 @@ in6_purgeaddr(struct ifaddr *ifa) struct rtentry *rt; struct ifaddr *ifa0, *nifa; + if (ifa->ifa_carp) + (*carp_detach_p)(ifa); + /* * find another IPv6 address as the gateway for the * link-local and node-local all-nodes multicast diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c index 2ff7455fe4da..1dae2c023d42 100644 --- a/sys/netinet6/in6_ifattach.c +++ b/sys/netinet6/in6_ifattach.c @@ -705,7 +705,6 @@ in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: - case IFT_CARP: return; } diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h index 00342fde7621..3e93e7182a8a 100644 --- a/sys/netinet6/in6_var.h +++ b/sys/netinet6/in6_var.h 
@@ -287,6 +287,7 @@ struct in6_aliasreq { struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; + int ifra_vhid; }; /* prefix type macro */ diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index 875daa081bff..5d6f8c0d7b92 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -2172,9 +2172,6 @@ nd6_need_cache(struct ifnet *ifp) #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: -#endif -#ifdef IFT_CARP - case IFT_CARP: #endif case IFT_INFINIBAND: case IFT_GIF: /* XXX need more cases? */ diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index 0221c72b44e7..a2aaeeacb8fd 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -225,7 +225,7 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) /* (1) and (3) check. */ if (ifp->if_carp) ifa = (*carp_iamatch6_p)(ifp, &taddr6); - if (ifa == NULL) + else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ @@ -688,7 +688,14 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } - ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); + /* + * This effectively disables the DAD check on a non-master CARP + * address. + */ + if (ifp->if_carp) + ifa = (*carp_iamatch6_p)(ifp, &taddr6); + else + ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. @@ -1132,9 +1139,6 @@ nd6_ifptomac(struct ifnet *ifp) #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: -#endif -#ifdef IFT_CARP - case IFT_CARP: #endif case IFT_INFINIBAND: case IFT_BRIDGE: diff --git a/sys/sys/param.h b/sys/sys/param.h index 1ce8e353d5a8..5e9b4e5cd907 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. 
*/ #undef __FreeBSD_version -#define __FreeBSD_version 1000002 /* Master, propagated to newvers */ +#define __FreeBSD_version 1000003 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,