/* $OpenBSD: tcp_usrreq.c,v 1.231 2024/04/12 16:07:09 bluhm Exp $ */
|
|
/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
|
|
|
|
/*
|
|
* Copyright (c) 1982, 1986, 1988, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
|
|
*
|
|
* NRL grants permission for redistribution and use in source and binary
|
|
* forms, with or without modification, of the software and documentation
|
|
* created at NRL provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgements:
|
|
* This product includes software developed by the University of
|
|
* California, Berkeley and its contributors.
|
|
* This product includes software developed at the Information
|
|
* Technology Division, US Naval Research Laboratory.
|
|
* 4. Neither the name of the NRL nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
|
|
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* The views and conclusions contained in the software and documentation
|
|
* are those of the authors and should not be interpreted as representing
|
|
* official policies, either expressed or implied, of the US Naval
|
|
* Research Laboratory (NRL).
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/mbuf.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/socketvar.h>
|
|
#include <sys/protosw.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/sysctl.h>
|
|
#include <sys/domain.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/pool.h>
|
|
#include <sys/proc.h>
|
|
|
|
#include <net/if.h>
|
|
#include <net/if_var.h>
|
|
#include <net/route.h>
|
|
|
|
#include <netinet/in.h>
|
|
#include <netinet/in_var.h>
|
|
#include <netinet/ip.h>
|
|
#include <netinet/in_pcb.h>
|
|
#include <netinet/ip_var.h>
|
|
#include <netinet6/ip6_var.h>
|
|
#include <netinet/tcp.h>
|
|
#include <netinet/tcp_fsm.h>
|
|
#include <netinet/tcp_seq.h>
|
|
#include <netinet/tcp_timer.h>
|
|
#include <netinet/tcp_var.h>
|
|
#include <netinet/tcp_debug.h>
|
|
|
|
#ifdef INET6
|
|
#include <netinet6/in6_var.h>
|
|
#endif
|
|
|
|
/*
 * Default socket buffer reservations for TCP sockets; the macros may be
 * overridden at build time, the variables at run time via sysctl.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;		/* send buffer watermark */
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;		/* receive buffer watermark */
u_int	tcp_autorcvbuf_inc = 16 * 1024;		/* receive buffer auto-grow step */
|
|
|
|
/* User-request dispatch table for TCP over IPv4. */
const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
|
|
|
|
#ifdef INET6
|
|
const struct pr_usrreqs tcp6_usrreqs = {
|
|
.pru_attach = tcp_attach,
|
|
.pru_detach = tcp_detach,
|
|
.pru_bind = tcp_bind,
|
|
.pru_listen = tcp_listen,
|
|
.pru_connect = tcp_connect,
|
|
.pru_accept = tcp_accept,
|
|
.pru_disconnect = tcp_disconnect,
|
|
.pru_shutdown = tcp_shutdown,
|
|
.pru_rcvd = tcp_rcvd,
|
|
.pru_send = tcp_send,
|
|
.pru_abort = tcp_abort,
|
|
.pru_sense = tcp_sense,
|
|
.pru_rcvoob = tcp_rcvoob,
|
|
.pru_sendoob = tcp_sendoob,
|
|
.pru_control = in6_control,
|
|
.pru_sockaddr = tcp_sockaddr,
|
|
.pru_peeraddr = tcp_peeraddr,
|
|
};
|
|
#endif
|
|
|
|
/*
 * Integer sysctl variables with their permitted [min, max] ranges,
 * handled generically by sysctl_bounded_arr().
 */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
	{ TCPCTL_TSO, &tcp_do_tso, 0, 1 },
};
|
|
|
|
/* Global TCP protocol control block tables, one per address family. */
struct inpcbtable tcbtable;
#ifdef INET6
struct inpcbtable tcb6table;
#endif

int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);
|
static inline int
|
|
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
|
|
/*
|
|
* When a TCP is attached to a socket, then there will be
|
|
* a (struct inpcb) pointed at by the socket, and this
|
|
* structure will point at a subsidiary (struct tcpcb).
|
|
*/
|
|
if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
|
|
if (so->so_error)
|
|
return so->so_error;
|
|
return EINVAL;
|
|
}
|
|
|
|
*rinp = inp;
|
|
*rtp = tp;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information.  Sequence numbers are reported
 * relative to the initial sequence number.  Fills the caller-supplied
 * mbuf; returns 0 on success or ENOMEM if a cluster cannot be
 * allocated.  Privileged callers get the full structure; unprivileged
 * callers only the first, non-sensitive part.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000;		/* msec => usec */
	uint64_t now;

	/* struct tcp_info may be larger than a plain mbuf; use a cluster. */
	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = tcp_now();

	/* Options count as negotiated only when both sides agreed. */
	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	/* Event ages, converted from the tcp_now() msec clock to usec. */
	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	/* Smoothed RTT and variance are stored scaled; unscale for export. */
	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	/* Socket buffer occupancy and watermarks. */
	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;

	return 0;
}
|
|
|
|
/*
 * Handle TCP-level socket options (get/set).  Options at other levels
 * are passed down to the IP layer matching the socket's address family.
 * Returns 0 on success or an errno value.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
#ifdef INET6
		if (ISSET(inp->inp_flags, INP_IPV6))
			error = ip6_ctloutput(op, so, level, optname, m);
		else
#endif
			error = ip_ctloutput(op, so, level, optname, m);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				/* Push out data that was held back. */
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* The MSS may only be lowered, never raised. */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* SACK cannot be toggled once the handshake is done. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* SACK and TCP MD5 signatures are mutually exclusive. */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				/* Signatures force SACK off; see above. */
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
|
|
|
|
/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 *
 * Returns 0 on success, EISCONN if a pcb already exists, or an
 * allocation error from soreserve()/in_pcballoc()/tcp_newtcpcb().
 */
int
tcp_attach(struct socket *so, int proto, int wait)
{
	struct inpcbtable *table;
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* Reserve buffer space unless already adequately reserved. */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	/* Pick the pcb table matching the socket's address family. */
#ifdef INET6
	if (so->so_proto->pr_domain->dom_family == PF_INET6)
		table = &tcb6table;
	else
#endif
		table = &tcbtable;
	error = in_pcballoc(so, table, wait);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, wait);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	if (ISSET(inp->inp_flags, INP_IPV6))
		tp->pf = PF_INET6;
	else
#endif
		tp->pf = PF_INET;
	/* SO_LINGER with zero time gets the default linger interval. */
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}
|
|
|
|
/*
 * Detach TCP from the socket, tearing down or draining the connection
 * as appropriate.  Always returns 0.
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	/* Snapshot state for tracing; tp may be freed below. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (0);
}
|
|
|
|
/*
|
|
* Give the socket an address.
|
|
*/
|
|
int
|
|
tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
int error;
|
|
short ostate;
|
|
|
|
soassertlocked(so);
|
|
|
|
if ((error = tcp_sogetpcb(so, &inp, &tp)))
|
|
return (error);
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
ostate = tp->t_state;
|
|
|
|
error = in_pcbbind(inp, nam, p);
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
|
|
return (error);
|
|
}
|
|
|
|
/*
 * Prepare to accept connections.  Binds an ephemeral port if the
 * socket is not yet bound, then enters the LISTEN state.
 */
int
tcp_listen(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* An unbound listener gets a kernel-chosen local port. */
	if (inp->inp_lport == 0)
		if ((error = in_pcbbind(inp, NULL, curproc)))
			goto out;

	/*
	 * If the in_pcbbind() above is called, the tp->pf
	 * should still be whatever it was before.
	 */
	tp->t_state = TCPS_LISTEN;

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
	return (error);
}
|
|
|
|
/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
int
tcp_connect(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* Reject destinations that can never be a valid TCP peer. */
#ifdef INET6
	if (ISSET(inp->inp_flags, INP_IPV6)) {
		struct sockaddr_in6 *sin6;

		if ((error = in6_nam2sin6(nam, &sin6)))
			goto out;
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
			error = EINVAL;
			goto out;
		}
	} else
#endif
	{
		struct sockaddr_in *sin;

		if ((error = in_nam2sin(nam, &sin)))
			goto out;
		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    IN_MULTICAST(sin->sin_addr.s_addr) ||
		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
			error = EINVAL;
			goto out;
		}
	}
	error = in_pcbconnect(inp, nam);
	if (error)
		goto out;

	/* Build the prototype header; undo the connect if it fails. */
	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		in_pcbunset_faddr(inp);
		in_pcbdisconnect(inp);
		error = ENOBUFS;
		goto out;
	}

	so->so_state |= SS_CONNECTOUT;

	/* Compute window scaling to request.  */
	tcp_rscale(tp, sb_max);

	soisconnecting(so);
	tcpstat_inc(tcps_connattempt);
	tp->t_state = TCPS_SYN_SENT;
	/* Bound the time spent waiting for the handshake to complete. */
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcp_set_iss_tsm(tp);
	tcp_sendseqinit(tp);
	tp->snd_last = tp->snd_una;
	error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
	return (error);
}
|
|
|
|
/*
|
|
* Accept a connection. Essentially all the work is done at higher
|
|
* levels; just return the address of the peer, storing through addr.
|
|
*/
|
|
int
|
|
tcp_accept(struct socket *so, struct mbuf *nam)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
int error;
|
|
|
|
soassertlocked(so);
|
|
|
|
if ((error = tcp_sogetpcb(so, &inp, &tp)))
|
|
return (error);
|
|
|
|
in_setpeeraddr(inp, nam);
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0);
|
|
return (0);
|
|
}
|
|
|
|
/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
int
tcp_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	/* Snapshot state for tracing; tp may be freed below. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
	return (0);
}
|
|
|
|
/*
 * Mark the connection as being incapable of further output.
 * Advances the state machine via tcp_usrclosed() and sends a FIN
 * if appropriate.  A second shutdown is a no-op returning 0.
 */
int
tcp_shutdown(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	/* error is 0 on success; reused below for tcp_output(). */
	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* Already shut down for sending: nothing to do. */
	if (so->so_snd.sb_state & SS_CANTSENDMORE)
		goto out;

	socantsendmore(so);
	tp = tcp_usrclosed(tp);
	if (tp)
		error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
	return (error);
}
|
|
|
|
/*
 * After a receive, possibly send window update to peer.
 */
void
tcp_rcvd(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/*
	 * soreceive() calls this function when a user receives
	 * ancillary data on a listening socket. We don't call
	 * tcp_output in such a case, since there is no header
	 * template for a listening socket and hence the kernel
	 * will panic.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
		(void) tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
}
|
|
|
|
/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 *
 * Ownership of both m and control passes to this function; they are
 * consumed or freed on all paths.  Control mbufs with data are
 * rejected with EINVAL.
 */
int
tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	/* TCP carries no ancillary data. */
	if (control && control->m_len) {
		error = EINVAL;
		goto out;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto out;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* The send buffer takes over the chain; don't free it below. */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;

	error = tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}
|
|
|
|
/*
 * Abort the TCP connection by dropping it with ECONNABORTED.
 */
void
tcp_abort(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	/* Snapshot state for tracing; tp may be freed by tcp_drop(). */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_drop(tp, ECONNABORTED);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
}
|
|
|
|
int
|
|
tcp_sense(struct socket *so, struct stat *ub)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
int error;
|
|
|
|
soassertlocked(so);
|
|
|
|
if ((error = tcp_sogetpcb(so, &inp, &tp)))
|
|
return (error);
|
|
|
|
ub->st_blksize = so->so_snd.sb_hiwat;
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
|
|
return (0);
|
|
}
|
|
|
|
/*
 * Receive the single byte of out-of-band data, if any.  Fails with
 * EINVAL when there is no urgent data pending (or it is delivered
 * inline or was already consumed) and EWOULDBLOCK when the urgent
 * byte has not yet arrived.  Unless MSG_PEEK is set, the byte is
 * consumed.
 */
int
tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	/* Consuming the byte flips HAVEDATA off and HADDATA on. */
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
	return (error);
}
|
|
|
|
/*
 * Send out-of-band (urgent) data.  The data is appended to the send
 * buffer and the urgent pointer set past it; a forced output is then
 * attempted.  Ownership of m and control passes to this function.
 */
int
tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	/* TCP carries no ancillary data. */
	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto release;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* Allow a little slack past the buffer limit for urgent data. */
	if (sbspace(so, &so->so_snd) < -512) {
		error = ENOBUFS;
		goto out;
	}

	/*
	 * According to RFC961 (Assigned Protocols),
	 * the urgent pointer points to the last octet
	 * of urgent data.  We continue, however,
	 * to consider it to indicate the first octet
	 * of data past the urgent section.
	 * Otherwise, snd_up should be one lower.
	 */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;
	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	/* t_force makes tcp_output() send even a zero-window probe. */
	tp->t_force = 1;
	error = tcp_output(tp);
	tp->t_force = 0;

out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);

release:
	m_freem(control);
	m_freem(m);

	return (error);
}
|
|
|
|
int
|
|
tcp_sockaddr(struct socket *so, struct mbuf *nam)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
int error;
|
|
|
|
soassertlocked(so);
|
|
|
|
if ((error = tcp_sogetpcb(so, &inp, &tp)))
|
|
return (error);
|
|
|
|
in_setsockaddr(inp, nam);
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
|
|
PRU_SOCKADDR, 0);
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
tcp_peeraddr(struct socket *so, struct mbuf *nam)
|
|
{
|
|
struct inpcb *inp;
|
|
struct tcpcb *tp;
|
|
int error;
|
|
|
|
soassertlocked(so);
|
|
|
|
if ((error = tcp_sogetpcb(so, &inp, &tp)))
|
|
return (error);
|
|
|
|
in_setpeeraddr(inp, nam);
|
|
|
|
if (so->so_options & SO_DEBUG)
|
|
tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0);
|
|
return (0);
|
|
}
|
|
|
|
/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 *
 * Returns the (possibly new) tcpcb pointer; NULL when the pcb has
 * been freed by tcp_close()/tcp_drop().  Callers must not use the
 * old pointer afterwards.
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		/* Graceful close: discard unread input, send FIN. */
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}
|
|
|
|
/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 *
 * Returns the tcpcb, or NULL if tcp_close() freed it.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		/* No established peer state: tear down immediately. */
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}
|
|
|
|
/*
 * Look up a socket for ident or tcpdrop, ...
 *
 * Sysctl back end: the connection is identified by a
 * struct tcp_ident_mapping copied in from userland.  With dodrop set
 * the matching connection is aborted (tcpdrop); otherwise the owning
 * ruid/euid are copied back out (ident).  Returns 0 or an errno.
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/* Validate the sysctl buffers and fetch the mapping request. */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	/* Extract the foreign/local endpoints for the address family. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* Find the fully-specified connection. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcb6table, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* Abort the connection, but never a listening socket. */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		in_pcbunref(inp);
		return (error);
	}

	/* Fall back to a listening socket bound to the local endpoint. */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/* Only report owners of locally-initiated connections. */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	in_pcbunref(inp);
	return (error);
}
|
|
|
|
/*
 * Fill a struct tcpstat from the tcp event counters and a snapshot of
 * the active SYN cache, then export it read-only via sysctl_rdstruct().
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

	/*
	 * ASSIGN() copies the next raw counter into the named tcpstat
	 * field.  The order of the ASSIGN() lines below must match the
	 * order of the tcps_* counter enum exactly.
	 */
#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters), NULL);
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	/*
	 * The SYN cache gauges are not event counters; overwrite the
	 * tcps_sc_* snapshot fields from the currently active cache set,
	 * consistently under the SYN cache mutex.
	 */
	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	/* report the length of the longest hash bucket */
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}
|
|
|
|
/*
|
|
* Sysctl for tcp variables.
|
|
*/
|
|
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_KEEPINITTIME:
		/*
		 * tcptv_keep_init is kept in TCP_TIME() units internally;
		 * convert to/from seconds for userland.  Bounded to at
		 * most three times the compile-time default.
		 */
		NET_LOCK();
		nval = tcptv_keep_init / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
		if (!error)
			tcptv_keep_init = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPIDLE:
		/* Same seconds <-> TCP_TIME() conversion as above. */
		NET_LOCK();
		nval = tcp_keepidle / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
		if (!error)
			tcp_keepidle = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPINTVL:
		/* Same seconds <-> TCP_TIME() conversion as above. */
		NET_LOCK();
		nval = tcp_keepintvl / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
		if (!error)
			tcp_keepintvl = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_BADDYNAMIC:
		/* Export or replace the baddynamicports.tcp table whole. */
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		/* Writes are refused once the securelevel has been raised. */
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		/* Look up connection owner (ident); dodrop == 0. */
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		/* Drop the specified connection; dodrop == 1. */
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_REASS_LIMIT:
		/*
		 * The reassembly queue limit is enforced by the pool hard
		 * limit; only commit the new value if the pool accepts it.
		 */
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		/* Same pool-hard-limit handling as TCPCTL_REASS_LIMIT. */
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		/* Read-only statistics; needs no net lock here. */
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit, 0, INT_MAX);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also clamp the remaining use count in
			 * both cache sets so a lowered limit takes effect
			 * immediately.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && nval != tcp_syn_hash_size) {
			/*
			 * If the global hash size has been changed, switch
			 * sets as soon as possible (by exhausting their use
			 * count).  Then the actual hash array will be
			 * reallocated with the new size.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	default:
		/* Remaining simple integer variables share a bounds table. */
		NET_LOCK();
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}
|
|
|
|
/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window: if the
 * receiver stops acking data, the window will shrink and therefore the
 * buffer size will shrink as well.
 * In low memory situations try to shrink the buffer to the initial size,
 * disabling the send buffer scaling as long as the situation persists.
 */
|
|
void
|
|
tcp_update_sndspace(struct tcpcb *tp)
|
|
{
|
|
struct socket *so = tp->t_inpcb->inp_socket;
|
|
u_long nmax = so->so_snd.sb_hiwat;
|
|
|
|
if (sbchecklowmem()) {
|
|
/* low on memory try to get rid of some */
|
|
if (tcp_sendspace < nmax)
|
|
nmax = tcp_sendspace;
|
|
} else if (so->so_snd.sb_wat != tcp_sendspace)
|
|
/* user requested buffer size, auto-scaling disabled */
|
|
nmax = so->so_snd.sb_wat;
|
|
else
|
|
/* automatic buffer scaling */
|
|
nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
|
|
tp->snd_una);
|
|
|
|
/* a writable socket must be preserved because of poll(2) semantics */
|
|
if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
|
|
if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
|
|
nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
|
|
/* keep in sync with sbreserve() calculation */
|
|
if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
|
|
nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
|
|
}
|
|
|
|
/* round to MSS boundary */
|
|
nmax = roundup(nmax, tp->t_maxseg);
|
|
|
|
if (nmax != so->so_snd.sb_hiwat)
|
|
sbreserve(so, &so->so_snd, nmax);
|
|
}
|
|
|
|
/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a big part of the recv buffer was
 * transferred during that time, we increase the buffer by a constant.
 * In low memory situations try to shrink the buffer to the initial size.
 */
|
|
void
|
|
tcp_update_rcvspace(struct tcpcb *tp)
|
|
{
|
|
struct socket *so = tp->t_inpcb->inp_socket;
|
|
u_long nmax = so->so_rcv.sb_hiwat;
|
|
|
|
if (sbchecklowmem()) {
|
|
/* low on memory try to get rid of some */
|
|
if (tcp_recvspace < nmax)
|
|
nmax = tcp_recvspace;
|
|
} else if (so->so_rcv.sb_wat != tcp_recvspace)
|
|
/* user requested buffer size, auto-scaling disabled */
|
|
nmax = so->so_rcv.sb_wat;
|
|
else {
|
|
/* automatic buffer scaling */
|
|
if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
|
|
nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
|
|
tcp_autorcvbuf_inc);
|
|
}
|
|
|
|
/* a readable socket must be preserved because of poll(2) semantics */
|
|
if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
|
|
nmax < so->so_snd.sb_lowat)
|
|
nmax = so->so_snd.sb_lowat;
|
|
|
|
if (nmax == so->so_rcv.sb_hiwat)
|
|
return;
|
|
|
|
/* round to MSS boundary */
|
|
nmax = roundup(nmax, tp->t_maxseg);
|
|
sbreserve(so, &so->so_rcv, nmax);
|
|
}
|