From adc56f5a383771f594829b7db9c263b6f0dcf1bd Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Mon, 2 Dec 2019 20:58:04 +0000 Subject: [PATCH] Make use of the stats(3) framework in the TCP stack. This makes it possible to retrieve per-connection statistical information such as the receive window size, RTT, or goodput, using a newly added TCP_STATS getsockopt(2) option, and extract them using the stats_voistat_fetch(3) API. See the net/tcprtt port for an example consumer of this API. Compared to the existing TCP_INFO system, the main differences are that this mechanism is easy to extend without breaking ABI, and provides statistical information instead of raw "snapshots" of values at a given point in time. stats(3) is more generic and can be used in both userland and the kernel. Reviewed by: thj Tested by: thj Obtained from: Netflix Relnotes: yes Sponsored by: Klara Inc, Netflix Differential Revision: https://reviews.freebsd.org/D20655 --- lib/libstats/Makefile | 4 +- share/man/man4/tcp.4 | 18 ++- sys/conf/files | 1 + sys/netinet/cc/cc.h | 7 +- sys/netinet/tcp.h | 15 +++ sys/netinet/tcp_input.c | 50 +++++++ sys/netinet/tcp_log_buf.c | 5 +- sys/netinet/tcp_output.c | 32 ++++- sys/netinet/tcp_stats.c | 274 ++++++++++++++++++++++++++++++++++++++ sys/netinet/tcp_subr.c | 15 +++ sys/netinet/tcp_usrreq.c | 91 +++++++++++++ sys/netinet/tcp_var.h | 18 ++- sys/sys/stats.h | 3 + 13 files changed, 523 insertions(+), 10 deletions(-) create mode 100644 sys/netinet/tcp_stats.c diff --git a/lib/libstats/Makefile b/lib/libstats/Makefile index da7ec10c2b0b..89e10aa64c5f 100644 --- a/lib/libstats/Makefile +++ b/lib/libstats/Makefile @@ -3,12 +3,12 @@ LIB= stats SHLIBDIR?= /lib SHLIB_MAJOR= 0 -SRCS= subr_stats.c +SRCS= subr_stats.c tcp_stats.c # To debug, comment WITHOUT_ASSERT_DEBUG= and uncomment CFLAGS:= WITHOUT_ASSERT_DEBUG= #CFLAGS:=${CFLAGS:C/-O[0-9]/-O0 -g3/} -DDIAGNOSTIC -.PATH: ${.CURDIR}/../../sys/kern 
${.CURDIR}/../../sys/netinet .include diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index f6f0c59b96d8..b1a4e1205383 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd December 1, 2019 +.Dd December 2, 2019 .Dt TCP 4 .Os .Sh NAME @@ -291,6 +291,10 @@ This entry can only be specified on a per-host basis at this time. .Pp If an SADB entry cannot be found for the destination, the system does not send any outgoing segments and drops any inbound segments. +.It Dv TCP_STATS +Manage collection of connection level statistics using the +.Xr stats 3 +framework. .Pp Each dropped segment is taken into account in the TCP protocol statistics. .It Dv TCP_TXTLS_ENABLE @@ -664,6 +668,17 @@ Default is false. When initializing the TCP timestamps, use a per connection offset instead of a per host pair offset. Default is to use per connection offsets as recommended in RFC 7323. +.It Va perconn_stats_enable +Controls the default collection of statistics for all connections using the +.Xr stats 3 +framework. +0 disables, 1 enables, 2 enables random sampling across log id connection +groups with all connections in a group receiving the same setting. +.It Va perconn_stats_sample_rates +A CSV list of template_spec=percent key-value pairs which controls the per +template sampling rates when +.Xr stats 3 +sampling is enabled. 
.El .Sh ERRORS A socket operation may fail with one of the following errors returned: @@ -703,6 +718,7 @@ when trying to use a TCP function block that is not available; .Sh SEE ALSO .Xr getsockopt 2 , .Xr socket 2 , +.Xr stats 3 , .Xr sysctl 3 , .Xr blackhole 4 , .Xr inet 4 , diff --git a/sys/conf/files b/sys/conf/files index cf240886a672..874081813809 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4295,6 +4295,7 @@ netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \ compile-with "${NORMAL_C} ${NO_WNONNULL}" netinet/tcp_reass.c optional inet | inet6 netinet/tcp_sack.c optional inet | inet6 +netinet/tcp_stats.c optional stats inet | stats inet6 netinet/tcp_subr.c optional inet | inet6 netinet/tcp_syncache.c optional inet | inet6 netinet/tcp_timer.c optional inet | inet6 diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h index b1a69ca1eaaa..fec61ebe18e4 100644 --- a/sys/netinet/cc/cc.h +++ b/sys/netinet/cc/cc.h @@ -51,9 +51,7 @@ #ifndef _NETINET_CC_CC_H_ #define _NETINET_CC_CC_H_ -#if !defined(_KERNEL) -#error "no user-serviceable parts inside" -#endif +#ifdef _KERNEL /* Global CC vars. */ extern STAILQ_HEAD(cc_head, cc_algo) cc_list; @@ -108,6 +106,7 @@ struct cc_var { #define CC_DUPACK 0x0002 /* Duplicate ACK. */ #define CC_PARTIALACK 0x0004 /* Not yet. */ #define CC_SACK 0x0008 /* Not yet. */ +#endif /* _KERNEL */ /* * Congestion signal types passed to the cong_signal() hook. The highest order 8 @@ -121,6 +120,7 @@ struct cc_var { #define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ +#ifdef _KERNEL /* * Structure to hold data and function pointers that together represent a * congestion control algorithm. 
@@ -184,4 +184,5 @@ extern struct rwlock cc_list_lock; #define CC_ALGOOPT_LIMIT 2048 +#endif /* _KERNEL */ #endif /* _NETINET_CC_CC_H_ */ diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 125cacb28350..4e06c97920d6 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -168,6 +168,7 @@ struct tcphdr { #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ +#define TCP_STATS 33 /* retrieve stats blob structure */ #define TCP_LOG 34 /* configure event logging for connection */ #define TCP_LOGBUF 35 /* retrieve event log for connection */ #define TCP_LOGID 36 /* configure log ID to correlate connections */ @@ -364,4 +365,18 @@ struct tcp_function_set { */ #define TLS_SET_RECORD_TYPE 1 +/* + * TCP specific variables of interest for tp->t_stats stats(9) accounting. + */ +#define VOI_TCP_TXPB 0 /* Transmit payload bytes */ +#define VOI_TCP_RETXPB 1 /* Retransmit payload bytes */ +#define VOI_TCP_FRWIN 2 /* Foreign receive window */ +#define VOI_TCP_LCWIN 3 /* Local congestion window */ +#define VOI_TCP_RTT 4 /* Round trip time */ +#define VOI_TCP_CSIG 5 /* Congestion signal */ +#define VOI_TCP_GPUT 6 /* Goodput */ +#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */ +#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */ +#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */ + #endif /* !_NETINET_TCP_H_ */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index e9f9997bf2ad..370fd93f6b7f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include "opt_tcpdebug.h" #include +#include #include #ifdef TCP_HHOOK #include @@ -66,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include #include /* for proc0 declaration */ #include +#include #include #include #include @@ -73,6 +75,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include /* before tcp_seq.h, for 
tcp_random18() */ @@ -298,6 +301,10 @@ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { +#ifdef STATS + int32_t gput; +#endif + INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; @@ -310,6 +317,35 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { +#ifdef STATS + stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, + ((int32_t)tp->snd_cwnd) - tp->snd_wnd); + if (!IN_RECOVERY(tp->t_flags)) + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, + tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs)); + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GEQ(th->th_ack, tp->gput_ack)) { + /* + * Compute goodput in bits per millisecond. + */ + gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) / + max(1, tcp_ts_getticks() - tp->gput_ts); + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + gput); + /* + * XXXLAS: This is a temporary hack, and should be + * chained off VOI_TCP_GPUT when stats(9) grows an API + * to deal with chained VOIs. 
+ */ + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_stats_gput_prev = gput; + } +#endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tcp_maxseg(tp)); @@ -328,6 +364,9 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } +#ifdef STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); +#endif } void @@ -393,6 +432,10 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); +#endif + switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { @@ -1496,6 +1539,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; +#ifdef STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); +#endif /* * TCP ECN processing. 
@@ -3359,6 +3405,10 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, + imax(0, rtt * 1000 / hz)); +#endif if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index 48d8067625c5..81d5ee67a3aa 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -30,10 +30,12 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include +#include #include #include #include @@ -41,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -475,7 +478,7 @@ tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) INP_WLOCK_ASSERT(tp->t_inpcb); -#ifdef NETFLIX +#ifdef STATS if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index f6eff70f3611..1fc8e9ace04f 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #ifdef TCP_HHOOK #include @@ -54,10 +55,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include +#include #include #include @@ -991,15 +994,31 @@ send: struct sockbuf *msb; u_int moff; - if ((tp->t_flags & TF_FORCEDATA) && len == 1) + if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); - else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { +#ifdef STATS + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + stats_voi_update_abs_u32(tp->t_stats, + VOI_TCP_RETXPB, len); + else + stats_voi_update_abs_u64(tp->t_stats, + VOI_TCP_TXPB, len); +#endif /* STATS */ + } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); 
TCPSTAT_ADD(tcps_sndrexmitbyte, len); +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, + len); +#endif /* STATS */ } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); +#ifdef STATS + stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, + len); +#endif /* STATS */ } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) @@ -1472,6 +1491,15 @@ out: tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } +#ifdef STATS + if (!(tp->t_flags & TF_GPUTINPROG) && len) { + tp->t_flags |= TF_GPUTINPROG; + tp->gput_seq = startseq; + tp->gput_ack = startseq + + ulmin(sbavail(&so->so_snd) - off, sendwin); + tp->gput_ts = tcp_ts_getticks(); + } +#endif /* STATS */ } /* diff --git a/sys/netinet/tcp_stats.c b/sys/netinet/tcp_stats.c new file mode 100644 index 000000000000..fc937321c518 --- /dev/null +++ b/sys/netinet/tcp_stats.c @@ -0,0 +1,274 @@ +/*- + * Copyright (c) 2016-2018 Netflix, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Author: Lawrence Stewart + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#endif +#include + +#include + +#include +#include +#include +#include + +#include + +VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1; + +#ifndef _KERNEL +#define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) +#define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) +#else /* _KERNEL */ + +VNET_DEFINE(int, tcp_perconn_stats_enable) = 2; +VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates); +VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0; +#define V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates) +#define V_tcp_stats_nrates VNET(tcp_stats_nrates) + +static struct rmlock tcp_stats_tpl_sampling_lock; +static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action, + struct stats_tpl_sample_rate **rates, int *nrates, void *ctx); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_perconn_stats_enable), 0, + "Enable per-connection TCP stats gathering; 1 enables for all connections, " + "2 enables random sampling across log id connection groups"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates, + CTLTYPE_STRING | CTLFLAG_RW, tcp_stats_tpl_sr_cb, + sizeof(struct 
rm_priotracker), stats_tpl_sample_rates, "A", + "TCP stats per template random sampling rates, in CSV tpl_spec=percent " + "key-value pairs (see stats(9) for template spec details)"); +#endif /* _KERNEL */ + +#ifdef _KERNEL +int +#else +static int +/* Ensure all templates are also added to the userland template list. */ +__attribute__ ((constructor)) +#endif +tcp_stats_init() +{ + int err, lasterr; + + err = lasterr = 0; + + V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0); + if (V_tcp_perconn_stats_dflt_tpl < 0) + return (-V_tcp_perconn_stats_dflt_tpl); + + struct voistatspec vss_sum[] = { + STATS_VSS_SUM(), + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64, + NVSS(vss_sum), vss_sum, 0); + lasterr = err ? err : lasterr; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32, + NVSS(vss_sum), vss_sum, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_max[] = { + STATS_VSS_MAX(), + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG, + NVSS(vss_max), vss_max, 0); + lasterr = err ? err : lasterr; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG, + NVSS(vss_max), vss_max, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_rtt[] = { + STATS_VSS_MAX(), + STATS_VSS_MIN(), + STATS_VSS_TDGSTCLUST32(20, 4), + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32, + NVSS(vss_rtt), vss_rtt, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_congsig[] = { + STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO), + DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0) + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32, + NVSS(vss_congsig), vss_congsig, 0); + lasterr = err ? 
err : lasterr; + + struct voistatspec vss_gput[] = { + STATS_VSS_MAX(), + STATS_VSS_TDGSTCLUST32(20, 4), + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32, + NVSS(vss_gput), vss_gput, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_gput_nd[] = { + STATS_VSS_TDGSTCLUST32(10, 4), + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32, + NVSS(vss_gput_nd), vss_gput_nd, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_windiff[] = { + STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF) + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32, + NVSS(vss_windiff), vss_windiff, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_acklen[] = { + STATS_VSS_MAX(), + STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF) + }; + err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32, + NVSS(vss_acklen), vss_acklen, 0); + lasterr = err ? err : lasterr; + + return (lasterr); +} + +#ifdef _KERNEL +int +tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, + size_t seed_len) +{ + struct rm_priotracker tracker; + int tpl; + + tpl = -1; + + if (V_tcp_stats_nrates > 0) { + rm_rlock(&tcp_stats_tpl_sampling_lock, &tracker); + tpl = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates, + V_tcp_stats_nrates, seed_bytes, seed_len); + rm_runlock(&tcp_stats_tpl_sampling_lock, &tracker); + + if (tpl >= 0) { + INP_WLOCK_ASSERT(tp->t_inpcb); + if (tp->t_stats != NULL) + stats_blob_destroy(tp->t_stats); + tp->t_stats = stats_blob_alloc(tpl, 0); + if (tp->t_stats == NULL) + tpl = -ENOMEM; + } + } + + return (tpl); +} + +/* + * Callback function for stats_tpl_sample_rates() to interact with the TCP + * subsystem's stats template sample rates list. 
+ */ +int +tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action, + struct stats_tpl_sample_rate **rates, int *nrates, void *ctx) +{ + struct stats_tpl_sample_rate *old_rates; + int old_nrates; + + if (ctx == NULL) + return (ENOMEM); + + switch (action) { + case TPL_SR_RLOCKED_GET: + /* + * Return with rlock held i.e. this call must be paired with a + * "action == TPL_SR_RUNLOCK" call. + */ + rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED); + rm_rlock(&tcp_stats_tpl_sampling_lock, + (struct rm_priotracker *)ctx); + /* FALLTHROUGH */ + case TPL_SR_UNLOCKED_GET: + if (rates != NULL) + *rates = V_tcp_perconn_stats_sample_rates; + if (nrates != NULL) + *nrates = V_tcp_stats_nrates; + break; + case TPL_SR_RUNLOCK: + rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED); + rm_runlock(&tcp_stats_tpl_sampling_lock, + (struct rm_priotracker *)ctx); + break; + case TPL_SR_PUT: + KASSERT(rates != NULL && nrates != NULL, + ("%s: PUT without new rates", __func__)); + rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED); + if (rates == NULL || nrates == NULL) + return (EINVAL); + rm_wlock(&tcp_stats_tpl_sampling_lock); + old_rates = V_tcp_perconn_stats_sample_rates; + old_nrates = V_tcp_stats_nrates; + V_tcp_perconn_stats_sample_rates = *rates; + V_tcp_stats_nrates = *nrates; + rm_wunlock(&tcp_stats_tpl_sampling_lock); + *rates = old_rates; + *nrates = old_nrates; + break; + default: + return (EINVAL); + break; + } + + return (0); +} + +RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock, + "tcp_stats_tpl_sampling_lock"); +#endif /* _KERNEL */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 70e4243dbaf5..6a4051d3793c 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #ifdef TCP_HHOOK @@ -54,6 +55,8 @@ __FBSDID("$FreeBSD$"); #ifdef KERN_TLS #include #endif +#include +#include #include #include #include @@ -1004,6 +1007,11 @@ tcp_init(void) 
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); +#endif +#ifdef STATS + if (tcp_stats_init()) + printf("%s: WARNING: unable to initialise TCP stats\n", + __func__); #endif hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); @@ -1694,6 +1702,10 @@ tcp_newtcpcb(struct inpcb *inp) if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } +#ifdef STATS + if (V_tcp_perconn_stats_enable == 1) + tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); +#endif return (tp); /* XXX */ } @@ -1912,6 +1924,9 @@ tcp_discardcb(struct tcpcb *tp) #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); #endif +#ifdef STATS + stats_blob_destroy(tp->t_stats); +#endif CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 77341d176e48..8c4f0185772e 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -49,11 +49,13 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include +#include #include #include #ifdef INET6 @@ -65,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef DDB #include @@ -108,6 +111,13 @@ __FBSDID("$FreeBSD$"); #endif #include +#include +#include +#include +#include +#include +#include + /* * TCP protocol interface to socket abstraction. 
*/ @@ -1816,6 +1826,9 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp #endif struct cc_algo *algo; char *pbuf, buf[TCP_LOG_ID_LEN]; +#ifdef STATS + struct statsblob *sbp; +#endif size_t len; /* @@ -1933,6 +1946,35 @@ unlock_and_done: error = EINVAL; break; + case TCP_STATS: + INP_WUNLOCK(inp); +#ifdef STATS + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + if (optval > 0) + sbp = stats_blob_alloc( + V_tcp_perconn_stats_dflt_tpl, 0); + else + sbp = NULL; + + INP_WLOCK_RECHECK(inp); + if ((tp->t_stats != NULL && sbp == NULL) || + (tp->t_stats == NULL && sbp != NULL)) { + struct statsblob *t = tp->t_stats; + tp->t_stats = sbp; + sbp = t; + } + INP_WUNLOCK(inp); + + stats_blob_destroy(sbp); +#else + return (EOPNOTSUPP); +#endif /* !STATS */ + break; + case TCP_CONGESTION: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); @@ -2217,6 +2259,55 @@ unlock_and_done: INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; + case TCP_STATS: + { +#ifdef STATS + int nheld; + TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; + + error = 0; + socklen_t outsbsz = sopt->sopt_valsize; + if (tp->t_stats == NULL) + error = ENOENT; + else if (outsbsz >= tp->t_stats->cursz) + outsbsz = tp->t_stats->cursz; + else if (outsbsz >= sizeof(struct statsblob)) + outsbsz = sizeof(struct statsblob); + else + error = EINVAL; + INP_WUNLOCK(inp); + if (error) + break; + + sbp = sopt->sopt_val; + nheld = atop(round_page(((vm_offset_t)sbp) + + (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp)); + vm_page_t ma[nheld]; + if (vm_fault_quick_hold_pages( + &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, + outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, + nheld) < 0) { + error = EFAULT; + break; + } + + if ((error = copyin_nofault(&(sbp->flags), &sbflags, + SIZEOF_MEMBER(struct statsblob, flags)))) + goto unhold; + + INP_WLOCK_RECHECK(inp); + error = stats_blob_snapshot(&sbp, outsbsz, 
tp->t_stats, + sbflags | SB_CLONE_USRDSTNOFAULT); + INP_WUNLOCK(inp); + sopt->sopt_valsize = outsbsz; +unhold: + vm_page_unhold_pages(ma, nheld); +#else + INP_WUNLOCK(inp); + error = EOPNOTSUPP; +#endif /* !STATS */ + break; + } case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 42f13c083b58..8ad3b2f9a483 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -210,7 +210,12 @@ struct tcpcb { struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ + struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ + uint32_t gput_ts; /* Time goodput measurement started */ + tcp_seq gput_seq; /* Outbound measurement seq */ + tcp_seq gput_ack; /* Inbound measurement ack */ + int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { @@ -327,7 +332,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function); #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */ #define TF_UNUSED1 0x00004000 /* unused */ -#define TF_UNUSED2 0x00008000 /* unused */ +#define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ @@ -787,6 +792,10 @@ VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); +#ifdef STATS +VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); +VNET_DECLARE(int, tcp_perconn_stats_enable); +#endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); 
VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); @@ -823,6 +832,10 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) +#ifdef STATS +#define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) +#define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) +#endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) @@ -966,10 +979,13 @@ int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); +int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, + size_t seed_len); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); +int tcp_stats_init(void); static inline void tcp_fields_to_host(struct tcphdr *th) diff --git a/sys/sys/stats.h b/sys/sys/stats.h index 30b1073cfa99..8f1c8edee5aa 100644 --- a/sys/sys/stats.h +++ b/sys/sys/stats.h @@ -58,6 +58,9 @@ #define _SYS_STATS_H_ #include +#ifdef DIAGNOSTIC +#include +#endif #ifndef _KERNEL /*