From 2da066ef6d85d3f7cd8aaec14369d66254836536 Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 2 May 2024 16:28:16 -0700
Subject: [PATCH] libnvmf: Add internal library to support NVMe over Fabrics

libnvmf provides APIs for transmitting and receiving Command and Response capsules along with data associated with NVMe commands. Capsules are represented by 'struct nvmf_capsule' objects. Capsules are transmitted and received on queue pairs represented by 'struct nvmf_qpair' objects. Queue pairs belong to an association represented by a 'struct nvmf_association' object.

libnvmf provides additional helper APIs to assist with constructing command capsules for a host, response capsules for a controller, connecting queue pairs to a remote controller and optionally offloading connected queues to an in-kernel host, accepting queue pair connections from remote hosts and optionally offloading connected queues to an in-kernel controller, constructing controller data structures for local controllers, etc. libnvmf also includes an internal transport abstraction as well as an implementation of a userspace TCP transport.

libnvmf is primarily intended for ease of use and low-traffic use cases such as establishing connections that are handed off to the kernel. As such, it uses a simple API built on blocking I/O.

For a host, a consumer first populates a 'struct nvmf_association_params' with a set of parameters shared by all queue pairs for a single association (such as whether to use SQ flow control and header and data digests) and creates a 'struct nvmf_association' object. The consumer is responsible for establishing a TCP socket for each queue pair. This socket is included in the 'struct nvmf_qpair_params' passed to 'nvmf_connect' to complete transport-specific negotiation, send a Fabrics Connect command, and wait for the Connect reply. Upon success, a new 'struct nvmf_qpair' object is returned. This queue pair can then be used to send and receive capsules. A command capsule is allocated, populated with an SQE and optional data buffer, and transmitted via nvmf_host_transmit_command. The consumer can then wait for a reply via nvmf_host_wait_for_response. The library also provides some wrapper functions such as nvmf_read_property and nvmf_write_property which send a command and wait for a response synchronously.

For a controller, a consumer uses a single association for a set of incoming connections. A consumer can choose to use multiple associations (e.g. a separate association for connections to a discovery controller listening on a different port than I/O controllers). The consumer is responsible for accepting TCP sockets directly, but once a socket has been accepted it is passed to nvmf_accept to perform transport-specific negotiation and wait for the Connect command. Similar to nvmf_connect, nvmf_accept returns a newly constructed nvmf_qpair. However, in contrast to nvmf_connect, nvmf_accept does not complete the Fabrics negotiation: the consumer must explicitly send a response capsule before waiting for additional command capsules to arrive. In particular, in the kernel offload case, the Connect command and data are provided to the kernel controller and the Connect response capsule is sent by the kernel once it is ready to handle the new queue pair.

For userspace controller command handling, the consumer uses nvmf_controller_receive_capsule to wait for a command capsule. nvmf_receive_controller_data is used to retrieve any data from a command (e.g. the data for a WRITE command).
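As a rough illustration (not part of this patch; error handling and buffer management are simplified, and the handler name and WRITE-only policy are hypothetical), a userspace controller loop built on these APIs could look like:

	static void
	handle_one_command(struct nvmf_qpair *qp)
	{
		struct nvmf_capsule *nc;
		const struct nvme_command *cmd;
		size_t len;
		void *buf;

		/* Wait for the next validated command capsule. */
		if (nvmf_controller_receive_capsule(qp, &nc) != 0)
			return;
		cmd = nvmf_capsule_sqe(nc);
		len = nvmf_capsule_data_len(nc);
		if (cmd->opc == NVME_OPC_WRITE && len != 0) {
			buf = malloc(len);
			/* Pull the WRITE payload (in-capsule data or via R2T). */
			if (nvmf_receive_controller_data(nc, 0, buf, len) == 0)
				nvmf_send_success(nc);
			else
				nvmf_send_generic_error(nc,
				    NVME_SC_INTERNAL_DEVICE_ERROR);
			free(buf);
		} else
			nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
	}

Here the entire payload is fetched with a single call to nvmf_receive_controller_data.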
It can be called multiple times to split the data transfer into smaller sizes. nvmf_send_controller_data is used to send data to a remote host in response to a command. It also sends a response capsule indicating success, or an error if an internal error occurs. nvmf_send_response is used to send a response without associated data. There are also several convenience wrappers such as nvmf_send_success and nvmf_send_generic_error.

Reviewed by: imp
Sponsored by: Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D44710
---
 lib/Makefile | 1 +
 lib/libnvmf/Makefile | 22 +
 lib/libnvmf/internal.h | 116 +++
 lib/libnvmf/libnvmf.h | 363 ++++
 lib/libnvmf/nvmf_controller.c | 463 +++++++++++
 lib/libnvmf/nvmf_host.c | 911 ++++++++++++++++++++
 lib/libnvmf/nvmf_tcp.c | 1474 +++++++++++++++++++++++++++++++++
 lib/libnvmf/nvmf_transport.c | 269 ++++++
 share/mk/src.libnames.mk | 4 +
 9 files changed, 3623 insertions(+)
 create mode 100644 lib/libnvmf/Makefile
 create mode 100644 lib/libnvmf/internal.h
 create mode 100644 lib/libnvmf/libnvmf.h
 create mode 100644 lib/libnvmf/nvmf_controller.c
 create mode 100644 lib/libnvmf/nvmf_host.c
 create mode 100644 lib/libnvmf/nvmf_tcp.c
 create mode 100644 lib/libnvmf/nvmf_transport.c

diff --git a/lib/Makefile b/lib/Makefile index 6135cff10c15..5696fa4aa593 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -78,6 +78,7 @@ SUBDIR= ${SUBDIR_BOOTSTRAP} \ libnetbsd \ libnetmap \ libnv \ + libnvmf \ libopenbsd \ libpam \ libpathconv \ diff --git a/lib/libnvmf/Makefile b/lib/libnvmf/Makefile new file mode 100644 index 000000000000..dbba6b476510 --- /dev/null +++ b/lib/libnvmf/Makefile @@ -0,0 +1,22 @@ +.PATH: ${SRCTOP}/sys/dev/nvmf/controller +.PATH: ${SRCTOP}/sys/libkern + +LIB= nvmf +INTERNALLIB= +PACKAGE= nvmf + +INCS= libnvmf.h + +SRCS= gsb_crc32.c \ nvmf_controller.c \ nvmf_host.c \ nvmf_tcp.c \ nvmf_transport.c \ nvmft_subr.c + +CFLAGS+= -I${SRCTOP}/sys/dev/nvmf/controller +CFLAGS+= -I${SRCTOP}/sys/dev/nvmf + +.include <bsd.lib.mk> + +CWARNFLAGS.gsb_crc32.c= -Wno-cast-align diff --git a/lib/libnvmf/internal.h b/lib/libnvmf/internal.h new file mode 100644 index 000000000000..cf45c15ba2f0 --- /dev/null +++ b/lib/libnvmf/internal.h @@ -0,0 +1,116 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. + * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#ifndef __LIBNVMF_INTERNAL_H__ +#define __LIBNVMF_INTERNAL_H__ + +#include <sys/queue.h> + +struct nvmf_transport_ops { + /* Association management. */ + struct nvmf_association *(*allocate_association)(bool controller, + const struct nvmf_association_params *params); + void (*update_association)(struct nvmf_association *na, + const struct nvme_controller_data *cdata); + void (*free_association)(struct nvmf_association *na); + + /* Queue pair management. */ + struct nvmf_qpair *(*allocate_qpair)(struct nvmf_association *na, + const struct nvmf_qpair_params *params); + void (*free_qpair)(struct nvmf_qpair *qp); + + /* Create params for kernel handoff. */ + int (*kernel_handoff_params)(struct nvmf_qpair *qp, + struct nvmf_handoff_qpair_params *qparams); + + /* Capsule operations. */ + struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp); + void (*free_capsule)(struct nvmf_capsule *nc); + int (*transmit_capsule)(struct nvmf_capsule *nc); + int (*receive_capsule)(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp); + uint8_t (*validate_command_capsule)(const struct nvmf_capsule *nc); + + /* Transferring controller data.
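These callbacks back the public nvmf_receive_controller_data and nvmf_send_controller_data APIs (e.g. issuing R2T/C2H transfers for TCP).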
*/ + size_t (*capsule_data_len)(const struct nvmf_capsule *nc); + int (*receive_controller_data)(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len); + int (*send_controller_data)(const struct nvmf_capsule *nc, + const void *buf, size_t len); +}; + +struct nvmf_association { + struct nvmf_transport_ops *na_ops; + enum nvmf_trtype na_trtype; + bool na_controller; + + struct nvmf_association_params na_params; + + /* Each qpair holds a reference on an association. */ + u_int na_refs; + + char *na_last_error; +}; + +struct nvmf_qpair { + struct nvmf_association *nq_association; + bool nq_admin; + + uint16_t nq_cid; /* host only */ + + /* + * Queue sizes. This assumes the same size for both the + * completion and submission queues within a pair. + */ + u_int nq_qsize; + + /* Flow control management for submission queues. */ + bool nq_flow_control; + uint16_t nq_sqhd; + uint16_t nq_sqtail; /* host only */ + + /* Value in response to/from CONNECT. */ + uint16_t nq_cntlid; + + uint32_t nq_kato; /* valid on admin queue only */ + + TAILQ_HEAD(, nvmf_capsule) nq_rx_capsules; +}; + +struct nvmf_capsule { + struct nvmf_qpair *nc_qpair; + + /* Either a SQE or CQE. */ + union { + struct nvme_command nc_sqe; + struct nvme_completion nc_cqe; + }; + int nc_qe_len; + + /* + * Is SQHD in received capsule valid? False for locally- + * synthesized responses. + */ + bool nc_sqhd_valid; + + /* Data buffer. */ + bool nc_send_data; + void *nc_data; + size_t nc_data_len; + + TAILQ_ENTRY(nvmf_capsule) nc_link; +}; + +extern struct nvmf_transport_ops tcp_ops; + +void na_clear_error(struct nvmf_association *na); +void na_error(struct nvmf_association *na, const char *fmt, ...); + +int nvmf_kernel_handoff_params(struct nvmf_qpair *qp, + struct nvmf_handoff_qpair_params *qparams); + +#endif /* !__LIBNVMF_INTERNAL_H__ */ diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h new file mode 100644 index 000000000000..f15277a02621 --- /dev/null +++ b/lib/libnvmf/libnvmf.h @@ -0,0 +1,363 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#ifndef __LIBNVMF_H__ +#define __LIBNVMF_H__ + +#include +#include +#include +#include +#include +#include + +struct nvmf_capsule; +struct nvmf_association; +struct nvmf_qpair; + +/* + * Parameters shared by all queue-pairs of an association. Note that + * this contains the requested values used to initiate transport + * negotiation. + */ +struct nvmf_association_params { + bool sq_flow_control; /* SQ flow control required. */ + bool dynamic_controller_model; /* Controller only */ + uint16_t max_admin_qsize; /* Controller only */ + uint32_t max_io_qsize; /* Controller only, 0 for discovery */ + union { + struct { + uint8_t pda; /* Tx-side PDA. */ + bool header_digests; + bool data_digests; + uint32_t maxr2t; /* Host only */ + uint32_t maxh2cdata; /* Controller only */ + } tcp; + }; +}; + +/* Parameters specific to a single queue pair of an association. */ +struct nvmf_qpair_params { + bool admin; /* Host only */ + union { + struct { + int fd; + } tcp; + }; +}; + +/* Transport-independent APIs. */ + +/* + * A host should allocate a new association for each association with + * a controller. After the admin queue has been allocated and the + * controller's data has been fetched, it should be passed to + * nvmf_update_association to update internal transport-specific + * parameters before allocating I/O queues. 
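(For example, a transport might size its in-capsule data limits from the advertised IOCCSZ.)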
+ * + * A controller uses a single association to manage all incoming + * queues since it is not known until after parsing the CONNECT + * command which transport queues are admin vs I/O and which + * controller they are created against. + */ +struct nvmf_association *nvmf_allocate_association(enum nvmf_trtype trtype, + bool controller, const struct nvmf_association_params *params); +void nvmf_update_assocation(struct nvmf_association *na, + const struct nvme_controller_data *cdata); +void nvmf_free_association(struct nvmf_association *na); + +/* The most recent association-wide error message. */ +const char *nvmf_association_error(const struct nvmf_association *na); + +/* + * A queue pair represents either an Admin or I/O + * submission/completion queue pair. + * + * Each open qpair holds a reference on its association. Once queue + * pairs are allocated, callers can safely free the association to + * ease bookkeeping. + * + * If nvmf_allocate_qpair fails, a detailed error message can be obtained + * from nvmf_association_error. + */ +struct nvmf_qpair *nvmf_allocate_qpair(struct nvmf_association *na, + const struct nvmf_qpair_params *params); +void nvmf_free_qpair(struct nvmf_qpair *qp); + +/* + * Capsules are either commands (host -> controller) or responses + * (controller -> host). A single data buffer segment may be + * associated with a command capsule. Transmitted data is not copied + * by this API but instead must be preserved until the capsule is + * transmitted and freed. + */ +struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp, + const void *sqe); +struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp, + const void *cqe); +void nvmf_free_capsule(struct nvmf_capsule *nc); +int nvmf_capsule_append_data(struct nvmf_capsule *nc, + void *buf, size_t len, bool send); +int nvmf_transmit_capsule(struct nvmf_capsule *nc); +int nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp); +const void *nvmf_capsule_sqe(const struct nvmf_capsule *nc); +const void *nvmf_capsule_cqe(const struct nvmf_capsule *nc); + +/* Return a string name for a transport type. */ +const char *nvmf_transport_type(uint8_t trtype); + +/* Validate a NVMe Qualified Name. */ +bool nvmf_nqn_valid(const char *nqn); + +/* Controller-specific APIs. */ + +/* + * A controller calls this function to check for any + * transport-specific errors (invalid fields) in a received command + * capsule. The callback returns a generic command status value: + * NVME_SC_SUCCESS if no error is found. + */ +uint8_t nvmf_validate_command_capsule(const struct nvmf_capsule *nc); + +/* + * A controller calls this function to query the amount of data + * associated with a command capsule. + */ +size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc); + +/* + * A controller calls this function to receive data associated with a + * command capsule (e.g. the data for a WRITE command). This can + * either return in-capsule data or fetch data from the host + * (e.g. using a R2T PDU over TCP). The received command capsule + * should be passed in 'nc'. The received data is stored in '*buf'. + */ +int nvmf_receive_controller_data(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len); + +/* + * A controller calls this function to send data in response to a + * command along with a response capsule. If the data transfer + * succeeds, a success response is sent. If the data transfer fails, + * an appropriate error status capsule is sent. 
Regardless, a + * response capsule is always sent. + */ +int nvmf_send_controller_data(const struct nvmf_capsule *nc, + const void *buf, size_t len); + +/* + * Construct a CQE for a reply to a command capsule in 'nc' with the + * completion status 'status'. This is useful when additional CQE + * info is required beyond the completion status. + */ +void nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, + uint16_t status); + +/* + * Construct and send a response capsule to a command capsule with + * the supplied CQE. + */ +int nvmf_send_response(const struct nvmf_capsule *nc, const void *cqe); + +/* + * Wait for a single command capsule and return it in *ncp. This can + * fail if an invalid capsule is received or an I/O error occurs. + */ +int nvmf_controller_receive_capsule(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp); + +/* Send a response capsule from a controller. */ +int nvmf_controller_transmit_response(struct nvmf_capsule *nc); + +/* Construct and send an error response capsule. */ +int nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, + uint8_t sc_status); + +/* + * Construct and send an error response capsule using a generic status + * code. + */ +int nvmf_send_generic_error(const struct nvmf_capsule *nc, + uint8_t sc_status); + +/* Construct and send a simple success response capsule. */ +int nvmf_send_success(const struct nvmf_capsule *nc); + +/* + * Allocate a new queue pair and wait for the CONNECT command capsule. + * If this fails, a detailed error message can be obtained from + * nvmf_association_error. On success, the command capsule is saved + * in '*ccp' and the connect data is saved in 'data'. The caller + * must send an explicit response and free the command capsule. + */ +struct nvmf_qpair *nvmf_accept(struct nvmf_association *na, + const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp, + struct nvmf_fabric_connect_data *data); + +/* + * Construct and send a response capsule with the Fabrics CONNECT + * invalid parameters error status. If data is true the offset is + * relative to the CONNECT data structure, otherwise the offset is + * relative to the SQE. + */ +void nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, + bool data, uint16_t offset); + +/* Construct and send a response capsule for a successful CONNECT. */ +int nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid); + +/* Compute the initial state of CAP for a controller. */ +uint64_t nvmf_controller_cap(struct nvmf_qpair *qp); + +/* Generate a serial number string from a host ID. */ +void nvmf_controller_serial(char *buf, size_t len, u_long hostid); + +/* + * Populate an Identify Controller data structure for a Discovery + * controller. + */ +void nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata); + +/* + * Populate an Identify Controller data structure for an I/O + * controller. + */ +void nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, + const char *subnqn, int nn, uint32_t ioccsz, + struct nvme_controller_data *cdata); + +/* + * Validate whether a new value for CC is legal given the existing values + * of CAP and CC. + */ +bool nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, + uint32_t new_cc); + +/* Return the log page id (LID) of a GET_LOG_PAGE command. */ +uint8_t nvmf_get_log_page_id(const struct nvme_command *cmd); + +/* Return the requested data length of a GET_LOG_PAGE command.
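The 0's based NUMD dword count is split across CDW10 and CDW11 and is converted to a byte count.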
*/ +uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd); + +/* Return the requested data offset of a GET_LOG_PAGE command. */ +uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd); + +/* Prepare to hand off a controller qpair. */ +int nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, + struct nvmf_handoff_controller_qpair *h); + +/* Host-specific APIs. */ + +/* + * Connect to an admin or I/O queue. If this fails, a detailed error + * message can be obtained from nvmf_association_error. + */ +struct nvmf_qpair *nvmf_connect(struct nvmf_association *na, + const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, + const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, + const char *hostnqn, uint32_t kato); + +/* Return the CNTLID for a queue returned from CONNECT. */ +uint16_t nvmf_cntlid(struct nvmf_qpair *qp); + +/* + * Send a command to the controller. This can fail with EBUSY if the + * submission queue is full. + */ +int nvmf_host_transmit_command(struct nvmf_capsule *nc); + +/* + * Wait for a response to a command. If there are no outstanding + * commands in the SQ, fails with EWOULDBLOCK. + */ +int nvmf_host_receive_response(struct nvmf_qpair *qp, + struct nvmf_capsule **rcp); + +/* + * Wait for a response to a specific command. The command must have been + * successfully sent previously. + */ +int nvmf_host_wait_for_response(struct nvmf_capsule *cc, + struct nvmf_capsule **rcp); + +/* Build a KeepAlive command. */ +struct nvmf_capsule *nvmf_keepalive(struct nvmf_qpair *qp); + +/* Read a controller property. */ +int nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t *value); + +/* Write a controller property. */ +int nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, + uint8_t size, uint64_t value); + +/* Construct a 16-byte HostId from kern.hostuuid. */ +int nvmf_hostid_from_hostuuid(uint8_t hostid[16]); + +/* Construct an NQN from kern.hostuuid. */ +int nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]); + +/* Fetch controller data via IDENTIFY. */ +int nvmf_host_identify_controller(struct nvmf_qpair *qp, + struct nvme_controller_data *data); + +/* Fetch namespace data via IDENTIFY. */ +int nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, + struct nvme_namespace_data *nsdata); + +/* + * Fetch discovery log page. The memory for the log page is allocated + * by malloc() and returned in *logp. The caller must free the + * memory. + */ +int nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, + struct nvme_discovery_log **logp); + +/* + * Request a desired number of I/O queues via SET_FEATURES. The + * number of actual I/O queues available is returned in *actual on + * success. + */ +int nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, + u_int *actual); + +/* + * Hand off an active host association to the kernel. This frees the + * qpairs (even on error). + */ +int nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata); + +/* + * Disconnect an active host association previously handed off to the + * kernel. *host is either the name of the device (nvmeX) for this + * association or the remote subsystem NQN. + */ +int nvmf_disconnect_host(const char *host); + +/* + * Disconnect all active host associations previously handed off to + * the kernel.
+ */ +int nvmf_disconnect_all(void); + +/* + * Fetch reconnect parameters from an existing kernel host to use for + * establishing a new association. + */ +int nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams); + +/* + * Handoff active host association to an existing host in the kernel. + * This frees the qpairs (even on error). + */ +int nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, + u_int num_queues, struct nvmf_qpair **io_queues, + const struct nvme_controller_data *cdata); + +#endif /* !__LIBNVMF_H__ */ diff --git a/lib/libnvmf/nvmf_controller.c b/lib/libnvmf/nvmf_controller.c new file mode 100644 index 000000000000..554e5e769ded --- /dev/null +++ b/lib/libnvmf/nvmf_controller.c @@ -0,0 +1,463 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" +#include "nvmft_subr.h" + +void +nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status) +{ + struct nvme_completion *cpl = cqe; + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + memset(cpl, 0, sizeof(*cpl)); + cpl->cid = cmd->cid; + cpl->status = htole16(status); +} + +static struct nvmf_capsule * +nvmf_simple_response(const struct nvmf_capsule *nc, uint8_t sc_type, + uint8_t sc_status) +{ + struct nvme_completion cpl; + uint16_t status; + + status = NVMEF(NVME_STATUS_SCT, sc_type) | + NVMEF(NVME_STATUS_SC, sc_status); + nvmf_init_cqe(&cpl, nc, status); + return (nvmf_allocate_response(nc->nc_qpair, &cpl)); +} + +int +nvmf_controller_receive_capsule(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + int error; + uint8_t sc_status; + + *ncp = NULL; + error = nvmf_receive_capsule(qp, &nc); + if (error != 0) + return (error); + + sc_status = nvmf_validate_command_capsule(nc); + if (sc_status != NVME_SC_SUCCESS) { + nvmf_send_generic_error(nc, sc_status); + nvmf_free_capsule(nc); + return (EPROTO); + } + + *ncp = nc; + return (0); +} + +int +nvmf_controller_transmit_response(struct nvmf_capsule *nc) +{ + struct nvmf_qpair *qp = nc->nc_qpair; + + /* Set SQHD. 
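When SQ flow control is enabled the head advances by one for each transmitted response so the host can reclaim submission queue slots; otherwise SQHD is reported as zero.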
*/ + if (qp->nq_flow_control) { + qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; + nc->nc_cqe.sqhd = htole16(qp->nq_sqhd); + } else + nc->nc_cqe.sqhd = 0; + + return (nvmf_transmit_capsule(nc)); +} + +int +nvmf_send_response(const struct nvmf_capsule *cc, const void *cqe) +{ + struct nvmf_capsule *rc; + int error; + + rc = nvmf_allocate_response(cc->nc_qpair, cqe); + if (rc == NULL) + return (ENOMEM); + error = nvmf_controller_transmit_response(rc); + nvmf_free_capsule(rc); + return (error); +} + +int +nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, + uint8_t sc_status) +{ + struct nvmf_capsule *rc; + int error; + + rc = nvmf_simple_response(cc, sc_type, sc_status); + error = nvmf_controller_transmit_response(rc); + nvmf_free_capsule(rc); + return (error); +} + +int +nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status) +{ + return (nvmf_send_error(nc, NVME_SCT_GENERIC, sc_status)); +} + +int +nvmf_send_success(const struct nvmf_capsule *nc) +{ + return (nvmf_send_generic_error(nc, NVME_SC_SUCCESS)); +} + +void +nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data, + uint16_t offset) +{ + struct nvmf_fabric_connect_rsp rsp; + struct nvmf_capsule *rc; + + nvmf_init_cqe(&rsp, cc, + NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) | + NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM)); + rsp.status_code_specific.invalid.ipo = htole16(offset); + rsp.status_code_specific.invalid.iattr = data ? 1 : 0; + rc = nvmf_allocate_response(cc->nc_qpair, &rsp); + nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); +} + +struct nvmf_qpair * +nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params, + struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data) +{ + static const char hostid_zero[sizeof(data->hostid)]; + const struct nvmf_fabric_connect_cmd *cmd; + struct nvmf_qpair *qp; + struct nvmf_capsule *cc, *rc; + u_int qsize; + int error; + uint16_t cntlid; + uint8_t sc_status; + + qp = NULL; + cc = NULL; + rc = NULL; + *ccp = NULL; + na_clear_error(na); + if (!na->na_controller) { + na_error(na, "Cannot accept on a host"); + goto error; + } + + qp = nvmf_allocate_qpair(na, params); + if (qp == NULL) + goto error; + + /* Read the CONNECT capsule. */ + error = nvmf_receive_capsule(qp, &cc); + if (error != 0) { + na_error(na, "Failed to receive CONNECT: %s", strerror(error)); + goto error; + } + + sc_status = nvmf_validate_command_capsule(cc); + if (sc_status != 0) { + na_error(na, "CONNECT command failed to validate: %u", + sc_status); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, sc_status); + goto error; + } + + cmd = nvmf_capsule_sqe(cc); + if (cmd->opcode != NVME_OPC_FABRICS_COMMANDS || + cmd->fctype != NVMF_FABRIC_COMMAND_CONNECT) { + na_error(na, "Invalid opcode in CONNECT (%u,%u)", cmd->opcode, + cmd->fctype); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, + NVME_SC_INVALID_OPCODE); + goto error; + } + + if (cmd->recfmt != htole16(0)) { + na_error(na, "Unsupported CONNECT record format %u", + le16toh(cmd->recfmt)); + rc = nvmf_simple_response(cc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); + goto error; + } + + qsize = le16toh(cmd->sqsize) + 1; + if (cmd->qid == 0) { + /* Admin queue limits. 
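qsize is the 1's based value (SQSIZE in the capsule is 0's based) and must also fit within the association's max_admin_qsize.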
*/ + if (qsize < NVME_MIN_ADMIN_ENTRIES || + qsize > NVME_MAX_ADMIN_ENTRIES || + qsize > na->na_params.max_admin_qsize) { + na_error(na, "Invalid queue size %u", qsize); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sqsize)); + goto error; + } + qp->nq_admin = true; + } else { + /* I/O queues not allowed for discovery. */ + if (na->na_params.max_io_qsize == 0) { + na_error(na, "I/O queue on discovery controller"); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, qid)); + goto error; + } + + /* I/O queue limits. */ + if (qsize < NVME_MIN_IO_ENTRIES || + qsize > NVME_MAX_IO_ENTRIES || + qsize > na->na_params.max_io_qsize) { + na_error(na, "Invalid queue size %u", qsize); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sqsize)); + goto error; + } + + /* KATO is reserved for I/O queues. */ + if (cmd->kato != 0) { + na_error(na, + "KeepAlive timeout specified for I/O queue"); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, kato)); + goto error; + } + qp->nq_admin = false; + } + qp->nq_qsize = qsize; + + /* Fetch CONNECT data. */ + if (nvmf_capsule_data_len(cc) != sizeof(*data)) { + na_error(na, "Invalid data payload length for CONNECT: %zu", + nvmf_capsule_data_len(cc)); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sgl1)); + goto error; + } + + error = nvmf_receive_controller_data(cc, 0, data, sizeof(*data)); + if (error != 0) { + na_error(na, "Failed to read data for CONNECT: %s", + strerror(error)); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, + NVME_SC_DATA_TRANSFER_ERROR); + goto error; + } + + /* The hostid must be non-zero. */ + if (memcmp(data->hostid, hostid_zero, sizeof(hostid_zero)) == 0) { + na_error(na, "HostID in CONNECT data is zero"); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, hostid)); + goto error; + } + + cntlid = le16toh(data->cntlid); + if (cmd->qid == 0) { + if (na->na_params.dynamic_controller_model) { + if (cntlid != NVMF_CNTLID_DYNAMIC) { + na_error(na, "Invalid controller ID %#x", + cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, + cntlid)); + goto error; + } + } else { + if (cntlid > NVMF_CNTLID_STATIC_MAX && + cntlid != NVMF_CNTLID_STATIC_ANY) { + na_error(na, "Invalid controller ID %#x", + cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, + cntlid)); + goto error; + } + } + } else { + /* Wildcard Controller IDs are only valid on an Admin queue. */ + if (cntlid > NVMF_CNTLID_STATIC_MAX) { + na_error(na, "Invalid controller ID %#x", cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + goto error; + } + } + + /* Simple validation of each NQN. 
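A malformed SubNQN or HostNQN is rejected as an invalid parameter pointing at the offending offset within the CONNECT data.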
*/ + if (!nvmf_nqn_valid(data->subnqn)) { + na_error(na, "Invalid SubNQN %.*s", (int)sizeof(data->subnqn), + data->subnqn); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, subnqn)); + goto error; + } + if (!nvmf_nqn_valid(data->hostnqn)) { + na_error(na, "Invalid HostNQN %.*s", (int)sizeof(data->hostnqn), + data->hostnqn); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, hostnqn)); + goto error; + } + + if (na->na_params.sq_flow_control || + (cmd->cattr & NVMF_CONNECT_ATTR_DISABLE_SQ_FC) == 0) + qp->nq_flow_control = true; + else + qp->nq_flow_control = false; + qp->nq_sqhd = 0; + qp->nq_kato = le32toh(cmd->kato); + *ccp = cc; + return (qp); +error: + if (rc != NULL) { + nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + } + if (cc != NULL) + nvmf_free_capsule(cc); + if (qp != NULL) + nvmf_free_qpair(qp); + return (NULL); +} + +int +nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid) +{ + struct nvmf_fabric_connect_rsp rsp; + struct nvmf_qpair *qp = cc->nc_qpair; + struct nvmf_capsule *rc; + int error; + + nvmf_init_cqe(&rsp, cc, 0); + if (qp->nq_flow_control) + rsp.sqhd = htole16(qp->nq_sqhd); + else + rsp.sqhd = htole16(0xffff); + rsp.status_code_specific.success.cntlid = htole16(cntlid); + rc = nvmf_allocate_response(qp, &rsp); + if (rc == NULL) + return (ENOMEM); + error = nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + if (error == 0) + qp->nq_cntlid = cntlid; + return (error); +} + +uint64_t +nvmf_controller_cap(struct nvmf_qpair *qp) +{ + const struct nvmf_association *na = qp->nq_association; + + return (_nvmf_controller_cap(na->na_params.max_io_qsize, + NVMF_CC_EN_TIMEOUT)); +} + +bool +nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, + uint32_t new_cc) +{ + const struct nvmf_association *na = qp->nq_association; + + return (_nvmf_validate_cc(na->na_params.max_io_qsize, cap, old_cc, + new_cc)); +} + +void +nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata) +{ + const struct nvmf_association *na = qp->nq_association; + struct utsname utsname; + char *cp; + + memset(cdata, 0, sizeof(*cdata)); + + /* + * 5.2 Figure 37 states model name and serial are reserved, + * but Linux includes them. Don't bother with serial, but + * do set model name. + */ + uname(&utsname); + nvmf_strpad(cdata->mn, utsname.sysname, sizeof(cdata->mn)); + nvmf_strpad(cdata->fr, utsname.release, sizeof(cdata->fr)); + cp = memchr(cdata->fr, '-', sizeof(cdata->fr)); + if (cp != NULL) + memset(cp, ' ', sizeof(cdata->fr) - (cp - (char *)cdata->fr)); + + cdata->ctrlr_id = htole16(qp->nq_cntlid); + cdata->ver = htole32(NVME_REV(1, 4)); + cdata->cntrltype = 2; + + cdata->lpa = NVMEF(NVME_CTRLR_DATA_LPA_EXT_DATA, 1); + cdata->elpe = 0; + + cdata->maxcmd = htole16(na->na_params.max_admin_qsize); + + /* Transport-specific? 
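These SGLS bits advertise transport SGL support (data block, address as offset), which arguably belongs in per-transport code.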
*/ + cdata->sgls = htole32( + NVMEF(NVME_CTRLR_DATA_SGLS_TRANSPORT_DATA_BLOCK, 1) | + NVMEF(NVME_CTRLR_DATA_SGLS_ADDRESS_AS_OFFSET, 1) | + NVMEF(NVME_CTRLR_DATA_SGLS_NVM_COMMAND_SET, 1)); + + strlcpy(cdata->subnqn, NVMF_DISCOVERY_NQN, sizeof(cdata->subnqn)); +} + +void +nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, + const char *subnqn, int nn, uint32_t ioccsz, + struct nvme_controller_data *cdata) +{ + const struct nvmf_association *na = qp->nq_association; + struct utsname utsname; + + uname(&utsname); + + _nvmf_init_io_controller_data(qp->nq_cntlid, na->na_params.max_io_qsize, + serial, utsname.sysname, utsname.release, subnqn, nn, ioccsz, + sizeof(struct nvme_completion), cdata); +} + +uint8_t +nvmf_get_log_page_id(const struct nvme_command *cmd) +{ + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + return (le32toh(cmd->cdw10) & 0xff); +} + +uint64_t +nvmf_get_log_page_length(const struct nvme_command *cmd) +{ + uint32_t numd; + + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + numd = le32toh(cmd->cdw10) >> 16 | (le32toh(cmd->cdw11) & 0xffff) << 16; + return ((numd + 1) * 4); +} + +uint64_t +nvmf_get_log_page_offset(const struct nvme_command *cmd) +{ + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + return (le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32); +} + +int +nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, + struct nvmf_handoff_controller_qpair *h) +{ + h->trtype = qp->nq_association->na_trtype; + return (nvmf_kernel_handoff_params(qp, &h->params)); +} diff --git a/lib/libnvmf/nvmf_host.c b/lib/libnvmf/nvmf_host.c new file mode 100644 index 000000000000..b78e2af65897 --- /dev/null +++ b/lib/libnvmf/nvmf_host.c @@ -0,0 +1,911 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" + +static void +nvmf_init_sqe(void *sqe, uint8_t opcode) +{ + struct nvme_command *cmd = sqe; + + memset(cmd, 0, sizeof(*cmd)); + cmd->opc = opcode; +} + +static void +nvmf_init_fabrics_sqe(void *sqe, uint8_t fctype) +{ + struct nvmf_capsule_cmd *cmd = sqe; + + nvmf_init_sqe(sqe, NVME_OPC_FABRICS_COMMANDS); + cmd->fctype = fctype; +} + +struct nvmf_qpair * +nvmf_connect(struct nvmf_association *na, + const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, + const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, + const char *hostnqn, uint32_t kato) +{ + struct nvmf_fabric_connect_cmd cmd; + struct nvmf_fabric_connect_data data; + const struct nvmf_fabric_connect_rsp *rsp; + struct nvmf_qpair *qp; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t sqhd, status; + + qp = NULL; + cc = NULL; + rc = NULL; + na_clear_error(na); + if (na->na_controller) { + na_error(na, "Cannot connect on a controller"); + goto error; + } + + if (params->admin != (qid == 0)) { + na_error(na, "Admin queue must use Queue ID 0"); + goto error; + } + + if (qid == 0) { + if (queue_size < NVME_MIN_ADMIN_ENTRIES || + queue_size > NVME_MAX_ADMIN_ENTRIES) { + na_error(na, "Invalid queue size %u", queue_size); + goto error; + } + } else { + if (queue_size < NVME_MIN_IO_ENTRIES || + queue_size > NVME_MAX_IO_ENTRIES) { + na_error(na, "Invalid queue size %u", queue_size); + goto error; + } + + /* KATO is only for Admin queues. 
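Keep Alive is negotiated on the Admin queue, so a non-zero KATO on an I/O queue is rejected locally before CONNECT is sent.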
*/ + if (kato != 0) { + na_error(na, "Cannot set KATO on I/O queues"); + goto error; + } + } + + qp = nvmf_allocate_qpair(na, params); + if (qp == NULL) + goto error; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_CONNECT); + cmd.recfmt = 0; + cmd.qid = htole16(qid); + + /* N.B. sqsize is 0's based. */ + cmd.sqsize = htole16(queue_size - 1); + if (!na->na_params.sq_flow_control) + cmd.cattr |= NVMF_CONNECT_ATTR_DISABLE_SQ_FC; + cmd.kato = htole32(kato); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) { + na_error(na, "Failed to allocate command capsule: %s", + strerror(errno)); + goto error; + } + + memset(&data, 0, sizeof(data)); + memcpy(data.hostid, hostid, sizeof(data.hostid)); + data.cntlid = htole16(cntlid); + strlcpy(data.subnqn, subnqn, sizeof(data.subnqn)); + strlcpy(data.hostnqn, hostnqn, sizeof(data.hostnqn)); + + error = nvmf_capsule_append_data(cc, &data, sizeof(data), true); + if (error != 0) { + na_error(na, "Failed to append data to CONNECT capsule: %s", + strerror(error)); + goto error; + } + + error = nvmf_transmit_capsule(cc); + if (error != 0) { + na_error(na, "Failed to transmit CONNECT capsule: %s", + strerror(error)); + goto error; + } + + error = nvmf_receive_capsule(qp, &rc); + if (error != 0) { + na_error(na, "Failed to receive CONNECT response: %s", + strerror(error)); + goto error; + } + + rsp = (const struct nvmf_fabric_connect_rsp *)&rc->nc_cqe; + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + if (NVME_STATUS_GET_SC(status) == NVMF_FABRIC_SC_INVALID_PARAM) + na_error(na, + "CONNECT invalid parameter IATTR: %#x IPO: %#x", + rsp->status_code_specific.invalid.iattr, + rsp->status_code_specific.invalid.ipo); + else + na_error(na, "CONNECT failed, status %#x", status); + goto error; + } + + if (rc->nc_cqe.cid != cmd.cid) { + na_error(na, "Mismatched CID in CONNECT response"); + goto error; + } + + if (!rc->nc_sqhd_valid) { + na_error(na, "CONNECT response without valid SQHD"); + goto error; + } + + sqhd = le16toh(rsp->sqhd); + if (sqhd == 0xffff) { + if (na->na_params.sq_flow_control) { + na_error(na, "Controller disabled SQ flow control"); + goto error; + } + qp->nq_flow_control = false; + } else { + qp->nq_flow_control = true; + qp->nq_sqhd = sqhd; + qp->nq_sqtail = sqhd; + } + + if (rsp->status_code_specific.success.authreq) { + na_error(na, "CONNECT response requests authentication"); + goto error; + } + + qp->nq_qsize = queue_size; + qp->nq_cntlid = le16toh(rsp->status_code_specific.success.cntlid); + qp->nq_kato = kato; + /* XXX: Save qid in qp? */ + return (qp); + +error: + if (rc != NULL) + nvmf_free_capsule(rc); + if (cc != NULL) + nvmf_free_capsule(cc); + if (qp != NULL) + nvmf_free_qpair(qp); + return (NULL); +} + +uint16_t +nvmf_cntlid(struct nvmf_qpair *qp) +{ + return (qp->nq_cntlid); +} + +int +nvmf_host_transmit_command(struct nvmf_capsule *nc) +{ + struct nvmf_qpair *qp = nc->nc_qpair; + uint16_t new_sqtail; + int error; + + /* Fail if the queue is full. */ + new_sqtail = (qp->nq_sqtail + 1) % qp->nq_qsize; + if (new_sqtail == qp->nq_sqhd) + return (EBUSY); + + nc->nc_sqe.cid = htole16(qp->nq_cid); + + /* 4.2 Skip CID of 0xFFFF. */ + qp->nq_cid++; + if (qp->nq_cid == 0xFFFF) + qp->nq_cid = 0; + + error = nvmf_transmit_capsule(nc); + if (error != 0) + return (error); + + qp->nq_sqtail = new_sqtail; + return (0); +} + +/* Receive a single capsule and update SQ FC accounting.
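With flow control enabled the SQHD echoed in the CQE is authoritative; without it the local head is advanced once per response to bound the number of outstanding commands.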
*/ +static int +nvmf_host_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + int error; + + /* If the SQ is empty, there is no response to wait for. */ + if (qp->nq_sqhd == qp->nq_sqtail) + return (EWOULDBLOCK); + + error = nvmf_receive_capsule(qp, &nc); + if (error != 0) + return (error); + + if (qp->nq_flow_control) { + if (nc->nc_sqhd_valid) + qp->nq_sqhd = le16toh(nc->nc_cqe.sqhd); + } else { + /* + * If SQ FC is disabled, just advance the head for + * each response capsule received so that we track the + * number of outstanding commands. + */ + qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; + } + *ncp = nc; + return (0); +} + +int +nvmf_host_receive_response(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + + /* Return the oldest previously received response. */ + if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) { + nc = TAILQ_FIRST(&qp->nq_rx_capsules); + TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); + *ncp = nc; + return (0); + } + + return (nvmf_host_receive_capsule(qp, ncp)); +} + +int +nvmf_host_wait_for_response(struct nvmf_capsule *cc, + struct nvmf_capsule **rcp) +{ + struct nvmf_qpair *qp = cc->nc_qpair; + struct nvmf_capsule *rc; + int error; + + /* Check if a response was already received. */ + TAILQ_FOREACH(rc, &qp->nq_rx_capsules, nc_link) { + if (rc->nc_cqe.cid == cc->nc_sqe.cid) { + TAILQ_REMOVE(&qp->nq_rx_capsules, rc, nc_link); + *rcp = rc; + return (0); + } + } + + /* Wait for a response. */ + for (;;) { + error = nvmf_host_receive_capsule(qp, &rc); + if (error != 0) + return (error); + + if (rc->nc_cqe.cid != cc->nc_sqe.cid) { + TAILQ_INSERT_TAIL(&qp->nq_rx_capsules, rc, nc_link); + continue; + } + + *rcp = rc; + return (0); + } +} + +struct nvmf_capsule * +nvmf_keepalive(struct nvmf_qpair *qp) +{ + struct nvme_command cmd; + + if (!qp->nq_admin) { + errno = EINVAL; + return (NULL); + } + + nvmf_init_sqe(&cmd, NVME_OPC_KEEP_ALIVE); + + return (nvmf_allocate_command(qp, &cmd)); +} + +static struct nvmf_capsule * +nvmf_get_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size) +{ + struct nvmf_fabric_prop_get_cmd cmd; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_GET); + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + break; + default: + errno = EINVAL; + return (NULL); + } + cmd.ofst = htole32(offset); + + return (nvmf_allocate_command(qp, &cmd)); +} + +int +nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t *value) +{ + struct nvmf_capsule *cc, *rc; + const struct nvmf_fabric_prop_get_rsp *rsp; + uint16_t status; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + cc = nvmf_get_property(qp, offset, size); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + rsp = (const struct nvmf_fabric_prop_get_rsp *)&rc->nc_cqe; + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: PROPERTY_GET failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + if (size == 8) + *value = le64toh(rsp->value.u64); + else + *value = le32toh(rsp->value.u32.low); + nvmf_free_capsule(rc); + return (0); +} + +static struct nvmf_capsule * +nvmf_set_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t value) +{ + struct 
nvmf_fabric_prop_set_cmd cmd; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_SET); + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + cmd.value.u32.low = htole32(value); + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + cmd.value.u64 = htole64(value); + break; + default: + errno = EINVAL; + return (NULL); + } + cmd.ofst = htole32(offset); + + return (nvmf_allocate_command(qp, &cmd)); +} + +int +nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t value) +{ + struct nvmf_capsule *cc, *rc; + uint16_t status; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + cc = nvmf_set_property(qp, offset, size, value); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: PROPERTY_SET failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_hostid_from_hostuuid(uint8_t hostid[16]) +{ + char hostuuid_str[64]; + uuid_t hostuuid; + size_t len; + uint32_t status; + + len = sizeof(hostuuid_str); + if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) + return (errno); + + uuid_from_string(hostuuid_str, &hostuuid, &status); + switch (status) { + case uuid_s_ok: + break; + case uuid_s_no_memory: + return (ENOMEM); + default: + return (EINVAL); + } + + uuid_enc_le(hostid, &hostuuid); + return (0); +} + +int +nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]) +{ + char hostuuid_str[64]; + size_t len; + + len = sizeof(hostuuid_str); + if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) + return (errno); + + strlcpy(nqn, NVMF_NQN_UUID_PRE, NVMF_NQN_MAX_LEN); + strlcat(nqn, hostuuid_str, NVMF_NQN_MAX_LEN); + return (0); +} + +int +nvmf_host_identify_controller(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin) + return (EINVAL); + + nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); + + /* 5.15.1 Use CNS of 0x01 for controller data. */ + cmd.cdw10 = htole32(1); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, cdata, sizeof(*cdata), false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: IDENTIFY failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, + struct nvme_namespace_data *nsdata) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin) + return (EINVAL); + + nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); + + /* 5.15.1 Use CNS of 0x00 for namespace data. 
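CNS occupies the low byte of CDW10; NSID selects the namespace to identify.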
*/ + cmd.cdw10 = htole32(0); + cmd.nsid = htole32(nsid); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, nsdata, sizeof(*nsdata), false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: IDENTIFY failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +static int +nvmf_get_discovery_log_page(struct nvmf_qpair *qp, uint64_t offset, void *buf, + size_t len) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + size_t numd; + int error; + uint16_t status; + + if (len % 4 != 0 || len == 0 || offset % 4 != 0) + return (EINVAL); + + numd = (len / 4) - 1; + nvmf_init_sqe(&cmd, NVME_OPC_GET_LOG_PAGE); + cmd.cdw10 = htole32(numd << 16 | NVME_LOG_DISCOVERY); + cmd.cdw11 = htole32(numd >> 16); + cmd.cdw12 = htole32(offset); + cmd.cdw13 = htole32(offset >> 32); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, buf, len, false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (NVMEV(NVME_STATUS_SC, status) == + NVMF_FABRIC_SC_LOG_RESTART_DISCOVERY) { + nvmf_free_capsule(rc); + return (EAGAIN); + } + if (status != 0) { + printf("NVMF: GET_LOG_PAGE failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, + struct nvme_discovery_log **logp) +{ + struct nvme_discovery_log hdr, *log; + size_t payload_len; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + log = NULL; + for (;;) { + error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); + if (error != 0) + return (error); + nvme_discovery_log_swapbytes(&hdr); + + if (hdr.recfmt != 0) { + printf("NVMF: Unsupported discovery log format: %d\n", + hdr.recfmt); + return (EINVAL); + } + + if (hdr.numrec > 1024) { + printf("NVMF: Too many discovery log entries: %ju\n", + (uintmax_t)hdr.numrec); + return (EFBIG); + } + + payload_len = sizeof(log->entries[0]) * hdr.numrec; + log = reallocf(log, sizeof(*log) + payload_len); + if (log == NULL) + return (ENOMEM); + *log = hdr; + if (hdr.numrec == 0) + break; + + error = nvmf_get_discovery_log_page(qp, sizeof(hdr), + log->entries, payload_len); + if (error == EAGAIN) + continue; + if (error != 0) { + free(log); + return (error); + } + + /* Re-read the header and check the generation count. 
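If GENCTR changed while the entries were being read, the log was updated mid-fetch and the loop retries from the top.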
*/ + error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); + if (error != 0) { + free(log); + return (error); + } + nvme_discovery_log_swapbytes(&hdr); + + if (log->genctr != hdr.genctr) + continue; + + for (u_int i = 0; i < log->numrec; i++) + nvme_discovery_log_entry_swapbytes(&log->entries[i]); + break; + } + *logp = log; + return (0); +} + +int +nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, u_int *actual) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin || requested < 1 || requested > 65535) + return (EINVAL); + + /* The number of queues is 0's based. */ + requested--; + + nvmf_init_sqe(&cmd, NVME_OPC_SET_FEATURES); + cmd.cdw10 = htole32(NVME_FEAT_NUMBER_OF_QUEUES); + + /* Same number of completion and submission queues. */ + cmd.cdw11 = htole32((requested << 16) | requested); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: SET_FEATURES failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + *actual = (le32toh(rc->nc_cqe.cdw0) & 0xffff) + 1; + nvmf_free_capsule(rc); + return (0); +} + +static bool +is_queue_pair_idle(struct nvmf_qpair *qp) +{ + if (qp->nq_sqhd != qp->nq_sqtail) + return (false); + if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) + return (false); + return (true); +} + +static int +prepare_queues_for_handoff(struct nvmf_handoff_host *hh, + struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_qpair_params *io; + u_int i; + int error; + + memset(hh, 0, sizeof(*hh)); + + /* All queue pairs must be idle. */ + if (!is_queue_pair_idle(admin_qp)) + return (EBUSY); + for (i = 0; i < num_queues; i++) { + if (!is_queue_pair_idle(io_queues[i])) + return (EBUSY); + } + + /* First, the admin queue. */ + hh->trtype = admin_qp->nq_association->na_trtype; + hh->kato = admin_qp->nq_kato; + error = nvmf_kernel_handoff_params(admin_qp, &hh->admin); + if (error) + return (error); + + /* Next, the I/O queues. 
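Per-queue transport parameters are marshalled into a flat array consumed by the handoff ioctl.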
*/ + hh->num_io_queues = num_queues; + io = calloc(num_queues, sizeof(*io)); + for (i = 0; i < num_queues; i++) { + error = nvmf_kernel_handoff_params(io_queues[i], &io[i]); + if (error) { + free(io); + return (error); + } + } + + hh->io = io; + hh->cdata = cdata; + return (0); +} + +int +nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_host hh; + u_int i; + int error, fd; + + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + cdata); + if (error != 0) + goto out; + + if (ioctl(fd, NVMF_HANDOFF_HOST, &hh) == -1) + error = errno; + free(hh.io); + +out: + if (fd >= 0) + close(fd); + for (i = 0; i < num_queues; i++) + (void)nvmf_free_qpair(io_queues[i]); + (void)nvmf_free_qpair(admin_qp); + return (error); +} + +int +nvmf_disconnect_host(const char *host) +{ + int error, fd; + + error = 0; + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + if (ioctl(fd, NVMF_DISCONNECT_HOST, &host) == -1) + error = errno; + +out: + if (fd >= 0) + close(fd); + return (error); +} + +int +nvmf_disconnect_all(void) +{ + int error, fd; + + error = 0; + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + if (ioctl(fd, NVMF_DISCONNECT_ALL) == -1) + error = errno; + +out: + if (fd >= 0) + close(fd); + return (error); +} + +int +nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams) +{ + if (ioctl(fd, NVMF_RECONNECT_PARAMS, rparams) == -1) + return (errno); + return (0); +} + +int +nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_host hh; + u_int i; + int error; + + error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + cdata); + if (error != 0) + goto out; + + if (ioctl(fd, NVMF_RECONNECT_HOST, &hh) == -1) + error = errno; + free(hh.io); + +out: + for (i = 0; i < num_queues; i++) + (void)nvmf_free_qpair(io_queues[i]); + (void)nvmf_free_qpair(admin_qp); + return (error); +} diff --git a/lib/libnvmf/nvmf_tcp.c b/lib/libnvmf/nvmf_tcp.c new file mode 100644 index 000000000000..12da329f34b4 --- /dev/null +++ b/lib/libnvmf/nvmf_tcp.c @@ -0,0 +1,1474 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin <jhb@FreeBSD.org> + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" +#include "nvmf_tcp.h" + +struct nvmf_tcp_qpair; + +struct nvmf_tcp_command_buffer { + struct nvmf_tcp_qpair *qp; + + void *data; + size_t data_len; + size_t data_xfered; + uint32_t data_offset; + + uint16_t cid; + uint16_t ttag; + + LIST_ENTRY(nvmf_tcp_command_buffer) link; +}; + +LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer); + +struct nvmf_tcp_association { + struct nvmf_association na; + + uint32_t ioccsz; +}; + +struct nvmf_tcp_rxpdu { + struct nvme_tcp_common_pdu_hdr *hdr; + uint32_t data_len; +}; + +struct nvmf_tcp_capsule { + struct nvmf_capsule nc; + + struct nvmf_tcp_rxpdu rx_pdu; + struct nvmf_tcp_command_buffer *cb; + + TAILQ_ENTRY(nvmf_tcp_capsule) link; +}; + +struct nvmf_tcp_qpair { + struct nvmf_qpair qp; + int s; + + uint8_t txpda; + uint8_t rxpda; + bool header_digests; + bool data_digests; + uint32_t maxr2t; + uint32_t maxh2cdata; + uint32_t max_icd; /* Host only */ + uint16_t next_ttag; /* Controller only */ + + struct nvmf_tcp_command_buffer_list tx_buffers; + struct nvmf_tcp_command_buffer_list rx_buffers; + TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules; +}; + +#define TASSOC(na) ((struct nvmf_tcp_association *)(na)) +#define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc)) +#define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc)) +#define TQP(qp) ((struct nvmf_tcp_qpair *)(qp)) + +static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET]; + +static uint32_t +compute_digest(const void *buf, size_t len) +{ + return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); +} + +static struct nvmf_tcp_command_buffer * +tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data, + uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer *cb; + + cb = malloc(sizeof(*cb)); + cb->qp = qp; + cb->data = data; + cb->data_offset = data_offset; + cb->data_len = data_len; + cb->data_xfered = 0; + cb->cid = cid; + cb->ttag = ttag; + + if (receive) + LIST_INSERT_HEAD(&qp->rx_buffers, cb, link); + else + LIST_INSERT_HEAD(&qp->tx_buffers, cb, link); + return (cb); +} + +static struct nvmf_tcp_command_buffer * +tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer_list *list; + struct nvmf_tcp_command_buffer *cb; + + list = receive ?
&qp->rx_buffers : &qp->tx_buffers; + LIST_FOREACH(cb, list, link) { + if (cb->cid == cid && cb->ttag == ttag) + return (cb); + } + return (NULL); +} + +static void +tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer *cb; + + cb = tcp_find_command_buffer(qp, cid, ttag, receive); + if (cb != NULL) + LIST_REMOVE(cb, link); +} + +static void +tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb) +{ + LIST_REMOVE(cb, link); + free(cb); +} + +static int +nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len) +{ + ssize_t nwritten; + const char *cp; + + cp = pdu; + while (len != 0) { + nwritten = write(qp->s, cp, len); + if (nwritten < 0) + return (errno); + len -= nwritten; + cp += nwritten; + } + return (0); +} + +static int +nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov, + u_int iovcnt, size_t len) +{ + ssize_t nwritten; + + for (;;) { + nwritten = writev(qp->s, iov, iovcnt); + if (nwritten < 0) + return (errno); + + len -= nwritten; + if (len == 0) + return (0); + + while (iov->iov_len <= (size_t)nwritten) { + nwritten -= iov->iov_len; + iovcnt--; + iov++; + } + + iov->iov_base = (char *)iov->iov_base + nwritten; + iov->iov_len -= nwritten; + } +} + +static void +nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen) +{ + struct nvme_tcp_term_req_hdr hdr; + struct iovec iov[2]; + + if (hlen != 0) { + if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) + hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; + if (hlen > pdu_len) + hlen = pdu_len; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.common.pdu_type = na->na_controller ? + NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + hdr.common.hlen = sizeof(hdr); + hdr.common.plen = sizeof(hdr) + hlen; + hdr.fes = htole16(fes); + le32enc(hdr.fei, fei); + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = __DECONST(void *, rx_pdu); + iov[1].iov_len = hlen; + + (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen); + close(qp->s); + qp->s = -1; +} + +static int +nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu, + size_t pdu_len) +{ + const struct nvme_tcp_common_pdu_hdr *ch; + uint32_t data_len, fei, plen; + uint32_t digest, rx_digest; + u_int hlen; + int error; + uint16_t fes; + + /* Determine how large of a PDU header to return for errors. */ + ch = pdu->hdr; + hlen = ch->hlen; + plen = le32toh(ch->plen); + if (hlen < sizeof(*ch) || hlen > plen) + hlen = sizeof(*ch); + + error = nvmf_tcp_validate_pdu_header(ch, + qp->qp.nq_association->na_controller, qp->header_digests, + qp->data_digests, qp->rxpda, &data_len, &fes, &fei); + if (error != 0) { + if (error == ECONNRESET) { + close(qp->s); + qp->s = -1; + } else { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + fes, fei, ch, pdu_len, hlen); + } + return (error); + } + + /* Check header digest if present. */ + if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) { + digest = compute_digest(ch, ch->hlen); + memcpy(&rx_digest, (const char *)ch + ch->hlen, + sizeof(rx_digest)); + if (digest != rx_digest) { + printf("NVMe/TCP: Header digest mismatch\n"); + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch, + pdu_len, hlen); + return (EBADMSG); + } + } + + /* Check data digest if present. 
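The DDGST trails the payload at the end of the PDU; a mismatch fails the PDU with EBADMSG but, unlike a header digest error, does not send a termination request.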
*/
+	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
+		digest = compute_digest((const char *)ch + ch->pdo, data_len);
+		memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
+		    sizeof(rx_digest));
+		if (digest != rx_digest) {
+			printf("NVMe/TCP: Data digest mismatch\n");
+			return (EBADMSG);
+		}
+	}
+
+	pdu->data_len = data_len;
+	return (0);
+}
+
+/*
+ * Read data from a socket, retrying until the data has been fully
+ * read or an error occurs.
+ */
+static int
+nvmf_tcp_read_buffer(int s, void *buf, size_t len)
+{
+	ssize_t nread;
+	char *cp;
+
+	cp = buf;
+	while (len != 0) {
+		nread = read(s, cp, len);
+		if (nread < 0)
+			return (errno);
+		if (nread == 0)
+			return (ECONNRESET);
+		len -= nread;
+		cp += nread;
+	}
+	return (0);
+}
+
+static int
+nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvme_tcp_common_pdu_hdr ch;
+	uint32_t plen;
+	int error;
+
+	memset(pdu, 0, sizeof(*pdu));
+	error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
+	if (error != 0)
+		return (error);
+
+	plen = le32toh(ch.plen);
+
+	/*
+	 * Validate a header with garbage lengths to trigger
+	 * an error message without reading more.
+	 */
+	if (plen < sizeof(ch) || ch.hlen > plen) {
+		pdu->hdr = &ch;
+		error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
+		pdu->hdr = NULL;
+		assert(error != 0);
+		return (error);
+	}
+
+	/* Read the rest of the PDU. */
+	pdu->hdr = malloc(plen);
+	memcpy(pdu->hdr, &ch, sizeof(ch));
+	error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
+	if (error != 0) {
+		/* Avoid leaking the partially filled PDU buffer. */
+		free(pdu->hdr);
+		pdu->hdr = NULL;
+		return (error);
+	}
+	error = nvmf_tcp_validate_pdu(qp, pdu, plen);
+	if (error != 0) {
+		free(pdu->hdr);
+		pdu->hdr = NULL;
+	}
+	return (error);
+}
+
+static void
+nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
+{
+	free(pdu->hdr);
+	pdu->hdr = NULL;
+}
+
+static int
+nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvme_tcp_term_req_hdr *hdr;
+
+	hdr = (void *)pdu->hdr;
+
+	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
+	    le16toh(hdr->fes), le32dec(hdr->fei));
+	nvmf_tcp_free_pdu(pdu);
+	return (ECONNRESET);
+}
+
+static int
+nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvme_tcp_cmd *cmd;
+	struct nvmf_capsule *nc;
+	struct nvmf_tcp_capsule *tc;
+
+	cmd = (void *)pdu->hdr;
+
+	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
+	if (nc == NULL)
+		return (ENOMEM);
+
+	tc = TCAP(nc);
+	tc->rx_pdu = *pdu;
+
+	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+	return (0);
+}
+
+static int
+nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvme_tcp_rsp *rsp;
+	struct nvmf_capsule *nc;
+	struct nvmf_tcp_capsule *tc;
+
+	rsp = (void *)pdu->hdr;
+
+	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
+	if (nc == NULL)
+		return (ENOMEM);
+
+	nc->nc_sqhd_valid = true;
+	tc = TCAP(nc);
+	tc->rx_pdu = *pdu;
+
+	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+
+	/*
+	 * Once the CQE has been received, no further transfers to the
+	 * command buffer for the associated CID can occur.
+	 */
+	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
+	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);
+
+	return (0);
+}
+
+/*
+ * Construct and send a PDU that contains an optional data payload.
+ * This includes dealing with digests and the length fields in the
+ * common header.
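+ *
+ * The transmitted layout is:
+ *
+ *	CH + PSH (hlen bytes) | [HDGST] | [PAD to PDO] | [DATA] | [DDGST]
+ *
+ * Worked example (illustrative values only): hlen = 24 with header
+ * digests enabled and a 4-byte transmit PDA gives
+ * pdo = roundup2(24 + 4, 4) = 28; adding a 512-byte payload with data
+ * digests enabled gives plen = 28 + 512 + 4 = 544.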
+ */ +static int +nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, + void *data, uint32_t data_len) +{ + struct nvme_tcp_common_pdu_hdr *ch; + struct iovec iov[5]; + u_int iovcnt; + uint32_t header_digest, data_digest, pad, pdo, plen; + + plen = hlen; + if (qp->header_digests) + plen += sizeof(header_digest); + if (data_len != 0) { + pdo = roundup2(plen, qp->txpda); + pad = pdo - plen; + plen = pdo + data_len; + if (qp->data_digests) + plen += sizeof(data_digest); + } else { + assert(data == NULL); + pdo = 0; + pad = 0; + } + + ch = hdr; + ch->hlen = hlen; + if (qp->header_digests) + ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; + if (qp->data_digests && data_len != 0) + ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; + ch->pdo = pdo; + ch->plen = htole32(plen); + + /* CH + PSH */ + iov[0].iov_base = hdr; + iov[0].iov_len = hlen; + iovcnt = 1; + + /* HDGST */ + if (qp->header_digests) { + header_digest = compute_digest(hdr, hlen); + iov[iovcnt].iov_base = &header_digest; + iov[iovcnt].iov_len = sizeof(header_digest); + iovcnt++; + } + + if (pad != 0) { + /* PAD */ + iov[iovcnt].iov_base = __DECONST(char *, zero_padding); + iov[iovcnt].iov_len = pad; + iovcnt++; + } + + if (data_len != 0) { + /* DATA */ + iov[iovcnt].iov_base = data; + iov[iovcnt].iov_len = data_len; + iovcnt++; + + /* DDGST */ + if (qp->data_digests) { + data_digest = compute_digest(data, data_len); + iov[iovcnt].iov_base = &data_digest; + iov[iovcnt].iov_len = sizeof(data_digest); + iovcnt++; + } + } + + return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen)); +} + +static int +nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_h2c_data_hdr *h2c; + struct nvmf_tcp_command_buffer *cb; + uint32_t data_len, data_offset; + const char *icd; + + h2c = (void *)pdu->hdr; + if (le32toh(h2c->datal) > qp->maxh2cdata) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true); + if (cb == NULL) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_len = le32toh(h2c->datal); + if (data_len != pdu->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(h2c->datao); + if (data_offset < cb->data_offset || + data_offset + data_len > cb->data_offset + cb->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if (data_offset != cb->data_offset + cb->data_xfered) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if ((cb->data_xfered + data_len == cb->data_len) != + ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + 
NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	cb->data_xfered += data_len;
+	data_offset -= cb->data_offset;
+	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
+	memcpy((char *)cb->data + data_offset, icd, data_len);
+
+	nvmf_tcp_free_pdu(pdu);
+	return (0);
+}
+
+static int
+nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvme_tcp_c2h_data_hdr *c2h;
+	struct nvmf_tcp_command_buffer *cb;
+	uint32_t data_len, data_offset;
+	const char *icd;
+
+	c2h = (void *)pdu->hdr;
+
+	cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
+	if (cb == NULL) {
+		/*
+		 * XXX: Could be PDU sequence error if cccid is for a
+		 * command that doesn't use a command buffer.
+		 */
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	data_len = le32toh(c2h->datal);
+	if (data_len != pdu->data_len) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	data_offset = le32toh(c2h->datao);
+	if (data_offset < cb->data_offset ||
+	    data_offset + data_len > cb->data_offset + cb->data_len) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if (data_offset != cb->data_offset + cb->data_xfered) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if ((cb->data_xfered + data_len == cb->data_len) !=
+	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	cb->data_xfered += data_len;
+	data_offset -= cb->data_offset;
+	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
+	memcpy((char *)cb->data + data_offset, icd, data_len);
+
+	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
+		struct nvme_completion cqe;
+		struct nvmf_tcp_capsule *tc;
+		struct nvmf_capsule *nc;
+
+		memset(&cqe, 0, sizeof(cqe));
+		cqe.cid = cb->cid;
+
+		nc = nvmf_allocate_response(&qp->qp, &cqe);
+		if (nc == NULL) {
+			nvmf_tcp_free_pdu(pdu);
+			return (ENOMEM);
+		}
+		nc->nc_sqhd_valid = false;
+
+		tc = TCAP(nc);
+		TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+	}
+
+	nvmf_tcp_free_pdu(pdu);
+	return (0);
+}
+
+/* NB: cid and ttag are little-endian already.
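+ *
+ * Illustrative call (hypothetical values): send the final 4 KiB of a
+ * host-to-controller transfer at offset 8192 in a single PDU, assuming
+ * MAXH2CDATA (qp->maxh2cdata) is at least 4096:
+ *
+ *	tcp_send_h2c_pdu(qp, cid, ttag, 8192, buf, 4096, true);
+ *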
*/
+static int
+tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
+{
+	struct nvme_tcp_h2c_data_hdr h2c;
+
+	memset(&h2c, 0, sizeof(h2c));
+	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
+	if (last_pdu)
+		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
+	h2c.cccid = cid;
+	h2c.ttag = ttag;
+	h2c.datao = htole32(data_offset);
+	h2c.datal = htole32(len);
+
+	return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
+}
+
+/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
+static int
+tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
+{
+	char *p;
+
+	p = buf;
+	while (len != 0) {
+		size_t todo;
+		int error;
+
+		todo = len;
+		if (todo > qp->maxh2cdata)
+			todo = qp->maxh2cdata;
+		error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
+		    last_pdu && todo == len);
+		if (error != 0)
+			return (error);
+		p += todo;
+		len -= todo;
+	}
+	return (0);
+}
+
+static int
+nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	struct nvmf_tcp_command_buffer *cb;
+	struct nvme_tcp_r2t_hdr *r2t;
+	uint32_t data_len, data_offset;
+	int error;
+
+	r2t = (void *)pdu->hdr;
+
+	cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
+	if (cb == NULL) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	data_offset = le32toh(r2t->r2to);
+	if (data_offset != cb->data_xfered) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	/*
+	 * XXX: The spec does not specify how to handle R2T transfers
+	 * out of range of the original command.
+	 */
+	data_len = le32toh(r2t->r2tl);
+	if (data_offset + data_len > cb->data_len) {
+		nvmf_tcp_report_error(qp->qp.nq_association, qp,
+		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	cb->data_xfered += data_len;
+
+	/*
+	 * Write out one or more H2C_DATA PDUs containing the
+	 * requested data.
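+	 * tcp_send_h2c_pdus() splits the transfer at the MAXH2CDATA
+	 * limit negotiated at connection setup; e.g. (hypothetical
+	 * values) a 96 KiB R2T with a 64 KiB MAXH2CDATA goes out as a
+	 * 64 KiB PDU followed by a 32 KiB PDU with LAST_PDU set.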
+	 */
+	error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
+	    data_offset, (char *)cb->data + data_offset, data_len, true);
+
+	nvmf_tcp_free_pdu(pdu);
+	return (error);
+}
+
+static int
+nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
+{
+	struct nvmf_tcp_rxpdu pdu;
+	int error;
+
+	error = nvmf_tcp_read_pdu(qp, &pdu);
+	if (error != 0)
+		return (error);
+
+	switch (pdu.hdr->pdu_type) {
+	default:
+		__unreachable();
+		break;
+	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+		return (nvmf_tcp_handle_term_req(&pdu));
+	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+		return (nvmf_tcp_save_command_capsule(qp, &pdu));
+	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+		return (nvmf_tcp_save_response_capsule(qp, &pdu));
+	case NVME_TCP_PDU_TYPE_H2C_DATA:
+		return (nvmf_tcp_handle_h2c_data(qp, &pdu));
+	case NVME_TCP_PDU_TYPE_C2H_DATA:
+		return (nvmf_tcp_handle_c2h_data(qp, &pdu));
+	case NVME_TCP_PDU_TYPE_R2T:
+		return (nvmf_tcp_handle_r2t(qp, &pdu));
+	}
+}
+
+static bool
+nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
+    const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
+{
+	const struct nvme_tcp_ic_req *pdu;
+	uint32_t plen;
+	u_int hlen;
+
+	/* Determine how large of a PDU header to return for errors. */
+	hlen = ch->hlen;
+	plen = le32toh(ch->plen);
+	if (hlen < sizeof(*ch) || hlen > plen)
+		hlen = sizeof(*ch);
+
+	/*
+	 * Errors must be reported for the lowest incorrect field
+	 * first, so validate fields in order.
+	 */
+
+	/* Validate pdu_type. */
+
+	/* Controllers only receive PDUs with a PDU direction of 0. */
+	if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
+		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	switch (ch->pdu_type) {
+	case NVME_TCP_PDU_TYPE_IC_REQ:
+	case NVME_TCP_PDU_TYPE_IC_RESP:
+		break;
+	default:
+		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	/* Validate flags. */
+	if (ch->flags != 0) {
+		na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
+		    ch->flags);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	/* Validate hlen. */
+	if (ch->hlen != 128) {
+		na_error(na, "NVMe/TCP: Invalid PDU header length %u",
+		    ch->hlen);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	/* Validate pdo. */
+	if (ch->pdo != 0) {
+		na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	/* Validate plen. */
+	if (plen != 128) {
+		na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
+		    hlen);
+		return (false);
+	}
+
+	/* Validate fields common to both ICReq and ICResp.
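+	 * PFV, the PDA field (HPDA/CPDA), and DGST sit at the same
+	 * offsets in ICReq and ICResp, so the ICReq layout below checks
+	 * either PDU type:
+	 *
+	 *	pdu = (const struct nvme_tcp_ic_req *)ch;
+	 *	le16toh(pdu->pfv) == 0			(format version)
+	 *	pdu->hpda <= NVME_TCP_HPDA_MAX		(data alignment)
+	 *	pdu->dgst.bits.reserved == 0		(digest flags)
+	 *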
*/ + pdu = (const struct nvme_tcp_ic_req *)ch; + if (le16toh(pdu->pfv) != 0) { + na_error(na, "NVMe/TCP: Unsupported PDU version %u", + le16toh(pdu->pfv)); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER, + 8, ch, pdu_len, hlen); + return (false); + } + + if (pdu->hpda > NVME_TCP_HPDA_MAX) { + na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len, + hlen); + return (false); + } + + if (pdu->dgst.bits.reserved != 0) { + na_error(na, "NVMe/TCP: Invalid digest settings"); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len, + hlen); + return (false); + } + + return (true); +} + +static bool +nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + struct nvme_tcp_ic_req *pdu) +{ + int error; + + error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); + if (error != 0) { + na_error(na, "NVMe/TCP: Failed to read IC request: %s", + strerror(error)); + return (false); + } + + return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); +} + +static bool +nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + struct nvme_tcp_ic_resp *pdu) +{ + int error; + + error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); + if (error != 0) { + na_error(na, "NVMe/TCP: Failed to read IC response: %s", + strerror(error)); + return (false); + } + + return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); +} + +static struct nvmf_association * +tcp_allocate_association(bool controller __unused, + const struct nvmf_association_params *params __unused) +{ + struct nvmf_tcp_association *ta; + + ta = calloc(1, sizeof(*ta)); + + return (&ta->na); +} + +static void +tcp_update_association(struct nvmf_association *na, + const struct nvme_controller_data *cdata) +{ + struct nvmf_tcp_association *ta = TASSOC(na); + + ta->ioccsz = le32toh(cdata->ioccsz); +} + +static void +tcp_free_association(struct nvmf_association *na) +{ + free(na); +} + +static bool +tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin) +{ + const struct nvmf_association_params *params = &na->na_params; + struct nvmf_tcp_association *ta = TASSOC(na); + struct nvme_tcp_ic_req ic_req; + struct nvme_tcp_ic_resp ic_resp; + int error; + + if (!admin) { + if (ta->ioccsz == 0) { + na_error(na, "TCP I/O queues require cdata"); + return (false); + } + if (ta->ioccsz < 4) { + na_error(na, "Invalid IOCCSZ %u", ta->ioccsz); + return (false); + } + } + + memset(&ic_req, 0, sizeof(ic_req)); + ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ; + ic_req.common.hlen = sizeof(ic_req); + ic_req.common.plen = htole32(sizeof(ic_req)); + ic_req.pfv = htole16(0); + ic_req.hpda = params->tcp.pda; + if (params->tcp.header_digests) + ic_req.dgst.bits.hdgst_enable = 1; + if (params->tcp.data_digests) + ic_req.dgst.bits.ddgst_enable = 1; + ic_req.maxr2t = htole32(params->tcp.maxr2t); + + error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req)); + if (error != 0) { + na_error(na, "Failed to write IC request: %s", strerror(error)); + return (false); + } + + if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp)) + return (false); + + /* Ensure the controller didn't enable digests we didn't request. 
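+	 * Digest negotiation is one-way: the host offers HDGST/DDGST in
+	 * the ICReq and the controller may enable only what was offered.
+	 * For example (hypothetical exchange), if the host sent
+	 * hdgst_enable = 0, an ICResp carrying hdgst_enable = 1 is
+	 * reported below as an unsupported parameter.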
*/
+	if ((!params->tcp.header_digests &&
+	    ic_resp.dgst.bits.hdgst_enable != 0) ||
+	    (!params->tcp.data_digests &&
+	    ic_resp.dgst.bits.ddgst_enable != 0)) {
+		na_error(na, "Controller enabled unrequested digests");
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
+		    11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
+		return (false);
+	}
+
+	/*
+	 * XXX: Is there an upper-bound to enforce here? Perhaps pick
+	 * some large value and report larger values as an unsupported
+	 * parameter?
+	 */
+	if (le32toh(ic_resp.maxh2cdata) < 4096) {
+		na_error(na, "Invalid MAXH2CDATA %u",
+		    le32toh(ic_resp.maxh2cdata));
+		nvmf_tcp_report_error(na, qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
+		    sizeof(ic_resp), sizeof(ic_resp));
+		return (false);
+	}
+
+	qp->txpda = (params->tcp.pda + 1) * 4;
+	qp->rxpda = (ic_resp.cpda + 1) * 4;
+	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
+	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
+	qp->maxr2t = params->tcp.maxr2t;
+	qp->maxh2cdata = le32toh(ic_resp.maxh2cdata);
+	if (admin)
+		/* 7.4.3 */
+		qp->max_icd = 8192;
+	else
+		qp->max_icd = (ta->ioccsz - 4) * 16;
+
+	return (true);
+}
+
+static bool
+tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
+{
+	const struct nvmf_association_params *params = &na->na_params;
+	struct nvme_tcp_ic_req ic_req;
+	struct nvme_tcp_ic_resp ic_resp;
+	int error;
+
+	if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
+		return (false);
+
+	memset(&ic_resp, 0, sizeof(ic_resp));
+	ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
+	ic_resp.common.hlen = sizeof(ic_resp);
+	ic_resp.common.plen = htole32(sizeof(ic_resp));
+	ic_resp.pfv = htole16(0);
+	ic_resp.cpda = params->tcp.pda;
+	if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
+		ic_resp.dgst.bits.hdgst_enable = 1;
+	if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
+		ic_resp.dgst.bits.ddgst_enable = 1;
+	ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);
+
+	error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
+	if (error != 0) {
+		na_error(na, "Failed to write IC response: %s",
+		    strerror(error));
+		return (false);
+	}
+
+	qp->txpda = (params->tcp.pda + 1) * 4;
+	qp->rxpda = (ic_req.hpda + 1) * 4;
+	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
+	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
+	qp->maxr2t = le32toh(ic_req.maxr2t);
+	qp->maxh2cdata = params->tcp.maxh2cdata;
+	qp->max_icd = 0;	/* XXX */
+	return (true);
+}
+
+static struct nvmf_qpair *
+tcp_allocate_qpair(struct nvmf_association *na,
+    const struct nvmf_qpair_params *qparams)
+{
+	const struct nvmf_association_params *aparams = &na->na_params;
+	struct nvmf_tcp_qpair *qp;
+	bool ok;
+
+	if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
+		na_error(na, "Invalid PDA");
+		return (NULL);
+	}
+
+	qp = calloc(1, sizeof(*qp));
+	qp->s = qparams->tcp.fd;
+	LIST_INIT(&qp->rx_buffers);
+	LIST_INIT(&qp->tx_buffers);
+	TAILQ_INIT(&qp->rx_capsules);
+	if (na->na_controller)
+		ok = tcp_accept(qp, na);
+	else
+		ok = tcp_connect(qp, na, qparams->admin);
+	if (!ok) {
+		free(qp);
+		return (NULL);
+	}
+
+	return (&qp->qp);
+}
+
+static void
+tcp_free_qpair(struct nvmf_qpair *nq)
+{
+	struct nvmf_tcp_qpair *qp = TQP(nq);
+	struct nvmf_tcp_capsule *ntc, *tc;
+	struct nvmf_tcp_command_buffer *ncb, *cb;
+
+	TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
+		TAILQ_REMOVE(&qp->rx_capsules, tc, link);
+		nvmf_free_capsule(&tc->nc);
+	}
+	LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb)
{ + tcp_free_command_buffer(cb); + } + LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) { + tcp_free_command_buffer(cb); + } + free(qp); +} + +static int +tcp_kernel_handoff_params(struct nvmf_qpair *nq, + struct nvmf_handoff_qpair_params *qparams) +{ + struct nvmf_tcp_qpair *qp = TQP(nq); + + qparams->tcp.fd = qp->s; + qparams->tcp.rxpda = qp->rxpda; + qparams->tcp.txpda = qp->txpda; + qparams->tcp.header_digests = qp->header_digests; + qparams->tcp.data_digests = qp->data_digests; + qparams->tcp.maxr2t = qp->maxr2t; + qparams->tcp.maxh2cdata = qp->maxh2cdata; + qparams->tcp.max_icd = qp->max_icd; + + return (0); +} + +static struct nvmf_capsule * +tcp_allocate_capsule(struct nvmf_qpair *qp __unused) +{ + struct nvmf_tcp_capsule *nc; + + nc = calloc(1, sizeof(*nc)); + return (&nc->nc); +} + +static void +tcp_free_capsule(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_capsule *tc = TCAP(nc); + + nvmf_tcp_free_pdu(&tc->rx_pdu); + if (tc->cb != NULL) + tcp_free_command_buffer(tc->cb); + free(tc); +} + +static int +tcp_transmit_command(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvmf_tcp_capsule *tc = TCAP(nc); + struct nvme_tcp_cmd cmd; + struct nvme_sgl_descriptor *sgl; + int error; + bool use_icd; + + use_icd = false; + if (nc->nc_data_len != 0 && nc->nc_send_data && + nc->nc_data_len <= qp->max_icd) + use_icd = true; + + memset(&cmd, 0, sizeof(cmd)); + cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; + cmd.ccsqe = nc->nc_sqe; + + /* Populate SGL in SQE. */ + sgl = &cmd.ccsqe.sgl; + memset(sgl, 0, sizeof(*sgl)); + sgl->address = 0; + sgl->length = htole32(nc->nc_data_len); + if (use_icd) { + /* Use in-capsule data. */ + sgl->type = NVME_SGL_TYPE_ICD; + } else { + /* Use a command buffer. */ + sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; + } + + /* Send command capsule. */ + error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ? + nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0); + if (error != 0) + return (error); + + /* + * If data will be transferred using a command buffer, allocate a + * buffer structure and queue it. 
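+	 * The buffer direction is the inverse of nc_send_data: data the
+	 * host sends is queued on tx_buffers to satisfy later R2T PDUs,
+	 * while data the host expects back is queued on rx_buffers to
+	 * accept C2H_DATA PDUs.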
+ */ + if (nc->nc_data_len != 0 && !use_icd) + tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0, + nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data); + + return (0); +} + +static int +tcp_transmit_response(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvme_tcp_rsp rsp; + + memset(&rsp, 0, sizeof(rsp)); + rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; + rsp.rccqe = nc->nc_cqe; + + return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); +} + +static int +tcp_transmit_capsule(struct nvmf_capsule *nc) +{ + if (nc->nc_qe_len == sizeof(struct nvme_command)) + return (tcp_transmit_command(nc)); + else + return (tcp_transmit_response(nc)); +} + +static int +tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp) +{ + struct nvmf_tcp_qpair *qp = TQP(nq); + struct nvmf_tcp_capsule *tc; + int error; + + while (TAILQ_EMPTY(&qp->rx_capsules)) { + error = nvmf_tcp_receive_pdu(qp); + if (error != 0) + return (error); + } + tc = TAILQ_FIRST(&qp->rx_capsules); + TAILQ_REMOVE(&qp->rx_capsules, tc, link); + *ncp = &tc->nc; + return (0); +} + +static uint8_t +tcp_validate_command_capsule(const struct nvmf_capsule *nc) +{ + const struct nvmf_tcp_capsule *tc = CTCAP(nc); + const struct nvme_sgl_descriptor *sgl; + + assert(tc->rx_pdu.hdr != NULL); + + sgl = &nc->nc_sqe.sgl; + switch (sgl->type) { + case NVME_SGL_TYPE_ICD: + if (tc->rx_pdu.data_len != le32toh(sgl->length)) { + printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); + return (NVME_SC_DATA_SGL_LENGTH_INVALID); + } + break; + case NVME_SGL_TYPE_COMMAND_BUFFER: + if (tc->rx_pdu.data_len != 0) { + printf("NVMe/TCP: Command Buffer SGL with ICD\n"); + return (NVME_SC_INVALID_FIELD); + } + break; + default: + printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); + return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); + } + + if (sgl->address != 0) { + printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); + return (NVME_SC_SGL_OFFSET_INVALID); + } + + return (NVME_SC_SUCCESS); +} + +static size_t +tcp_capsule_data_len(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + return (le32toh(nc->nc_sqe.sgl.length)); +} + +/* NB: cid and ttag are both little-endian already. */ +static int +tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, uint32_t data_len) +{ + struct nvme_tcp_r2t_hdr r2t; + + memset(&r2t, 0, sizeof(r2t)); + r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; + r2t.cccid = cid; + r2t.ttag = ttag; + r2t.r2to = htole32(data_offset); + r2t.r2tl = htole32(data_len); + + return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0)); +} + +static int +tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvmf_tcp_command_buffer *cb; + int error; + uint16_t ttag; + + /* + * Don't bother byte-swapping ttag as it is just a cookie + * value returned by the other end as-is. + */ + ttag = qp->next_ttag++; + + error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len); + if (error != 0) + return (error); + + cb = tcp_alloc_command_buffer(qp, buf, data_offset, len, + nc->nc_sqe.cid, ttag, true); + + /* Parse received PDUs until the data transfer is complete. 
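+	 * Matching H2C_DATA PDUs advance cb->data_xfered inside
+	 * nvmf_tcp_handle_h2c_data(); unrelated capsules that arrive in
+	 * the meantime are queued on rx_capsules for later consumption.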
*/ + while (cb->data_xfered < cb->data_len) { + error = nvmf_tcp_receive_pdu(qp); + if (error != 0) + break; + } + tcp_free_command_buffer(cb); + return (error); +} + +static int +tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + const struct nvmf_tcp_capsule *tc = CTCAP(nc); + const char *icd; + + icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset; + memcpy(buf, icd, len); + return (0); +} + +static int +tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + struct nvmf_association *na = nc->nc_qpair->nq_association; + const struct nvme_sgl_descriptor *sgl; + size_t data_len; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) + return (EINVAL); + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (data_offset + len > data_len) + return (EFBIG); + + if (sgl->type == NVME_SGL_TYPE_ICD) + return (tcp_receive_icd_data(nc, data_offset, buf, len)); + else + return (tcp_receive_r2t_data(nc, data_offset, buf, len)); +} + +/* NB: cid is little-endian already. */ +static int +tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, + uint32_t data_offset, const void *buf, size_t len, bool last_pdu, + bool success) +{ + struct nvme_tcp_c2h_data_hdr c2h; + + memset(&c2h, 0, sizeof(c2h)); + c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; + if (last_pdu) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; + if (success) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; + c2h.cccid = cid; + c2h.datao = htole32(data_offset); + c2h.datal = htole32(len); + + return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), + __DECONST(void *, buf), len)); +} + +static int +tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf, + size_t len) +{ + struct nvmf_association *na = nc->nc_qpair->nq_association; + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + const struct nvme_sgl_descriptor *sgl; + const char *src; + size_t todo; + uint32_t data_len, data_offset; + int error; + bool last_pdu, send_success_flag; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) + return (EINVAL); + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (len != data_len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (EFBIG); + } + + if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (EINVAL); + } + + /* Use the SUCCESS flag if SQ flow control is disabled. */ + send_success_flag = !qp->qp.nq_flow_control; + + /* + * Write out one or more C2H_DATA PDUs containing the data. + * Each PDU is arbitrarily capped at 256k. 
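+	 * For example, a 600 KiB transfer goes out as two 256 KiB PDUs
+	 * plus an 88 KiB PDU carrying LAST_PDU (and SUCCESS when SQ
+	 * flow control is disabled, eliding the separate response
+	 * capsule).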
+	 */
+	data_offset = 0;
+	src = buf;
+	while (len > 0) {
+		if (len > 256 * 1024) {
+			todo = 256 * 1024;
+			last_pdu = false;
+		} else {
+			todo = len;
+			last_pdu = true;
+		}
+		error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
+		    src, todo, last_pdu, last_pdu && send_success_flag);
+		if (error != 0) {
+			nvmf_send_generic_error(nc,
+			    NVME_SC_TRANSIENT_TRANSPORT_ERROR);
+			return (error);
+		}
+		data_offset += todo;
+		src += todo;
+		len -= todo;
+	}
+	if (!send_success_flag)
+		nvmf_send_success(nc);
+	return (0);
+}
+
+struct nvmf_transport_ops tcp_ops = {
+	.allocate_association = tcp_allocate_association,
+	.update_association = tcp_update_association,
+	.free_association = tcp_free_association,
+	.allocate_qpair = tcp_allocate_qpair,
+	.free_qpair = tcp_free_qpair,
+	.kernel_handoff_params = tcp_kernel_handoff_params,
+	.allocate_capsule = tcp_allocate_capsule,
+	.free_capsule = tcp_free_capsule,
+	.transmit_capsule = tcp_transmit_capsule,
+	.receive_capsule = tcp_receive_capsule,
+	.validate_command_capsule = tcp_validate_command_capsule,
+	.capsule_data_len = tcp_capsule_data_len,
+	.receive_controller_data = tcp_receive_controller_data,
+	.send_controller_data = tcp_send_controller_data,
+};
diff --git a/lib/libnvmf/nvmf_transport.c b/lib/libnvmf/nvmf_transport.c
new file mode 100644
index 000000000000..1a8505f2a993
--- /dev/null
+++ b/lib/libnvmf/nvmf_transport.c
@@ -0,0 +1,269 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin
+ */
+
+#include <sys/refcount.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+
+struct nvmf_association *
+nvmf_allocate_association(enum nvmf_trtype trtype, bool controller,
+    const struct nvmf_association_params *params)
+{
+	struct nvmf_transport_ops *ops;
+	struct nvmf_association *na;
+
+	switch (trtype) {
+	case NVMF_TRTYPE_TCP:
+		ops = &tcp_ops;
+		break;
+	default:
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	na = ops->allocate_association(controller, params);
+	if (na == NULL)
+		return (NULL);
+
+	na->na_ops = ops;
+	na->na_trtype = trtype;
+	na->na_controller = controller;
+	na->na_params = *params;
+	na->na_last_error = NULL;
+	refcount_init(&na->na_refs, 1);
+	return (na);
+}
+
+void
+nvmf_update_assocation(struct nvmf_association *na,
+    const struct nvme_controller_data *cdata)
+{
+	na->na_ops->update_association(na, cdata);
+}
+
+void
+nvmf_free_association(struct nvmf_association *na)
+{
+	if (refcount_release(&na->na_refs)) {
+		free(na->na_last_error);
+		na->na_ops->free_association(na);
+	}
+}
+
+const char *
+nvmf_association_error(const struct nvmf_association *na)
+{
+	return (na->na_last_error);
+}
+
+void
+na_clear_error(struct nvmf_association *na)
+{
+	free(na->na_last_error);
+	na->na_last_error = NULL;
+}
+
+void
+na_error(struct nvmf_association *na, const char *fmt, ...)
+{ + va_list ap; + char *str; + + if (na->na_last_error != NULL) + return; + va_start(ap, fmt); + vasprintf(&str, fmt, ap); + va_end(ap); + na->na_last_error = str; +} + +struct nvmf_qpair * +nvmf_allocate_qpair(struct nvmf_association *na, + const struct nvmf_qpair_params *params) +{ + struct nvmf_qpair *qp; + + na_clear_error(na); + qp = na->na_ops->allocate_qpair(na, params); + if (qp == NULL) + return (NULL); + + refcount_acquire(&na->na_refs); + qp->nq_association = na; + qp->nq_admin = params->admin; + TAILQ_INIT(&qp->nq_rx_capsules); + return (qp); +} + +void +nvmf_free_qpair(struct nvmf_qpair *qp) +{ + struct nvmf_association *na; + struct nvmf_capsule *nc, *tc; + + TAILQ_FOREACH_SAFE(nc, &qp->nq_rx_capsules, nc_link, tc) { + TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); + nvmf_free_capsule(nc); + } + na = qp->nq_association; + na->na_ops->free_qpair(qp); + nvmf_free_association(na); +} + +struct nvmf_capsule * +nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe) +{ + struct nvmf_capsule *nc; + + nc = qp->nq_association->na_ops->allocate_capsule(qp); + if (nc == NULL) + return (NULL); + + nc->nc_qpair = qp; + nc->nc_qe_len = sizeof(struct nvme_command); + memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len); + + /* 4.2 of NVMe base spec: Fabrics always uses SGL. */ + nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT); + nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL); + return (nc); +} + +struct nvmf_capsule * +nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe) +{ + struct nvmf_capsule *nc; + + nc = qp->nq_association->na_ops->allocate_capsule(qp); + if (nc == NULL) + return (NULL); + + nc->nc_qpair = qp; + nc->nc_qe_len = sizeof(struct nvme_completion); + memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len); + return (nc); +} + +int +nvmf_capsule_append_data(struct nvmf_capsule *nc, void *buf, size_t len, + bool send) +{ + if (nc->nc_qe_len == sizeof(struct nvme_completion)) + return (EINVAL); + if (nc->nc_data_len != 0) + return (EBUSY); + + nc->nc_data = buf; + nc->nc_data_len = len; + nc->nc_send_data = send; + return (0); +} + +void +nvmf_free_capsule(struct nvmf_capsule *nc) +{ + nc->nc_qpair->nq_association->na_ops->free_capsule(nc); +} + +int +nvmf_transmit_capsule(struct nvmf_capsule *nc) +{ + return (nc->nc_qpair->nq_association->na_ops->transmit_capsule(nc)); +} + +int +nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + return (qp->nq_association->na_ops->receive_capsule(qp, ncp)); +} + +const void * +nvmf_capsule_sqe(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + return (&nc->nc_sqe); +} + +const void * +nvmf_capsule_cqe(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_completion)); + return (&nc->nc_cqe); +} + +uint8_t +nvmf_validate_command_capsule(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + + if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL) + return (NVME_SC_INVALID_FIELD); + + return (nc->nc_qpair->nq_association->na_ops->validate_command_capsule(nc)); +} + +size_t +nvmf_capsule_data_len(const struct nvmf_capsule *nc) +{ + return (nc->nc_qpair->nq_association->na_ops->capsule_data_len(nc)); +} + +int +nvmf_receive_controller_data(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len) +{ + return (nc->nc_qpair->nq_association->na_ops->receive_controller_data(nc, + data_offset, buf, len)); +} + +int +nvmf_send_controller_data(const struct nvmf_capsule *nc, const void *buf, + size_t len) +{ + 
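+	/* Dispatched to the transport backend, e.g. tcp_send_controller_data(). */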
return (nc->nc_qpair->nq_association->na_ops->send_controller_data(nc,
+	    buf, len));
+}
+
+int
+nvmf_kernel_handoff_params(struct nvmf_qpair *qp,
+    struct nvmf_handoff_qpair_params *qparams)
+{
+	memset(qparams, 0, sizeof(*qparams));
+	qparams->admin = qp->nq_admin;
+	qparams->sq_flow_control = qp->nq_flow_control;
+	qparams->qsize = qp->nq_qsize;
+	qparams->sqhd = qp->nq_sqhd;
+	qparams->sqtail = qp->nq_sqtail;
+	return (qp->nq_association->na_ops->kernel_handoff_params(qp, qparams));
+}
+
+const char *
+nvmf_transport_type(uint8_t trtype)
+{
+	static _Thread_local char buf[8];
+
+	switch (trtype) {
+	case NVMF_TRTYPE_RDMA:
+		return ("RDMA");
+	case NVMF_TRTYPE_FC:
+		return ("Fibre Channel");
+	case NVMF_TRTYPE_TCP:
+		return ("TCP");
+	case NVMF_TRTYPE_INTRA_HOST:
+		return ("Intra-host");
+	default:
+		snprintf(buf, sizeof(buf), "0x%02x", trtype);
+		return (buf);
+	}
+}
diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk
index ebe788a346aa..0bde68ea443b 100644
--- a/share/mk/src.libnames.mk
+++ b/share/mk/src.libnames.mk
@@ -56,6 +56,7 @@ _INTERNALLIBS= \
 	netbsd \
 	ntp \
 	ntpevent \
+	nvmf \
 	openbsd \
 	opts \
 	parse \
@@ -599,6 +600,9 @@ LIBNV?= ${LIBNVDIR}/libnv${PIE_SUFFIX}.a
 LIBISCSIUTILDIR= ${_LIB_OBJTOP}/lib/libiscsiutil
 LIBISCSIUTIL?= ${LIBISCSIUTILDIR}/libiscsiutil${PIE_SUFFIX}.a
 
+LIBNVMFDIR= ${_LIB_OBJTOP}/lib/libnvmf
+LIBNVMF?= ${LIBNVMFDIR}/libnvmf${PIE_SUFFIX}.a
+
 LIBTELNETDIR= ${_LIB_OBJTOP}/lib/libtelnet
 LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a