nvmfd: A simple userspace daemon for the NVMe over Fabrics controller

This daemon can operate as a purely userspace controller exporting one
or more simulated RAM disks or local block devices as NVMe namespaces
to a remote host.  In this case the daemon provides a discovery
controller with a single entry for an I/O controller.

nvmfd can also offload I/O controller queue pairs to the nvmft.ko
in-kernel Fabrics controller when -K is passed.  In this mode, nvmfd
still accepts connections and performs initial transport-specific
negotiation in userland.  The daemon still provides a userspace-only
discovery controller with a single entry for an I/O controller.
However, queue pairs for the I/O controller are handed off to the CTL
NVMF frontend.

Eventually ctld(8) should be refactored to provide an abstraction
for the frontend protocol and the discovery and the kernel mode of
this daemon should be merged into ctld(8).  At that point this daemon
can be moved to tools/tools/nvmf as a debugging tool (mostly as sample
code for a userspace controller using libnvmf).

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44731
This commit is contained in:
John Baldwin 2024-05-02 16:35:40 -07:00
parent 09a931554a
commit a8089ea5ae
10 changed files with 2255 additions and 0 deletions

View File

@ -56,6 +56,7 @@ SUBDIR= adduser \
nfsuserd \
nmtree \
nologin \
nvmfd \
pciconf \
periodic \
pnfsdscopymr \

14
usr.sbin/nvmfd/Makefile Normal file
View File

@ -0,0 +1,14 @@
.include <src.opts.mk>

PACKAGE=nvme-tools
PROG=	nvmfd
SRCS=	nvmfd.c controller.c ctl.c devices.c discovery.c gsb_crc32.c io.c
MAN=	nvmfd.8

# gsb_crc32.c is built directly from the kernel sources.
.PATH: ${SRCTOP}/sys/libkern

CFLAGS+= -I${SRCTOP}/lib/libnvmf
# ctl.c needs the in-tree CAM/CTL headers.
CFLAGS.ctl.c= -I${SRCTOP}/sys
CWARNFLAGS.gsb_crc32.c= -Wno-cast-align

LIBADD+= nvmf pthread util nv

# All variables must be set before this include so the generated
# rules pick them up; the original placed per-file flags after it.
.include <bsd.prog.mk>

244
usr.sbin/nvmfd/controller.c Normal file
View File

@ -0,0 +1,244 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <err.h>
#include <errno.h>
#include <libnvmf.h>
#include <stdlib.h>
#include "internal.h"
/*
 * Per-connection controller state: the admin queue pair plus shadow
 * copies of the controller register file and the Identify Controller
 * data reported to the host.
 */
struct controller {
	struct nvmf_qpair *qp;	/* Admin queue pair. */

	uint64_t cap;		/* Controller Capabilities (CAP). */
	uint32_t vs;		/* Version (VS). */
	uint32_t cc;		/* Controller Configuration (CC). */
	uint32_t csts;		/* Controller Status (CSTS). */

	bool shutdown;		/* Host requested shutdown or reset. */

	struct nvme_controller_data cdata;	/* IDENTIFY payload. */
};
/*
 * Apply a host write to the CC register.  Returns false if the write
 * is rejected.  Side effects on CSTS implement the shutdown and
 * controller-reset transitions.
 */
static bool
update_cc(struct controller *c, uint32_t new_cc)
{
	uint32_t changes;

	/* Once shut down, further CC writes are ignored. */
	if (c->shutdown)
		return (false);

	if (!nvmf_validate_cc(c->qp, c->cap, c->cc, new_cc))
		return (false);

	changes = c->cc ^ new_cc;
	c->cc = new_cc;

	/* Handle shutdown requests. */
	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
		/* Report the shutdown as complete immediately. */
		c->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		c->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
		c->shutdown = true;
	}

	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
			/* Controller reset. */
			c->csts = 0;
			c->shutdown = true;
		} else
			/* Enabled: report ready at once. */
			c->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
	}
	return (true);
}
static void
handle_property_get(const struct controller *c, const struct nvmf_capsule *nc,
const struct nvmf_fabric_prop_get_cmd *pget)
{
struct nvmf_fabric_prop_get_rsp rsp;
nvmf_init_cqe(&rsp, nc, 0);
switch (le32toh(pget->ofst)) {
case NVMF_PROP_CAP:
if (pget->attrib.size != NVMF_PROP_SIZE_8)
goto error;
rsp.value.u64 = htole64(c->cap);
break;
case NVMF_PROP_VS:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = htole32(c->vs);
break;
case NVMF_PROP_CC:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = htole32(c->cc);
break;
case NVMF_PROP_CSTS:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = htole32(c->csts);
break;
default:
goto error;
}
nvmf_send_response(nc, &rsp);
return;
error:
nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
/*
 * Handle a Fabrics Property Set command.  CC is the only writable
 * property supported.
 */
static void
handle_property_set(struct controller *c, const struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_set_cmd *pset)
{
	if (le32toh(pset->ofst) == NVMF_PROP_CC &&
	    pset->attrib.size == NVMF_PROP_SIZE_4 &&
	    update_cc(c, le32toh(pset->value.u32.low))) {
		nvmf_send_success(nc);
		return;
	}
	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
/*
 * Dispatch a Fabrics command capsule received on the admin queue.
 */
static void
handle_fabrics_command(struct controller *c,
    const struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
{
	switch (fc->fctype) {
	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
		handle_property_get(c, nc,
		    (const struct nvmf_fabric_prop_get_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
		handle_property_set(c, nc,
		    (const struct nvmf_fabric_prop_set_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_CONNECT:
		/* CONNECT is only valid as the first capsule on a queue. */
		warnx("CONNECT command on connected queue");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		break;
	case NVMF_FABRIC_COMMAND_DISCONNECT:
		/* DISCONNECT is only defined for I/O queues. */
		warnx("DISCONNECT command on admin queue");
		nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
		break;
	default:
		warnx("Unsupported fabrics command %#x", fc->fctype);
		nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
		break;
	}
}
/*
 * Handle IDENTIFY on the admin queue.  Only CNS 1 (Identify
 * Controller) is supported here; namespace-related CNS values are
 * handled by the per-controller callback instead.
 */
static void
handle_identify_command(const struct controller *c,
    const struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	uint8_t cns = le32toh(cmd->cdw10) & 0xFF;

	if (cns != 1) {
		warnx("Unsupported CNS %#x for IDENTIFY", cns);
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return;
	}
	nvmf_send_controller_data(nc, &c->cdata, sizeof(c->cdata));
}
/*
 * Main admin queue loop: receive command capsules until the host
 * disconnects, dispatching each one either to the caller-provided
 * handler (cb) or to the generic handlers above.  cb returns true if
 * it consumed the command.
 */
void
controller_handle_admin_commands(struct controller *c, handle_command *cb,
    void *cb_arg)
{
	struct nvmf_qpair *qp = c->qp;
	const struct nvme_command *cmd;
	struct nvmf_capsule *nc;
	int error;

	for (;;) {
		error = nvmf_controller_receive_capsule(qp, &nc);
		if (error != 0) {
			/* ECONNRESET is the normal disconnect path. */
			if (error != ECONNRESET)
				warnc(error, "Failed to read command capsule");
			break;
		}

		cmd = nvmf_capsule_sqe(nc);

		/*
		 * Only permit Fabrics commands while a controller is
		 * disabled.
		 */
		if (NVMEV(NVME_CC_REG_EN, c->cc) == 0 &&
		    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
			/* Fixed typo; warnx() appends its own newline. */
			warnx("Unsupported admin opcode %#x while disabled",
			    cmd->opc);
			nvmf_send_generic_error(nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			nvmf_free_capsule(nc);
			continue;
		}

		/* Give the type-specific handler the first shot. */
		if (cb(nc, cmd, cb_arg)) {
			nvmf_free_capsule(nc);
			continue;
		}

		switch (cmd->opc) {
		case NVME_OPC_FABRICS_COMMANDS:
			handle_fabrics_command(c, nc,
			    (const struct nvmf_fabric_cmd *)cmd);
			break;
		case NVME_OPC_IDENTIFY:
			handle_identify_command(c, nc, cmd);
			break;
		default:
			warnx("Unsupported admin opcode %#x", cmd->opc);
			nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
			break;
		}
		nvmf_free_capsule(nc);
	}
}
/*
 * Allocate a controller bound to the admin queue pair @qp.  @cdata is
 * copied and served back for IDENTIFY (CNS 1).  Exits on allocation
 * failure.  The caller retains ownership of @qp.
 */
struct controller *
init_controller(struct nvmf_qpair *qp,
    const struct nvme_controller_data *cdata)
{
	struct controller *c;

	c = calloc(1, sizeof(*c));
	if (c == NULL)
		err(1, "calloc");	/* Was dereferenced unchecked. */
	c->qp = qp;
	c->cap = nvmf_controller_cap(c->qp);
	c->vs = cdata->ver;
	c->cdata = *cdata;

	return (c);
}
/*
 * Release a controller allocated by init_controller().  The caller
 * owns the queue pair and must free it separately.
 */
void
free_controller(struct controller *c)
{
	free(c);
}

139
usr.sbin/nvmfd/ctl.c Normal file
View File

@ -0,0 +1,139 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/linker.h>
#include <sys/nv.h>
#include <sys/time.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libnvmf.h>
#include <string.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_ioctl.h>
#include "internal.h"
/* Cached descriptor for /dev/ctl; -1 until open_ctl() runs. */
static int ctl_fd = -1;
/* CTL port id returned by CTL_REQ_CREATE for this daemon's nvmf port. */
static int ctl_port;
/*
 * Open /dev/ctl once, loading ctl.ko on demand.  Exits on failure.
 */
static void
open_ctl(void)
{
	/* Already open?  (A valid descriptor may legitimately be 0.) */
	if (ctl_fd >= 0)
		return;

	ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR);
	if (ctl_fd == -1 && errno == ENOENT) {
		/* Try to autoload the CTL module on first use. */
		if (kldload("ctl") == -1)
			err(1, "Failed to load ctl.ko");
		ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR);
	}
	if (ctl_fd == -1)
		err(1, "Failed to open %s", CTL_DEFAULT_DEV);
}
/*
 * Create and enable a CTL "nvmf" frontend port for @subnqn so the
 * kernel can service I/O queue pairs handed off by this daemon.
 * Exits on any failure.
 */
void
init_ctl_port(const char *subnqn, const struct nvmf_association_params *params)
{
	char result_buf[256];
	struct ctl_port_entry entry;
	struct ctl_req req;
	nvlist_t *nvl;

	open_ctl();

	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "subnqn", subnqn);

	/* XXX: Hardcoded in discovery.c */
	nvlist_add_stringf(nvl, "portid", "%u", 1);
	nvlist_add_stringf(nvl, "max_io_qsize", "%u", params->max_io_qsize);

	memset(&req, 0, sizeof(req));
	strlcpy(req.driver, "nvmf", sizeof(req.driver));
	req.reqtype = CTL_REQ_CREATE;
	req.args = nvlist_pack(nvl, &req.args_len);
	if (req.args == NULL)
		errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_CREATE");
	req.result = result_buf;
	req.result_len = sizeof(result_buf);
	if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0)
		err(1, "ioctl(CTL_PORT/CTL_REQ_CREATE)");
	if (req.status == CTL_LUN_ERROR)
		errx(1, "Failed to create CTL port: %s", req.error_str);
	if (req.status != CTL_LUN_OK)
		errx(1, "Failed to create CTL port: %d", req.status);
	free(req.args);		/* nvlist_pack() allocates; was leaked. */
	nvlist_destroy(nvl);

	/* Fetch the port id CTL assigned to the new port. */
	nvl = nvlist_unpack(result_buf, req.result_len, 0);
	if (nvl == NULL)
		errx(1, "Failed to unpack nvlist from CTL_PORT/CTL_REQ_CREATE");
	ctl_port = nvlist_get_number(nvl, "port_id");
	nvlist_destroy(nvl);

	memset(&entry, 0, sizeof(entry));
	entry.targ_port = ctl_port;
	/* ioctl() sets errno on failure, so report it with err(), not errx(). */
	if (ioctl(ctl_fd, CTL_ENABLE_PORT, &entry) != 0)
		err(1, "ioctl(CTL_ENABLE_PORT)");
}
/*
 * Remove the CTL "nvmf" frontend port created by init_ctl_port().
 * Exits on any failure.
 */
void
shutdown_ctl_port(const char *subnqn)
{
	struct ctl_req req;
	nvlist_t *nvl;

	open_ctl();

	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "subnqn", subnqn);

	memset(&req, 0, sizeof(req));
	strlcpy(req.driver, "nvmf", sizeof(req.driver));
	req.reqtype = CTL_REQ_REMOVE;
	req.args = nvlist_pack(nvl, &req.args_len);
	if (req.args == NULL)
		errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_REMOVE");
	if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0)
		err(1, "ioctl(CTL_PORT/CTL_REQ_REMOVE)");
	if (req.status == CTL_LUN_ERROR)
		errx(1, "Failed to remove CTL port: %s", req.error_str);
	if (req.status != CTL_LUN_OK)
		errx(1, "Failed to remove CTL port: %d", req.status);
	free(req.args);		/* nvlist_pack() allocates; was leaked. */
	nvlist_destroy(nvl);
}
/*
 * Hand a connected I/O queue pair off to the in-kernel CTL NVMF
 * frontend.  The original CONNECT capsule SQE and data are passed
 * along so the kernel can complete the association.  On failure the
 * qpair is dropped after a warning.
 */
void
ctl_handoff_qpair(struct nvmf_qpair *qp,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct ctl_nvmf req;
	int error;

	memset(&req, 0, sizeof(req));
	req.type = CTL_NVMF_HANDOFF;
	error = nvmf_handoff_controller_qpair(qp, &req.data.handoff);
	if (error != 0) {
		warnc(error, "Failed to prepare qpair for handoff");
		return;
	}

	req.data.handoff.cmd = cmd;
	req.data.handoff.data = data;
	if (ioctl(ctl_fd, CTL_NVMF, &req) != 0)
		warn("ioctl(CTL_NVMF/CTL_NVMF_HANDOFF)");
}

386
usr.sbin/nvmfd/devices.c Normal file
View File

@ -0,0 +1,386 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/disk.h>
#include <sys/gsb_crc32.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <net/ieee_oui.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libnvmf.h>
#include <libutil.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "internal.h"
#define	RAMDISK_PREFIX	"ramdisk:"

/*
 * A backing store for one NVMe namespace: an anonymous memory buffer
 * (RAMDISK), a regular file (FILE), or a disk-like character device
 * (CDEV).
 */
struct backing_device {
	enum { RAMDISK, FILE, CDEV } type;
	union {
		int	fd;	/* FILE, CDEV */
		void	*mem;	/* RAMDISK */
	};
	u_int	sector_size;
	uint64_t nlbas;		/* Size in logical blocks. */
	uint64_t eui64;		/* Unique namespace identifier. */
};

/* Devices map to namespace IDs 1..ndevices in command-line order. */
static struct backing_device *devices;
static u_int ndevices;
/*
 * Build an EUI-64 from the FreeBSD OUI reserved for NVMe and a
 * 32-bit discriminator supplied by the caller.
 */
static uint64_t
generate_eui64(uint32_t low)
{
	return (OUI_FREEBSD_NVME_LOW << 16 | low);
}
/*
 * Standard CRC32C with the usual pre/post inversion, built on the
 * kernel's calculate_crc32c() (compiled in via sys/libkern).
 */
static uint32_t
crc32(const void *buf, size_t len)
{
	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
}
/*
 * Configure a namespace backed by anonymous zeroed memory.  @config
 * is the size parsed via expand_number() (e.g. "4g") and must be a
 * multiple of the 512-byte sector size.  Exits on error.
 */
static void
init_ramdisk(const char *config, struct backing_device *dev)
{
	static uint32_t ramdisk_idx = 1;
	uint64_t num;

	dev->type = RAMDISK;
	dev->sector_size = 512;
	if (expand_number(config, &num))
		errx(1, "Invalid ramdisk specification: %s", config);
	if ((num % dev->sector_size) != 0)
		errx(1, "Invalid ramdisk size %ju", (uintmax_t)num);

	dev->mem = calloc(num, 1);
	if (dev->mem == NULL)
		err(1, "Failed to allocate %ju bytes for ramdisk",
		    (uintmax_t)num);	/* Was an unchecked calloc. */
	dev->nlbas = num / dev->sector_size;
	dev->eui64 = generate_eui64('M' << 24 | ramdisk_idx++);
}
static void
init_filedevice(const char *config, int fd, struct stat *sb,
struct backing_device *dev)
{
dev->type = FILE;
dev->fd = fd;
dev->sector_size = 512;
if ((sb->st_size % dev->sector_size) != 0)
errx(1, "File size is not a multiple of 512: %s", config);
dev->nlbas = sb->st_size / dev->sector_size;
dev->eui64 = generate_eui64('F' << 24 |
(crc32(config, strlen(config)) & 0xffffff));
}
/*
 * Configure a namespace backed by a disk-like character device,
 * using the device's own sector size and media size.
 */
static void
init_chardevice(const char *config, int fd, struct backing_device *dev)
{
	off_t len;

	dev->type = CDEV;
	dev->fd = fd;
	if (ioctl(fd, DIOCGSECTORSIZE, &dev->sector_size) != 0)
		err(1, "Failed to fetch sector size for %s", config);
	/* Fixed copy-pasted error message ("sector size" -> "media size"). */
	if (ioctl(fd, DIOCGMEDIASIZE, &len) != 0)
		err(1, "Failed to fetch media size for %s", config);
	dev->nlbas = len / dev->sector_size;
	dev->eui64 = generate_eui64('C' << 24 |
	    (crc32(config, strlen(config)) & 0xffffff));
}
/*
 * Initialize one backing device from its command-line specification:
 * either "ramdisk:<size>" or a path to a regular file or character
 * device.  Exits on any error.
 */
static void
init_device(const char *config, struct backing_device *dev)
{
	struct stat sb;
	int fd;

	/* Check for a RAM disk. */
	if (strncmp(RAMDISK_PREFIX, config, strlen(RAMDISK_PREFIX)) == 0) {
		init_ramdisk(config + strlen(RAMDISK_PREFIX), dev);
		return;
	}

	fd = open(config, O_RDWR);
	if (fd == -1)
		err(1, "Failed to open %s", config);
	if (fstat(fd, &sb) == -1)
		err(1, "fstat");

	if (S_ISCHR(sb.st_mode))
		init_chardevice(config, fd, dev);
	else if (S_ISREG(sb.st_mode))
		init_filedevice(config, fd, &sb, dev);
	else
		errx(1, "Invalid file type for %s", config);
}
/*
 * Create a backing device (namespace) for each command-line argument.
 * Exits on any error.
 */
void
register_devices(int ac, char **av)
{
	ndevices = ac;
	devices = calloc(ndevices, sizeof(*devices));
	if (devices == NULL && ndevices != 0)
		err(1, "calloc");	/* Was an unchecked calloc. */
	for (int i = 0; i < ac; i++)
		init_device(av[i], &devices[i]);
}
/* Number of registered backing devices (== the highest valid NSID). */
u_int
device_count(void)
{
	return (ndevices);
}
/*
 * Map a namespace ID to its backing device.  NSIDs are 1-based;
 * 0 and out-of-range values return NULL.
 */
static struct backing_device *
lookup_device(uint32_t nsid)
{
	if (nsid < 1 || nsid > ndevices)
		return (NULL);
	return (devices + (nsid - 1));
}
/*
 * Fill @nslist with the IDs of active namespaces greater than @nsid,
 * in increasing order, up to the capacity of the list.  Unused slots
 * stay zero.
 */
void
device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist)
{
	u_int slot;

	memset(nslist, 0, sizeof(*nslist));
	for (slot = 0; slot < nitems(nslist->ns); slot++) {
		if (++nsid > ndevices)
			break;
		nslist->ns[slot] = htole32(nsid);
	}
}
/*
 * Fill @buf with the Namespace Identification Descriptor list for
 * @nsid.  A single EUI-64 descriptor is returned.  @buf must be at
 * least 4096 bytes (the full Identify payload is zeroed here).
 */
bool
device_identification_descriptor(uint32_t nsid, void *buf)
{
	struct backing_device *dev;
	char *p;

	dev = lookup_device(nsid);
	if (dev == NULL)
		return (false);

	memset(buf, 0, 4096);
	p = buf;

	/* EUI64 */
	*p++ = 1;	/* NIDT: EUI-64 */
	*p++ = 8;	/* NIDL: 8 bytes */
	p += 2;		/* Reserved. */
	be64enc(p, dev->eui64);
	return (true);
}
/*
 * Fill out the Identify Namespace data structure for @nsid.  A
 * single LBA format matching the backing device's sector size is
 * advertised.
 */
bool
device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata)
{
	struct backing_device *dev;

	dev = lookup_device(nsid);
	if (dev == NULL)
		return (false);

	memset(nsdata, 0, sizeof(*nsdata));
	nsdata->nsze = htole64(dev->nlbas);
	/* Fully provisioned: capacity and utilization equal the size. */
	nsdata->ncap = nsdata->nsze;
	nsdata->nuse = nsdata->ncap;
	nsdata->nlbaf = 1 - 1;	/* NLBAF is 0's based: one format. */
	nsdata->flbas = NVMEF(NVME_NS_DATA_FLBAS_FORMAT, 0);
	/* LBADS is log2 of the sector size. */
	nsdata->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS,
	    ffs(dev->sector_size) - 1);
	be64enc(nsdata->eui64, dev->eui64);
	return (true);
}
/*
 * Read exactly @len bytes from @fd at @offset into @buf, retrying on
 * EINTR and short reads.  Returns false on EOF or any other error.
 */
static bool
read_buffer(int fd, void *buf, size_t len, off_t offset)
{
	char *cursor = buf;

	while (len != 0) {
		ssize_t n = pread(fd, cursor, len, offset);

		if (n > 0) {
			cursor += n;
			offset += n;
			len -= n;
			continue;
		}
		if (n == -1 && errno == EINTR)
			continue;
		return (false);
	}
	return (true);
}
/*
 * Service an NVMe READ: transfer @nlb sectors starting at @lba from
 * the namespace's backing store to the host.  RAM disks are sent
 * directly from memory; file and device backends bounce through a
 * temporary buffer.
 */
void
device_read(uint32_t nsid, uint64_t lba, u_int nlb,
    const struct nvmf_capsule *nc)
{
	struct backing_device *dev;
	char *p, *src;
	off_t off;
	size_t len;

	dev = lookup_device(nsid);
	if (dev == NULL) {
		nvmf_send_generic_error(nc,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return;
	}

	/* Reject ranges that wrap or extend past the namespace. */
	if (lba + nlb < lba || lba + nlb > dev->nlbas) {
		nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE);
		return;
	}

	off = lba * dev->sector_size;
	len = nlb * dev->sector_size;
	if (nvmf_capsule_data_len(nc) != len) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return;
	}

	if (dev->type == RAMDISK) {
		p = NULL;
		src = (char *)dev->mem + off;
	} else {
		p = malloc(len);
		if (p == NULL) {
			/* Was an unchecked malloc. */
			nvmf_send_generic_error(nc,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			return;
		}
		if (!read_buffer(dev->fd, p, len, off)) {
			free(p);
			nvmf_send_generic_error(nc,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			return;
		}
		src = p;
	}

	nvmf_send_controller_data(nc, src, len);
	free(p);
}
/*
 * Write exactly @len bytes from @buf to @fd at @offset, retrying on
 * EINTR and short writes.  Returns false on any other error.
 */
static bool
write_buffer(int fd, const void *buf, size_t len, off_t offset)
{
	const char *cursor = buf;

	while (len != 0) {
		ssize_t n = pwrite(fd, cursor, len, offset);

		if (n > 0) {
			cursor += n;
			offset += n;
			len -= n;
			continue;
		}
		if (n == -1 && errno == EINTR)
			continue;
		return (false);
	}
	return (true);
}
/*
 * Service an NVMe WRITE: receive @nlb sectors from the host and
 * store them at @lba.  RAM disks receive directly into memory; file
 * and device backends bounce through a temporary buffer.
 */
void
device_write(uint32_t nsid, uint64_t lba, u_int nlb,
    const struct nvmf_capsule *nc)
{
	struct backing_device *dev;
	char *p, *dst;
	off_t off;
	size_t len;
	int error;

	dev = lookup_device(nsid);
	if (dev == NULL) {
		nvmf_send_generic_error(nc,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return;
	}

	/* Reject ranges that wrap or extend past the namespace. */
	if (lba + nlb < lba || lba + nlb > dev->nlbas) {
		nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE);
		return;
	}

	off = lba * dev->sector_size;
	len = nlb * dev->sector_size;
	if (nvmf_capsule_data_len(nc) != len) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return;
	}

	if (dev->type == RAMDISK) {
		p = NULL;
		dst = (char *)dev->mem + off;
	} else {
		p = malloc(len);
		if (p == NULL) {
			/* Was an unchecked malloc. */
			nvmf_send_generic_error(nc,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			return;
		}
		dst = p;
	}

	error = nvmf_receive_controller_data(nc, 0, dst, len);
	if (error != 0) {
		nvmf_send_generic_error(nc, NVME_SC_TRANSIENT_TRANSPORT_ERROR);
		free(p);
		return;
	}

	if (dev->type != RAMDISK) {
		if (!write_buffer(dev->fd, p, len, off)) {
			free(p);
			nvmf_send_generic_error(nc,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			return;
		}
	}
	free(p);
	nvmf_send_success(nc);
}
/*
 * Service an NVMe FLUSH for @nsid.  RAM disks have nothing to sync;
 * files use fdatasync() and character devices the flush ioctl.
 */
void
device_flush(uint32_t nsid, const struct nvmf_capsule *nc)
{
	struct backing_device *dev;

	dev = lookup_device(nsid);
	if (dev == NULL) {
		nvmf_send_generic_error(nc,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return;
	}

	switch (dev->type) {
	case RAMDISK:
		/* Nothing to do. */
		break;
	case FILE:
		if (fdatasync(dev->fd) == -1) {
			nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR,
			    NVME_SC_WRITE_FAULTS);
			return;
		}
		break;
	case CDEV:
		if (ioctl(dev->fd, DIOCGFLUSH) == -1) {
			nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR,
			    NVME_SC_WRITE_FAULTS);
			return;
		}
	}
	nvmf_send_success(nc);
}

343
usr.sbin/nvmfd/discovery.c Normal file
View File

@ -0,0 +1,343 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <assert.h>
#include <err.h>
#include <libnvmf.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "internal.h"
/*
 * One Discovery Log Page entry plus whether its listen socket was
 * bound to a wildcard address (in which case the connection's local
 * address is substituted when the log page is built).
 */
struct io_controller_data {
	struct nvme_discovery_log_entry entry;
	bool wildcard;
};

/* Per-connection discovery controller state. */
struct discovery_controller {
	/* Log page built lazily on the first GET_LOG_PAGE. */
	struct nvme_discovery_log *discovery_log;
	size_t discovery_log_len;
	int s;			/* Connected socket. */
};

/* Heap-allocated argument passed to each discovery_thread(). */
struct discovery_thread_arg {
	struct controller *c;
	struct nvmf_qpair *qp;
	int s;
};

/* Log entries for the advertised I/O controllers. */
static struct io_controller_data *io_controllers;
static struct nvmf_association *discovery_na;
static u_int num_io_controllers;
/*
 * Build a discovery log entry describing the I/O controller
 * listening on socket @s.  Returns true if the socket is bound to a
 * wildcard address, in which case traddr must be rewritten
 * per-connection when the log page is served.
 */
static bool
init_discovery_log_entry(struct nvme_discovery_log_entry *entry, int s,
    const char *subnqn)
{
	struct sockaddr_storage ss;
	socklen_t len;
	bool wildcard;

	len = sizeof(ss);
	if (getsockname(s, (struct sockaddr *)&ss, &len) == -1)
		err(1, "getsockname");

	memset(entry, 0, sizeof(*entry));
	entry->trtype = NVMF_TRTYPE_TCP;
	switch (ss.ss_family) {
	case AF_INET:
	{
		struct sockaddr_in *sin;

		sin = (struct sockaddr_in *)&ss;
		entry->adrfam = NVMF_ADRFAM_IPV4;
		/*
		 * NOTE(review): htons() byte-swaps the network-order
		 * port; ntohs() would express the intent more clearly.
		 */
		snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u",
		    htons(sin->sin_port));
		if (inet_ntop(AF_INET, &sin->sin_addr, entry->traddr,
		    sizeof(entry->traddr)) == NULL)
			err(1, "inet_ntop");
		wildcard = (sin->sin_addr.s_addr == htonl(INADDR_ANY));
		break;
	}
	case AF_INET6:
	{
		struct sockaddr_in6 *sin6;

		sin6 = (struct sockaddr_in6 *)&ss;
		entry->adrfam = NVMF_ADRFAM_IPV6;
		snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u",
		    htons(sin6->sin6_port));
		if (inet_ntop(AF_INET6, &sin6->sin6_addr, entry->traddr,
		    sizeof(entry->traddr)) == NULL)
			err(1, "inet_ntop");
		wildcard = (memcmp(&sin6->sin6_addr, &in6addr_any,
		    sizeof(in6addr_any)) == 0);
		break;
	}
	default:
		errx(1, "Unsupported address family %u", ss.ss_family);
	}
	entry->subtype = NVMF_SUBTYPE_NVME;
	if (flow_control_disable)
		entry->treq |= (1 << 2);	/* SQ flow control disabled. */
	entry->portid = htole16(1);	/* XXX: Hardcoded; see ctl.c. */
	entry->cntlid = htole16(NVMF_CNTLID_DYNAMIC);
	entry->aqsz = NVME_MAX_ADMIN_ENTRIES;
	strlcpy(entry->subnqn, subnqn, sizeof(entry->subnqn));
	return (wildcard);
}
/*
 * Create the transport association used to accept discovery
 * controller connections.  Exits on failure.
 */
void
init_discovery(void)
{
	struct nvmf_association_params aparams;

	memset(&aparams, 0, sizeof(aparams));
	/* Discovery controllers never use SQ flow control. */
	aparams.sq_flow_control = false;
	aparams.dynamic_controller_model = true;
	aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES;
	aparams.tcp.pda = 0;
	aparams.tcp.header_digests = header_digests;
	aparams.tcp.data_digests = data_digests;
	aparams.tcp.maxr2t = 1;
	aparams.tcp.maxh2cdata = 256 * 1024;
	discovery_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true,
	    &aparams);
	if (discovery_na == NULL)
		err(1, "Failed to create discovery association");
}
/*
 * Record a discovery log entry for the I/O controller listening on
 * socket @s so it is reported in the Discovery Log Page.
 */
void
discovery_add_io_controller(int s, const char *subnqn)
{
	struct io_controller_data *icd;

	io_controllers = reallocf(io_controllers, (num_io_controllers + 1) *
	    sizeof(*io_controllers));
	if (io_controllers == NULL)
		err(1, "reallocf");	/* Was dereferenced unchecked. */
	icd = &io_controllers[num_io_controllers];
	num_io_controllers++;

	icd->wildcard = init_discovery_log_entry(&icd->entry, s, subnqn);
}
/*
 * Lazily build the Discovery Log Page for one connection.  Entries
 * whose listen socket was bound to a wildcard address are reported
 * with the connection's local address substituted, and wildcard
 * entries of the other address family are omitted.
 */
static void
build_discovery_log_page(struct discovery_controller *dc)
{
	struct sockaddr_storage ss;
	socklen_t len;
	char traddr[256];
	u_int i, nentries;
	uint8_t adrfam;

	if (dc->discovery_log != NULL)
		return;

	len = sizeof(ss);
	if (getsockname(dc->s, (struct sockaddr *)&ss, &len) == -1) {
		warn("build_discovery_log_page: getsockname");
		return;
	}

	memset(traddr, 0, sizeof(traddr));
	switch (ss.ss_family) {
	case AF_INET:
	{
		struct sockaddr_in *sin;

		sin = (struct sockaddr_in *)&ss;
		adrfam = NVMF_ADRFAM_IPV4;
		if (inet_ntop(AF_INET, &sin->sin_addr, traddr,
		    sizeof(traddr)) == NULL) {
			warn("build_discovery_log_page: inet_ntop");
			return;
		}
		break;
	}
	case AF_INET6:
	{
		struct sockaddr_in6 *sin6;

		sin6 = (struct sockaddr_in6 *)&ss;
		adrfam = NVMF_ADRFAM_IPV6;
		if (inet_ntop(AF_INET6, &sin6->sin6_addr, traddr,
		    sizeof(traddr)) == NULL) {
			warn("build_discovery_log_page: inet_ntop");
			return;
		}
		break;
	}
	default:
		assert(false);
	}

	/* First pass: count the entries reported to this connection. */
	nentries = 0;
	for (i = 0; i < num_io_controllers; i++) {
		if (io_controllers[i].wildcard &&
		    io_controllers[i].entry.adrfam != adrfam)
			continue;
		nentries++;
	}

	dc->discovery_log_len = sizeof(*dc->discovery_log) +
	    nentries * sizeof(struct nvme_discovery_log_entry);
	dc->discovery_log = calloc(dc->discovery_log_len, 1);
	if (dc->discovery_log == NULL) {
		warn("build_discovery_log_page: calloc");
		dc->discovery_log_len = 0;
		return;
	}
	dc->discovery_log->numrec = htole64(nentries);
	dc->discovery_log->recfmt = 0;

	/* Second pass: copy the entries. */
	nentries = 0;
	for (i = 0; i < num_io_controllers; i++) {
		if (io_controllers[i].wildcard &&
		    io_controllers[i].entry.adrfam != adrfam)
			continue;
		dc->discovery_log->entries[nentries] = io_controllers[i].entry;
		if (io_controllers[i].wildcard)
			memcpy(dc->discovery_log->entries[nentries].traddr,
			    traddr, sizeof(traddr));
		/*
		 * This increment was missing: every entry overwrote
		 * slot 0 and entries past the first stayed zeroed.
		 */
		nentries++;
	}
}
/*
 * Serve a GET_LOG_PAGE request on a discovery controller.  Only the
 * Discovery Log Page is supported; partial reads at an offset within
 * the page are honored.
 */
static void
handle_get_log_page_command(const struct nvmf_capsule *nc,
    const struct nvme_command *cmd, struct discovery_controller *dc)
{
	uint64_t offset;
	uint32_t length;

	switch (nvmf_get_log_page_id(cmd)) {
	case NVME_LOG_DISCOVERY:
		break;
	default:
		warnx("Unsupported log page %u for discovery controller",
		    nvmf_get_log_page_id(cmd));
		goto error;
	}

	build_discovery_log_page(dc);

	offset = nvmf_get_log_page_offset(cmd);
	if (offset >= dc->discovery_log_len)
		goto error;

	length = nvmf_get_log_page_length(cmd);
	/* Clamp the transfer to the end of the page. */
	if (length > dc->discovery_log_len - offset)
		length = dc->discovery_log_len - offset;

	nvmf_send_controller_data(nc, (char *)dc->discovery_log + offset,
	    length);
	return;
error:
	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
/*
 * Admin-command callback for discovery controllers.  GET_LOG_PAGE is
 * the only command handled here; everything else falls back to the
 * generic controller code.
 */
static bool
discovery_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd,
    void *arg)
{
	struct discovery_controller *dc = arg;

	if (cmd->opc != NVME_OPC_GET_LOG_PAGE)
		return (false);
	handle_get_log_page_command(nc, cmd, dc);
	return (true);
}
/*
 * Per-connection discovery controller thread: run the admin command
 * loop until the host disconnects, then release all per-connection
 * resources (including the argument structure and the socket).
 */
static void *
discovery_thread(void *arg)
{
	struct discovery_thread_arg *dta = arg;
	struct discovery_controller dc;

	/* Nobody joins this thread. */
	pthread_detach(pthread_self());

	memset(&dc, 0, sizeof(dc));
	dc.s = dta->s;

	controller_handle_admin_commands(dta->c, discovery_command, &dc);

	free(dc.discovery_log);
	free_controller(dta->c);
	nvmf_free_qpair(dta->qp);
	close(dta->s);
	free(dta);
	return (NULL);
}
void
handle_discovery_socket(int s)
{
struct nvmf_fabric_connect_data data;
struct nvme_controller_data cdata;
struct nvmf_qpair_params qparams;
struct discovery_thread_arg *dta;
struct nvmf_capsule *nc;
struct nvmf_qpair *qp;
pthread_t thr;
int error;
memset(&qparams, 0, sizeof(qparams));
qparams.tcp.fd = s;
nc = NULL;
qp = nvmf_accept(discovery_na, &qparams, &nc, &data);
if (qp == NULL) {
warnx("Failed to create discovery qpair: %s",
nvmf_association_error(discovery_na));
goto error;
}
if (strcmp(data.subnqn, NVMF_DISCOVERY_NQN) != 0) {
warn("Discovery qpair with invalid SubNQN: %.*s",
(int)sizeof(data.subnqn), data.subnqn);
nvmf_connect_invalid_parameters(nc, true,
offsetof(struct nvmf_fabric_connect_data, subnqn));
goto error;
}
/* Just use a controller ID of 1 for all discovery controllers. */
error = nvmf_finish_accept(nc, 1);
if (error != 0) {
warnc(error, "Failed to send CONNECT reponse");
goto error;
}
nvmf_init_discovery_controller_data(qp, &cdata);
dta = malloc(sizeof(*dta));
dta->qp = qp;
dta->s = s;
dta->c = init_controller(qp, &cdata);
error = pthread_create(&thr, NULL, discovery_thread, dta);
if (error != 0) {
warnc(error, "Failed to create discovery thread");
free_controller(dta->c);
free(dta);
goto error;
}
nvmf_free_capsule(nc);
return;
error:
if (nc != NULL)
nvmf_free_capsule(nc);
if (qp != NULL)
nvmf_free_qpair(qp);
close(s);
}

65
usr.sbin/nvmfd/internal.h Normal file
View File

@ -0,0 +1,65 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
/*
 * Shared declarations for the nvmfd daemon: the generic controller
 * loop (controller.c), the discovery and I/O controller front ends,
 * namespace backing devices, and the CTL kernel handoff.
 */
#ifndef __INTERNAL_H__
#define __INTERNAL_H__

#include <stdbool.h>

struct controller;
struct nvme_command;
struct nvme_controller_data;
struct nvme_ns_list;
struct nvmf_capsule;
struct nvmf_qpair;

/*
 * Admin command callback: return true if the command was handled (a
 * response has been sent), false to fall back to generic handling.
 */
typedef bool handle_command(const struct nvmf_capsule *,
    const struct nvme_command *, void *);

/* Global option flags (defined elsewhere, presumably set in main()). */
extern bool data_digests;
extern bool header_digests;
extern bool flow_control_disable;
extern bool kernel_io;

/* controller.c */
void controller_handle_admin_commands(struct controller *c,
    handle_command *cb, void *cb_arg);
struct controller *init_controller(struct nvmf_qpair *qp,
    const struct nvme_controller_data *cdata);
void free_controller(struct controller *c);

/* discovery.c */
void init_discovery(void);
void handle_discovery_socket(int s);
void discovery_add_io_controller(int s, const char *subnqn);

/* io.c */
void init_io(const char *subnqn);
void handle_io_socket(int s);
void shutdown_io(void);

/* devices.c — namespaces are numbered 1..device_count(). */
void register_devices(int ac, char **av);
u_int device_count(void);
void device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist);
bool device_identification_descriptor(uint32_t nsid, void *buf);
bool device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata);
void device_read(uint32_t nsid, uint64_t lba, u_int nlb,
    const struct nvmf_capsule *nc);
void device_write(uint32_t nsid, uint64_t lba, u_int nlb,
    const struct nvmf_capsule *nc);
void device_flush(uint32_t nsid, const struct nvmf_capsule *nc);

/* ctl.c — kernel handoff used in -K mode. */
void init_ctl_port(const char *subnqn,
    const struct nvmf_association_params *params);
void ctl_handoff_qpair(struct nvmf_qpair *qp,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data);
void shutdown_ctl_port(const char *subnqn);

#endif /* !__INTERNAL_H__ */

677
usr.sbin/nvmfd/io.c Normal file
View File

@ -0,0 +1,677 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/sysctl.h>
#include <err.h>
#include <errno.h>
#include <libnvmf.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "internal.h"
/*
 * State for the single active I/O controller.  Admin commands are
 * serviced in userspace; I/O queue pairs are either serviced by
 * per-queue threads or handed off to the kernel (-K).
 */
struct io_controller {
	struct controller *c;	/* Generic admin-queue state. */

	u_int num_io_queues;	/* Count set via SET_FEATURES. */
	u_int active_io_queues;	/* Queues connected so far. */
	struct nvmf_qpair **io_qpairs;
	int *io_sockets;

	struct nvme_firmware_page fp;
	struct nvme_health_information_page hip;
	/* Presumably partial data-units-read/-written accumulators;
	 * their use is not visible in this chunk — verify in io.c. */
	uint16_t partial_dur;
	uint16_t partial_duw;

	uint16_t cntlid;
	char hostid[16];
	char hostnqn[NVME_NQN_FIELD_SIZE];
};

/* Association shared by all I/O controller connections. */
static struct nvmf_association *io_na;
/* Protect and signal controller/queue state across threads. */
static pthread_cond_t io_cond;
static pthread_mutex_t io_na_mutex;
static struct io_controller *io_controller;
static const char *nqn;		/* Cached SubNQN for shutdown_io(). */
static char serial[NVME_SERIAL_NUMBER_LENGTH];
/*
 * Set up the I/O controller association and, in -K mode, the kernel
 * CTL port.  @subnqn is cached for shutdown_io().  Exits on failure.
 */
void
init_io(const char *subnqn)
{
	struct nvmf_association_params aparams;
	u_long hostid;
	size_t len;

	memset(&aparams, 0, sizeof(aparams));
	aparams.sq_flow_control = !flow_control_disable;
	aparams.dynamic_controller_model = true;
	aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES;
	aparams.max_io_qsize = NVMF_MAX_IO_ENTRIES;
	aparams.tcp.pda = 0;
	aparams.tcp.header_digests = header_digests;
	aparams.tcp.data_digests = data_digests;
	aparams.tcp.maxr2t = 1;
	aparams.tcp.maxh2cdata = 256 * 1024;
	io_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true,
	    &aparams);
	if (io_na == NULL)
		err(1, "Failed to create I/O controller association");

	nqn = subnqn;

	/* Generate a serial number from the kern.hostid node. */
	len = sizeof(hostid);
	if (sysctlbyname("kern.hostid", &hostid, &len, NULL, 0) == -1)
		err(1, "sysctl: kern.hostid");
	nvmf_controller_serial(serial, sizeof(serial), hostid);

	pthread_cond_init(&io_cond, NULL);
	pthread_mutex_init(&io_na_mutex, NULL);

	if (kernel_io)
		init_ctl_port(subnqn, &aparams);
}
/* Tear down the kernel CTL port created by init_io() in -K mode. */
void
shutdown_io(void)
{
	if (kernel_io)
		shutdown_ctl_port(nqn);
}
static void
handle_get_log_page(struct io_controller *ioc, const struct nvmf_capsule *nc,
const struct nvme_command *cmd)
{
uint64_t offset;
uint32_t numd;
size_t len;
uint8_t lid;
lid = le32toh(cmd->cdw10) & 0xff;
numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
if (offset % 3 != 0)
goto error;
len = (numd + 1) * 4;
switch (lid) {
case NVME_LOG_ERROR:
{
void *buf;
if (len % sizeof(struct nvme_error_information_entry) != 0)
goto error;
buf = calloc(1, len);
nvmf_send_controller_data(nc, buf, len);
free(buf);
return;
}
case NVME_LOG_HEALTH_INFORMATION:
if (len != sizeof(ioc->hip))
goto error;
nvmf_send_controller_data(nc, &ioc->hip, sizeof(ioc->hip));
return;
case NVME_LOG_FIRMWARE_SLOT:
if (len != sizeof(ioc->fp))
goto error;
nvmf_send_controller_data(nc, &ioc->fp, sizeof(ioc->fp));
return;
default:
warnx("Unsupported page %#x for GET_LOG_PAGE\n", lid);
goto error;
}
error:
nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
/*
 * Handle IDENTIFY CNS values that target namespaces (0, 2, and 3).
 * Returns false for CNS values the generic controller code should
 * handle instead (e.g. CNS 1, Identify Controller).
 */
static bool
handle_io_identify_command(const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	struct nvme_namespace_data nsdata;
	struct nvme_ns_list nslist;
	uint32_t nsid;
	uint8_t cns;

	cns = le32toh(cmd->cdw10) & 0xFF;
	switch (cns) {
	case 0:	/* Namespace data. */
		if (!device_namespace_data(le32toh(cmd->nsid), &nsdata)) {
			nvmf_send_generic_error(nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			return (true);
		}

		nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata));
		return (true);
	case 2:	/* Active namespace list. */
		nsid = le32toh(cmd->nsid);
		/* 0xfffffffe and 0xffffffff are reserved NSIDs. */
		if (nsid >= 0xfffffffe) {
			nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
			return (true);
		}

		device_active_nslist(nsid, &nslist);
		nvmf_send_controller_data(nc, &nslist, sizeof(nslist));
		return (true);
	case 3:	/* Namespace Identification Descriptor list. */
		/*
		 * nsdata is reused here purely as a scratch buffer for
		 * the raw descriptor list written by
		 * device_identification_descriptor().
		 */
		if (!device_identification_descriptor(le32toh(cmd->nsid),
		    &nsdata)) {
			nvmf_send_generic_error(nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			return (true);
		}

		nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata));
		return (true);
	default:
		return (false);
	}
}
/*
 * Handle SET_FEATURES on the admin queue.  Only Number of Queues and
 * Async Event Configuration are supported.
 */
static void
handle_set_features(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	struct nvme_completion cqe;
	uint8_t fid;

	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
	switch (fid) {
	case NVME_FEAT_NUMBER_OF_QUEUES:
	{
		uint32_t num_queues;

		/* The queue count may only be set once per association. */
		if (ioc->num_io_queues != 0) {
			nvmf_send_generic_error(nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			return;
		}

		num_queues = le32toh(cmd->cdw11) & 0xffff;

		/* 5.12.1.7: 65535 is invalid. */
		if (num_queues == 65535)
			goto error;

		/* Fabrics requires the same number of SQs and CQs. */
		if (le32toh(cmd->cdw11) >> 16 != num_queues)
			goto error;

		/* Convert to 1's based */
		num_queues++;

		/* Lock to synchronize with handle_io_qpair. */
		pthread_mutex_lock(&io_na_mutex);
		ioc->num_io_queues = num_queues;
		/* NOTE(review): these calloc() results are unchecked. */
		ioc->io_qpairs = calloc(num_queues, sizeof(*ioc->io_qpairs));
		ioc->io_sockets = calloc(num_queues, sizeof(*ioc->io_sockets));
		pthread_mutex_unlock(&io_na_mutex);

		/* CDW0 of the completion echoes the allocated counts. */
		nvmf_init_cqe(&cqe, nc, 0);
		cqe.cdw0 = cmd->cdw11;
		nvmf_send_response(nc, &cqe);
		return;
	}
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
	{
		uint32_t aer_mask;

		aer_mask = le32toh(cmd->cdw11);

		/* Check for any reserved or unimplemented feature bits. */
		if ((aer_mask & 0xffffc000) != 0)
			goto error;

		/* No AERs are generated by this daemon. */
		nvmf_send_success(nc);
		return;
	}
	default:
		warnx("Unsupported feature ID %u for SET_FEATURES", fid);
		goto error;
	}

error:
	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
/*
 * Dispatch a single admin command capsule for the I/O controller.
 * Returns true if the opcode was handled, false for unsupported
 * opcodes so the generic controller loop can fail them.
 */
static bool
admin_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd,
    void *arg)
{
	struct io_controller *ioc = arg;
	bool handled = true;

	switch (cmd->opc) {
	case NVME_OPC_GET_LOG_PAGE:
		handle_get_log_page(ioc, nc, cmd);
		break;
	case NVME_OPC_IDENTIFY:
		handled = handle_io_identify_command(nc, cmd);
		break;
	case NVME_OPC_SET_FEATURES:
		handle_set_features(ioc, nc, cmd);
		break;
	case NVME_OPC_ASYNC_EVENT_REQUEST:
		/* Ignore and never complete. */
		break;
	case NVME_OPC_KEEP_ALIVE:
		nvmf_send_success(nc);
		break;
	default:
		handled = false;
		break;
	}
	return (handled);
}
/*
 * Service loop for the admin queue of the active I/O controller.  When
 * the admin connection terminates, close any open I/O queue sockets,
 * wait for their threads to exit, and tear the controller down so a
 * new admin CONNECT can create a fresh one.
 */
static void
handle_admin_qpair(struct io_controller *ioc)
{
	pthread_setname_np(pthread_self(), "admin queue");

	/* Blocks until the admin queue pair disconnects. */
	controller_handle_admin_commands(ioc->c, admin_command, ioc);

	pthread_mutex_lock(&io_na_mutex);
	for (u_int i = 0; i < ioc->num_io_queues; i++) {
		/* Skip queues that never connected or are already closed. */
		if (ioc->io_qpairs[i] == NULL || ioc->io_sockets[i] == -1)
			continue;
		close(ioc->io_sockets[i]);
		ioc->io_sockets[i] = -1;
	}

	/* Wait for I/O threads to notice. */
	while (ioc->active_io_queues > 0)
		pthread_cond_wait(&io_cond, &io_na_mutex);
	/* Clearing the global allows a new I/O controller association. */
	io_controller = NULL;
	pthread_mutex_unlock(&io_na_mutex);

	free_controller(ioc->c);

	free(ioc);
}
/*
 * Handle a Fabrics command received on an established I/O queue.
 * Returns true only for a well-formed DISCONNECT, signalling the
 * receive loop to stop.
 */
static bool
handle_io_fabrics_command(const struct nvmf_capsule *nc,
    const struct nvmf_fabric_cmd *fc)
{
	const struct nvmf_fabric_disconnect_cmd *dis;

	if (fc->fctype == NVMF_FABRIC_COMMAND_CONNECT) {
		/* A second CONNECT on a connected queue is a sequence error. */
		warnx("CONNECT command on connected queue");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		return (false);
	}

	if (fc->fctype == NVMF_FABRIC_COMMAND_DISCONNECT) {
		dis = (const struct nvmf_fabric_disconnect_cmd *)fc;
		/* Only record format 0 is defined. */
		if (dis->recfmt != htole16(0)) {
			nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
			    NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT);
			return (false);
		}
		nvmf_send_success(nc);
		return (true);
	}

	warnx("Unsupported fabrics command %#x", fc->fctype);
	nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
	return (false);
}
/*
 * Add "addend" to a 128-bit little-endian counter in the SMART/health
 * information page, stored as two 64-bit halves, carrying into the
 * high half on overflow of the low half.
 */
static void
hip_add(uint64_t pair[2], uint64_t addend)
{
	uint64_t old, new;

	old = le64toh(pair[0]);
	new = old + addend;
	pair[0] = htole64(new);

	/*
	 * Carry into the high half.  The high half is stored
	 * little-endian too, so convert to host order around the
	 * increment; adding htole64(1) directly would add the wrong
	 * value on big-endian hosts.
	 */
	if (new < old)
		pair[1] = htole64(le64toh(pair[1]) + 1);
}
/* Extract the 64-bit starting LBA from CDW10 (low) and CDW11 (high). */
static uint64_t
cmd_lba(const struct nvme_command *cmd)
{
	uint64_t lba;

	lba = le32toh(cmd->cdw11);
	lba <<= 32;
	lba |= le32toh(cmd->cdw10);
	return (lba);
}
static u_int
cmd_nlb(const struct nvme_command *cmd)
{
return ((le32toh(cmd->cdw12) & 0xffff) + 1);
}
/*
 * Dispatch an NVM Read command to the backing device and account for
 * it in the SMART/health information page.
 */
static void
handle_read(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	size_t len;

	/* Snapshot the transfer length before the capsule is consumed. */
	len = nvmf_capsule_data_len(nc);
	device_read(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc);
	hip_add(ioc->hip.host_read_commands, 1);

	/*
	 * Data units are counted in thousands of 512-byte units, with
	 * the sub-thousand residue carried forward in partial_dur.
	 * Use >= so an accumulated count of exactly 1000 units is not
	 * silently dropped.
	 */
	len /= 512;
	len += ioc->partial_dur;
	if (len >= 1000)
		hip_add(ioc->hip.data_units_read, len / 1000);
	ioc->partial_dur = len % 1000;
}
/*
 * Dispatch an NVM Write command to the backing device and account for
 * it in the SMART/health information page.
 */
static void
handle_write(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	size_t len;

	/* Snapshot the transfer length before the capsule is consumed. */
	len = nvmf_capsule_data_len(nc);
	device_write(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc);
	hip_add(ioc->hip.host_write_commands, 1);

	/*
	 * Data units are counted in thousands of 512-byte units, with
	 * the sub-thousand residue carried forward in partial_duw.
	 * Use >= so an accumulated count of exactly 1000 units is not
	 * silently dropped.
	 */
	len /= 512;
	len += ioc->partial_duw;
	if (len >= 1000)
		hip_add(ioc->hip.data_units_written, len / 1000);
	ioc->partial_duw = len % 1000;
}
/* Dispatch an NVM Flush command to the backing device. */
static void
handle_flush(const struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	uint32_t nsid = le32toh(cmd->nsid);

	device_flush(nsid, nc);
}
/*
 * Receive loop for one I/O queue pair.  Pulls command capsules off the
 * queue and dispatches them until the host sends DISCONNECT or the
 * connection drops.  Returns true for a clean DISCONNECT, false for a
 * connection failure or close.
 */
static bool
handle_io_commands(struct io_controller *ioc, struct nvmf_qpair *qp)
{
	const struct nvme_command *cmd;
	struct nvmf_capsule *nc;
	int error;
	bool disconnect;

	disconnect = false;
	while (!disconnect) {
		error = nvmf_controller_receive_capsule(qp, &nc);
		if (error != 0) {
			/* ECONNRESET is the expected abrupt-close case. */
			if (error != ECONNRESET)
				warnc(error, "Failed to read command capsule");
			break;
		}

		cmd = nvmf_capsule_sqe(nc);
		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			/* The broadcast NSID is rejected for FLUSH. */
			if (cmd->nsid == htole32(0xffffffff)) {
				nvmf_send_generic_error(nc,
				    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
				break;
			}
			handle_flush(nc, cmd);
			break;
		case NVME_OPC_WRITE:
			handle_write(ioc, nc, cmd);
			break;
		case NVME_OPC_READ:
			handle_read(ioc, nc, cmd);
			break;
		case NVME_OPC_FABRICS_COMMANDS:
			disconnect = handle_io_fabrics_command(nc,
			    (const struct nvmf_fabric_cmd *)cmd);
			break;
		default:
			warnx("Unsupported NVM opcode %#x", cmd->opc);
			nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
			break;
		}
		/* Handlers have sent their responses; release the capsule. */
		nvmf_free_capsule(nc);
	}
	return (disconnect);
}
/*
 * Thread body for one I/O queue pair: run the receive loop, then tear
 * down this queue's slot in the controller and wake the admin thread
 * if this was the last active I/O queue.
 */
static void
handle_io_qpair(struct io_controller *ioc, struct nvmf_qpair *qp, int qid)
{
	char name[64];
	bool disconnect;

	snprintf(name, sizeof(name), "I/O queue %d", qid);
	pthread_setname_np(pthread_self(), name);

	disconnect = handle_io_commands(ioc, qp);

	pthread_mutex_lock(&io_na_mutex);
	/* On a clean DISCONNECT the slot can be reused by a new CONNECT. */
	if (disconnect)
		ioc->io_qpairs[qid - 1] = NULL;
	/* The admin thread may have already closed this socket. */
	if (ioc->io_sockets[qid - 1] != -1) {
		close(ioc->io_sockets[qid - 1]);
		ioc->io_sockets[qid - 1] = -1;
	}
	ioc->active_io_queues--;
	/* Wake handle_admin_qpair waiting for all I/O queues to drain. */
	if (ioc->active_io_queues == 0)
		pthread_cond_broadcast(&io_cond);
	pthread_mutex_unlock(&io_na_mutex);
}
/*
 * Handle a CONNECT for an admin queue: create the single I/O
 * controller association and run its admin loop.  Only one I/O
 * controller may be active at a time; a second admin CONNECT is
 * rejected with CONTROLLER_BUSY.  Returns only after the association
 * is torn down.  The capsule and socket are consumed on all paths.
 */
static void
connect_admin_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvme_controller_data cdata;
	struct io_controller *ioc;
	int error;

	/* Can only have one active I/O controller at a time. */
	pthread_mutex_lock(&io_na_mutex);
	if (io_controller != NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_CONTROLLER_BUSY);
		goto error;
	}

	/* Completes the CONNECT, assigning controller ID 2. */
	error = nvmf_finish_accept(nc, 2);
	if (error != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnc(error, "Failed to send CONNECT response");
		goto error;
	}

	/* NOTE(review): calloc result unchecked; OOM here would crash. */
	ioc = calloc(1, sizeof(*ioc));
	ioc->cntlid = 2;
	/* Remember the host identity to validate later I/O CONNECTs. */
	memcpy(ioc->hostid, data->hostid, sizeof(ioc->hostid));
	memcpy(ioc->hostnqn, data->hostnqn, sizeof(ioc->hostnqn));

	nvmf_init_io_controller_data(qp, serial, nqn, device_count(),
	    NVMF_IOCCSZ, &cdata);

	/* Seed the firmware-slot and health log pages. */
	ioc->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
	memcpy(ioc->fp.revision[0], cdata.fr, sizeof(cdata.fr));

	ioc->hip.power_cycles[0] = 1;

	ioc->c = init_controller(qp, &cdata);

	io_controller = ioc;
	pthread_mutex_unlock(&io_na_mutex);

	nvmf_free_capsule(nc);
	/* Runs until the admin queue disconnects and frees ioc. */
	handle_admin_qpair(ioc);
	close(s);
	return;

error:
	nvmf_free_capsule(nc);
	close(s);
}
/*
 * Handle a CONNECT for an I/O queue: validate it against the active
 * controller association, claim the queue slot, and run the queue's
 * receive loop.  Returns only after the queue shuts down.  The capsule
 * is consumed on all paths; the socket is closed on failure and
 * ownership passes to the controller on success.
 */
static void
connect_io_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc,
    const struct nvmf_fabric_connect_data *data, uint16_t qid)
{
	struct io_controller *ioc;
	int error;

	pthread_mutex_lock(&io_na_mutex);
	if (io_controller == NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create I/O qpair without admin qpair");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}

	if (memcmp(io_controller->hostid, data->hostid,
	    sizeof(data->hostid)) != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("hostid mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, hostid));
		goto error;
	}

	if (le16toh(data->cntlid) != io_controller->cntlid) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("cntlid mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		goto error;
	}

	/*
	 * Compare the full host NQN.  (Previously this used
	 * sizeof(data->hostid), which only compared the first 16 bytes
	 * of the 256-byte NQN field.)
	 */
	if (memcmp(io_controller->hostnqn, data->hostnqn,
	    sizeof(data->hostnqn)) != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("host NQN mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
		goto error;
	}

	/* SET_FEATURES(Number of Queues) must come before I/O CONNECTs. */
	if (io_controller->num_io_queues == 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create I/O qpair without enabled queues");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}

	if (qid > io_controller->num_io_queues) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create invalid I/O qpair %u", qid);
		nvmf_connect_invalid_parameters(nc, false,
		    offsetof(struct nvmf_fabric_connect_cmd, qid));
		goto error;
	}

	if (io_controller->io_qpairs[qid - 1] != NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to re-create I/O qpair %u", qid);
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}

	error = nvmf_finish_accept(nc, io_controller->cntlid);
	if (error != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnc(error, "Failed to send CONNECT response");
		goto error;
	}

	/* Claim the slot; the admin thread now owns socket shutdown. */
	ioc = io_controller;
	ioc->active_io_queues++;
	ioc->io_qpairs[qid - 1] = qp;
	ioc->io_sockets[qid - 1] = s;
	pthread_mutex_unlock(&io_na_mutex);

	nvmf_free_capsule(nc);
	handle_io_qpair(ioc, qp, qid);
	return;

error:
	nvmf_free_capsule(nc);
	close(s);
}
/*
 * Detached thread body for a newly accepted connection on the I/O
 * controller port.  Performs the transport-level accept, then either
 * hands the queue pair off to the kernel (-K) or connects it to the
 * userspace admin/I/O controller.  Cleans up the socket, capsule, and
 * queue pair on all paths.
 */
static void *
io_socket_thread(void *arg)
{
	struct nvmf_fabric_connect_data data;
	struct nvmf_qpair_params qparams;
	const struct nvmf_fabric_connect_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_qpair *qp;
	int s;

	pthread_detach(pthread_self());

	s = (intptr_t)arg;
	memset(&qparams, 0, sizeof(qparams));
	qparams.tcp.fd = s;

	nc = NULL;
	qp = nvmf_accept(io_na, &qparams, &nc, &data);
	if (qp == NULL) {
		warnx("Failed to create I/O qpair: %s",
		    nvmf_association_error(io_na));
		goto error;
	}

	if (kernel_io) {
		/* Offload this queue pair to the CTL NVMF frontend. */
		ctl_handoff_qpair(qp, nvmf_capsule_sqe(nc), &data);
		goto error;
	}

	if (strcmp(data.subnqn, nqn) != 0) {
		/* warnx, not warn: errno carries no information here. */
		warnx("I/O qpair with invalid SubNQN: %.*s",
		    (int)sizeof(data.subnqn), data.subnqn);
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, subnqn));
		goto error;
	}

	/* Is this an admin or I/O queue pair? */
	cmd = nvmf_capsule_sqe(nc);
	if (cmd->qid == 0)
		connect_admin_qpair(s, qp, nc, &data);
	else
		connect_io_qpair(s, qp, nc, &data, le16toh(cmd->qid));
	nvmf_free_qpair(qp);
	return (NULL);

error:
	if (nc != NULL)
		nvmf_free_capsule(nc);
	if (qp != NULL)
		nvmf_free_qpair(qp);
	close(s);
	return (NULL);
}
/*
 * Spawn a detached worker thread to service an accepted connection on
 * the I/O controller port.  If the thread cannot be created, the
 * socket is closed and the connection dropped.
 */
void
handle_io_socket(int s)
{
	pthread_t worker;
	int rv;

	rv = pthread_create(&worker, NULL, io_socket_thread,
	    (void *)(uintptr_t)s);
	if (rv == 0)
		return;
	warnc(rv, "Failed to create I/O qpair thread");
	close(s);
}

126
usr.sbin/nvmfd/nvmfd.8 Normal file
View File

@ -0,0 +1,126 @@
.\"
.\" SPDX-License-Identifier: BSD-2-Clause
.\"
.\" Copyright (c) 2024 Chelsio Communications, Inc.
.\"
.Dd May 2, 2024
.Dt NVMFD 8
.Os
.Sh NAME
.Nm nvmfd
.Nd "NVMeoF controller daemon"
.Sh SYNOPSIS
.Nm
.Fl K
.Op Fl dFGg
.Op Fl P Ar port
.Op Fl p Ar port
.Op Fl t Ar transport
.Op Fl n Ar subnqn
.Nm
.Op Fl dFGg
.Op Fl P Ar port
.Op Fl p Ar port
.Op Fl t Ar transport
.Op Fl n Ar subnqn
.Ar device
.Op Ar device ...
.Sh DESCRIPTION
.Nm
accepts incoming NVMeoF connections for both I/O and discovery controllers.
.Nm
can either implement a single dynamic I/O controller in user mode or hand
off incoming I/O controller connections to
.Xr nvmft 4 .
A dynamic discovery controller service is always provided in user mode.
.Pp
The following options are available:
.Bl -tag -width "-t transport"
.It Fl F
Permit remote hosts to disable SQ flow control.
.It Fl G
Permit remote hosts to enable PDU data digests for the TCP transport.
.It Fl g
Permit remote hosts to enable PDU header digests for the TCP transport.
.It Fl K
Enable kernel mode which hands off incoming I/O controller connections to
.Xr nvmft 4 .
.It Fl P Ar port
Use
.Ar port
as the listen TCP port for the discovery controller service.
The default value is 8009.
.It Fl d
Enable debug mode.
The daemon sends any errors to standard output and does not place
itself in the background.
.It Fl p Ar port
Use
.Ar port
as the listen TCP port for the I/O controller service.
By default an unused ephemeral port will be chosen.
.It Fl n Ar subnqn
The Subsystem NVMe Qualified Name for the I/O controller.
If an explicit NQN is not given, a default value is generated from the
current host's UUID obtained from the
.Vt kern.hostuuid
sysctl.
.It Fl t Ar transport
The transport type to use.
The default transport is
.Dq tcp .
.It Ar device
When implementing a user mode I/O controller,
one or more
.Ar device
arguments must be specified.
Each
.Ar device
describes the backing store for a namespace exported to remote hosts.
Devices can be specified using one of the following syntaxes:
.Bl -tag -width "ramdisk:size"
.It Pa pathname
File or disk device
.It ramdisk : Ns Ar size
Allocate a memory disk with the given
.Ar size .
.Ar size
may use any of the suffixes supported by
.Xr expand_number 3 .
.El
.El
.Sh FILES
.Bl -tag -width "/var/run/nvmfd.pid" -compact
.It Pa /var/run/nvmfd.pid
The default location of the
.Nm
PID file.
.El
.Sh EXIT STATUS
.Ex -std
.Sh SEE ALSO
.Xr ctl 4 ,
.Xr nvmft 4 ,
.Xr ctladm 8 ,
.Xr ctld 8
.Sh HISTORY
The
.Nm
daemon first appeared in
.Fx 15.0 .
.Sh AUTHORS
The
.Nm
daemon was developed by
.An John Baldwin Aq Mt jhb@FreeBSD.org
under sponsorship from Chelsio Communications, Inc.
.Sh BUGS
The discovery controller and kernel mode functionality of
.Nm
should be merged into
.Xr ctld 8 .
.Pp
Additional parameters such as
.Va MAXR2T ,
.Va MAXH2CDATA ,
and queue sizes should be configurable.

260
usr.sbin/nvmfd/nvmfd.c Normal file
View File

@ -0,0 +1,260 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/event.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <libnvmf.h>
#include <libutil.h>
#include <netdb.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "internal.h"
/* Transport feature flags set from command-line options in main(). */
bool data_digests = false;		/* -G: permit PDU data digests (TCP). */
bool header_digests = false;		/* -g: permit PDU header digests (TCP). */
bool flow_control_disable = false;	/* -F: permit disabling SQ flow control. */
/* -K: hand I/O controller queue pairs off to the in-kernel nvmft. */
bool kernel_io = false;

/* Subsystem NQN for the I/O controller (-n, or derived from host UUID). */
static const char *subnqn;
/* Set from signal handlers to stop the accept loop in handle_connections. */
static volatile bool quit = false;
/*
 * Print a usage summary to standard error and exit with failure.
 * The option letters match the "dFgGKn:P:p:t:" getopt string in main.
 */
static void
usage(void)
{
	fprintf(stderr, "nvmfd -K [-dFGg] [-P port] [-p port] [-t transport] [-n subnqn]\n"
	    "nvmfd [-dFGg] [-P port] [-p port] [-t transport] [-n subnqn]\n"
	    "\tdevice [device [...]]\n"
	    "\n"
	    "Devices use one of the following syntaxes:\n"
	    "\tpathname - file or disk device\n"
	    "\tramdisk:size - memory disk of given size\n");
	exit(1);
}
/*
 * Handler for SIGHUP/SIGINT/SIGQUIT/SIGTERM: request that the accept
 * loop in handle_connections exit cleanly.
 * NOTE(review): "quit" is a volatile bool; volatile sig_atomic_t is
 * the strictly portable type for a flag set from a signal handler.
 */
static void
handle_sig(int sig __unused)
{
	quit = true;
}
/*
 * Put a bound socket into the listening state and register it for read
 * events on the kqueue.  udata tags the socket so handle_connections
 * can distinguish discovery (1) from I/O (2) listeners.
 */
static void
register_listen_socket(int kqfd, int s, void *udata)
{
	struct kevent kev;

	/* A negative backlog requests the system maximum on FreeBSD. */
	if (listen(s, -1) != 0)
		err(1, "listen");

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, udata);
	if (kevent(kqfd, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: failed to add listen socket");
}
/*
 * Create listening TCP sockets on the given port for every address
 * family returned by getaddrinfo (AI_PASSIVE with a NULL host binds
 * the wildcard address) and register them with the kqueue.  Exits if
 * no socket could be created.  For the I/O port, each listener is also
 * advertised via the discovery controller.
 */
static void
create_passive_sockets(int kqfd, const char *port, bool discovery)
{
	struct addrinfo hints, *ai, *list;
	bool created;
	int error, s;

	memset(&hints, 0, sizeof(hints));
	hints.ai_flags = AI_PASSIVE;
	hints.ai_family = AF_UNSPEC;
	/* IPPROTO_TCP restricts results to TCP/stream sockets. */
	hints.ai_protocol = IPPROTO_TCP;
	error = getaddrinfo(NULL, port, &hints, &list);
	if (error != 0)
		errx(1, "%s", gai_strerror(error));

	created = false;
	for (ai = list; ai != NULL; ai = ai->ai_next) {
		s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
		if (s == -1)
			continue;

		/* Skip families we cannot bind rather than failing. */
		if (bind(s, ai->ai_addr, ai->ai_addrlen) != 0) {
			close(s);
			continue;
		}

		if (discovery) {
			register_listen_socket(kqfd, s, (void *)1);
		} else {
			register_listen_socket(kqfd, s, (void *)2);
			/* Publish this listener in the discovery log. */
			discovery_add_io_controller(s, subnqn);
		}
		created = true;
	}
	freeaddrinfo(list);
	if (!created)
		err(1, "Failed to create any listen sockets");
}
/*
 * Main accept loop: install shutdown signal handlers, then block on
 * the kqueue and hand each accepted connection to the discovery or
 * I/O handler according to the udata tag registered with its listen
 * socket.  Runs until a termination signal sets "quit".
 */
static void
handle_connections(int kqfd)
{
	struct kevent ev;
	int fd;

	signal(SIGHUP, handle_sig);
	signal(SIGINT, handle_sig);
	signal(SIGQUIT, handle_sig);
	signal(SIGTERM, handle_sig);

	while (!quit) {
		if (kevent(kqfd, NULL, 0, &ev, 1, NULL) == -1) {
			/* Interrupted by a signal: re-check quit. */
			if (errno == EINTR)
				continue;
			err(1, "kevent");
		}

		assert(ev.filter == EVFILT_READ);
		fd = accept(ev.ident, NULL, NULL);
		if (fd == -1) {
			warn("accept");
			continue;
		}

		if ((uintptr_t)ev.udata == 1)
			handle_discovery_socket(fd);
		else if ((uintptr_t)ev.udata == 2)
			handle_io_socket(fd);
		else
			__builtin_unreachable();
	}
}
/*
 * Parse options, set up the discovery and I/O controller services,
 * optionally daemonize with a pidfile, and run the accept loop until
 * a termination signal arrives.
 */
int
main(int ac, char **av)
{
	struct pidfh *pfh;
	const char *dport, *ioport, *transport;
	pid_t pid;
	int ch, error, kqfd;
	bool daemonize;
	/* Static: must outlive main's scope references via subnqn. */
	static char nqn[NVMF_NQN_MAX_LEN];

	/* 7.4.9.3 Default port for discovery */
	dport = "8009";

	pfh = NULL;
	daemonize = true;
	ioport = "0";		/* Ephemeral port unless -p is given. */
	subnqn = NULL;
	transport = "tcp";
	while ((ch = getopt(ac, av, "dFgGKn:P:p:t:")) != -1) {
		switch (ch) {
		case 'd':
			daemonize = false;
			break;
		case 'F':
			flow_control_disable = true;
			break;
		case 'G':
			data_digests = true;
			break;
		case 'g':
			header_digests = true;
			break;
		case 'K':
			kernel_io = true;
			break;
		case 'n':
			subnqn = optarg;
			break;
		case 'P':
			dport = optarg;
			break;
		case 'p':
			ioport = optarg;
			break;
		case 't':
			transport = optarg;
			break;
		default:
			usage();
		}
	}

	av += optind;
	ac -= optind;

	if (kernel_io) {
		/* -K takes no device arguments; namespaces come from CTL. */
		if (ac > 0)
			usage();
		/* Best effort: nvmft may already be compiled in. */
		if (modfind("nvmft") == -1 && kldload("nvmft") == -1)
			warn("couldn't load nvmft");
	} else {
		/* Userspace mode requires at least one backing device. */
		if (ac < 1)
			usage();
	}

	/* Only TCP is implemented; the empty branch accepts it. */
	if (strcasecmp(transport, "tcp") == 0) {
	} else
		errx(1, "Invalid transport %s", transport);

	if (subnqn == NULL) {
		/* Derive a stable default NQN from the host UUID. */
		error = nvmf_nqn_from_hostuuid(nqn);
		if (error != 0)
			errc(1, error, "Failed to generate NQN");
		subnqn = nqn;
	}

	if (!kernel_io)
		register_devices(ac, av);
	init_discovery();
	init_io(subnqn);

	if (daemonize) {
		pfh = pidfile_open(NULL, 0600, &pid);
		if (pfh == NULL) {
			if (errno == EEXIST)
				errx(1, "Daemon already running, pid: %jd",
				    (intmax_t)pid);
			/* Non-fatal: run without a pidfile. */
			warn("Cannot open or create pidfile");
		}

		if (daemon(0, 0) != 0) {
			pidfile_remove(pfh);
			err(1, "Failed to fork into the background");
		}

		/* Write the post-fork PID into the pidfile. */
		pidfile_write(pfh);
	}

	kqfd = kqueue();
	if (kqfd == -1) {
		pidfile_remove(pfh);
		err(1, "kqueue");
	}

	create_passive_sockets(kqfd, dport, true);
	create_passive_sockets(kqfd, ioport, false);
	/* Blocks until a termination signal is received. */
	handle_connections(kqfd);
	shutdown_io();
	if (pfh != NULL)
		pidfile_remove(pfh);
	return (0);
}