From 0c4ee619dff8f373268f030cd6e73b949e64d4ec Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 2 May 2024 16:32:09 -0700 Subject: [PATCH] ctl: Support for NVMe commands - Add support for queueing and executing NVMe admin and NVM commands via ctl_run and ctl_queue. This requires fixing a few places that were SCSI-specific to add NVME logic. - NVMe has much simpler command ordering requirements than SCSI. In particular, the HBA is not required to enforce any specific ordering for requests with overlapping LBAs. The host is required to manage that ordering. However, fused commands (currently only COMPARE and WRITE NVM commands can be fused) are required to be executed atomically. To support fused commands, make the second half of a fused command block on the first half, and have commands submitted after a fused command pair block on the second half. - Add handlers and command tables for admin and NVM commands that operate on individual namespaces and will be passed down from an NVMe over Fabrics controller to a CTL LUN. Reviewed by: ken, imp Sponsored by: Chelsio Communications Differential Revision: https://reviews.freebsd.org/D44720 --- sys/cam/ctl/ctl.c | 861 ++++++++++++++++++++++++++++++- sys/cam/ctl/ctl_nvme_cmd_table.c | 35 ++ sys/cam/ctl/ctl_private.h | 20 +- sys/conf/files | 1 + sys/modules/ctl/Makefile | 1 + 5 files changed, 908 insertions(+), 10 deletions(-) create mode 100644 sys/cam/ctl/ctl_nvme_cmd_table.c diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c index 59e8cba1b671..039670c17529 100644 --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include @@ -447,6 +448,8 @@ static int ctl_scsiio_lun_check(struct ctl_lun *lun, static void ctl_failover_lun(union ctl_io *io); static void ctl_scsiio_precheck(struct ctl_scsiio *ctsio); static int ctl_scsiio(struct ctl_scsiio *ctsio); +static void ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio); +static int ctl_nvmeio(struct ctl_nvmeio *ctnio); static int ctl_target_reset(union ctl_io *io); static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx, @@ -4963,7 +4966,6 @@ ctl_config_move_done(union ctl_io *io, bool samethr) int retval; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); - CTL_IO_ASSERT(io, SCSI); if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); @@ -4998,7 +5000,17 @@ ctl_config_move_done(union ctl_io *io, bool samethr) * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. */ - retval = ctl_scsiio(&io->scsiio); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + retval = ctl_scsiio(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + retval = ctl_nvmeio(&io->nvmeio); + break; + default: + __assert_unreachable(); + } } return (retval); } @@ -10598,6 +10610,725 @@ ctl_read_toc(struct ctl_scsiio *ctsio) return (CTL_RETVAL_COMPLETE); } +/* + * For NVMe commands, parse the LBA and length. + */ +static bool +ctl_nvme_get_lba_len(struct ctl_nvmeio *ctnio, uint64_t *lba, uint32_t *len) +{ + CTL_IO_ASSERT(ctnio, NVME); + + switch (ctnio->cmd.opc) { + case NVME_OPC_WRITE: + case NVME_OPC_READ: + case NVME_OPC_WRITE_UNCORRECTABLE: + case NVME_OPC_COMPARE: + case NVME_OPC_WRITE_ZEROES: + case NVME_OPC_VERIFY: + *lba = (uint64_t)le32toh(ctnio->cmd.cdw11) << 32 | + le32toh(ctnio->cmd.cdw10); + *len = (le32toh(ctnio->cmd.cdw12) & 0xffff) + 1; + return (true); + default: + *lba = 0; + *len = 0; + return (false); + } +} + +static bool +ctl_nvme_fua(struct ctl_nvmeio *ctnio) +{ + return ((le32toh(ctnio->cmd.cdw12) & (1U << 30)) != 0); +} + +int +ctl_nvme_identify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + size_t len; + int retval; + uint8_t cns; + + CTL_DEBUG_PRINT(("ctl_nvme_identify\n")); + + CTL_IO_ASSERT(ctnio, NVME_ADMIN); + MPASS(ctnio->cmd.opc == NVME_OPC_IDENTIFY); + + /* + * The data buffer for Identify is always 4096 bytes, see + * 5.51.1 in NVMe base specification 1.4. + */ + len = 4096; + + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + + ctl_nvme_set_success(ctnio); + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + + /* + * If we don't have a LUN, return an empty result for CNS == 0. + */ + if (lun == NULL) { + cns = le32toh(ctnio->cmd.cdw10) & 0xff; + switch (cns) { + case 0: + memset(ctnio->kern_data_ptr, 0, len); + ctl_datamove((union ctl_io *)ctnio); + break; + default: + ctl_nvme_set_invalid_field(ctnio); + break; + } + return (CTL_RETVAL_COMPLETE); + } + + retval = lun->backend->config_read((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_flush(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_flush\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_FLUSH); + + /* + * NVMe flushes always flush the entire namespace, not an LBA + * range. + */ + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_read_write(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags, retval; + bool isread; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: command: %#x\n", + ctnio->cmd.opc)); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE || + ctnio->cmd.opc == NVME_OPC_READ); + + flags = 0; + isread = ctnio->cmd.opc == NVME_OPC_READ; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * Set FUA and/or DPO if caches are disabled. + * + * For a read this may not be quite correct for the block + * backend as any earlier writes to the LBA range should be + * flushed to backing store as part of the read. + */ + if (ctl_nvme_fua(ctnio)) { + flags |= CTL_LLF_FUA; + if (isread) + flags |= CTL_LLF_DPO; + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags; + + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: calling data_submit()\n")); + + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_uncorrectable\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_UNCORRECTABLE); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_compare(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_compare\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_COMPARE); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_COMPARE | flags; + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_compare: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_zeroes\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_ZEROES); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct nvme_dsm_range *r; + uint64_t lba; + uint32_t len, num_blocks; + u_int i, ranges; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_DATASET_MANAGEMENT); + + ranges = le32toh(ctnio->cmd.cdw10) & 0xff; + len = ranges * sizeof(struct nvme_dsm_range); + + /* + * If we've got a kernel request that hasn't been malloced yet, + * malloc it and tell the caller the data buffer is here. + */ + if ((ctnio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + ctl_datamove((union ctl_io *)ctnio); + + return (CTL_RETVAL_COMPLETE); + } + + /* + * Require a flat buffer of the correct size. + */ + if (ctnio->kern_sg_entries > 0 || + ctnio->kern_total_len - ctnio->kern_data_resid != len) + return (CTL_RETVAL_ERROR); + + /* + * Verify that none of the ranges are out of bounds. + */ + r = (struct nvme_dsm_range *)ctnio->kern_data_ptr; + for (i = 0; i < ranges; i++) { + lba = le64toh(r[i].starting_lba); + num_blocks = le32toh(r[i].length); + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + } + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management: calling config_write()\n")); + retval = lun->backend->config_write((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_verify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_verify\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_VERIFY); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_VERIFY | flags; + ctnio->kern_total_len = 0; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_verify: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_get_cmd_entry(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + switch (ctnio->io_hdr.io_type) { + case CTL_IO_NVME: + entry = &nvme_nvm_cmd_table[ctnio->cmd.opc]; + break; + case CTL_IO_NVME_ADMIN: + entry = &nvme_admin_cmd_table[ctnio->cmd.opc]; + break; + default: + __assert_unreachable(); + } + return (entry); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_validate_command(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + entry = ctl_nvme_get_cmd_entry(ctnio); + if (entry->execute == NULL) { + ctl_nvme_set_invalid_opcode(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + /* Validate fused commands. */ + switch (NVMEV(NVME_CMD_FUSE, ctnio->cmd.fuse)) { + case NVME_FUSE_NORMAL: + break; + case NVME_FUSE_FIRST: + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + case NVME_FUSE_SECOND: + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + default: + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + return (entry); +} + +/* + * This is a simpler version of ctl_scsiio_lun_check that fails + * requests on a LUN without active media. + * + * Returns true if the command has been completed with an error. + */ +static bool +ctl_nvmeio_lun_check(struct ctl_lun *lun, + const struct ctl_nvme_cmd_entry *entry, struct ctl_nvmeio *ctnio) +{ + mtx_assert(&lun->lun_lock, MA_OWNED); + + if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) == 0) { + if ((lun->flags & (CTL_LUN_EJECTED | CTL_LUN_NO_MEDIA | + CTL_LUN_STOPPED)) != 0) { + ctl_nvme_set_namespace_not_ready(ctnio); + return (true); + } + } + + return (false); +} + +/* + * Check for blockage against the OOA (Order Of Arrival) queue. + * Assumptions: + * - pending_io is generally either incoming, or on the blocked queue + * - starting I/O is the I/O we want to start the check with. + */ +static ctl_action +ctl_nvme_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, + union ctl_io **starting_io, union ctl_io **aborted_io) +{ + union ctl_io *ooa_io = *starting_io; + + CTL_IO_ASSERT(pending_io, NVME, NVME_ADMIN); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + *aborted_io = NULL; + + /* + * Aborted commands are not going to be executed and may even + * not report completion, so we don't care about their order. + * Let them complete ASAP to clean the OOA queue. + */ + if (__predict_false(pending_io->io_hdr.flags & CTL_FLAG_ABORT)) + return (CTL_ACTION_PASS); + + /* + * NVMe has rather simple command ordering requirements. In + * particular, there is no requirement on the controller to + * enforce a specific order for overlapping LBAs. The only + * constraint is that fused operations (Compare and Write), + * must be completed as a unit. + * + * To support fused operations, the following strategy is used: + * - the first half of a fused command is not enqueued to rtr + * until the second half is enqueued + * - the second half of a fused command blocks on the first + * half of a fuse command + * - subsequent commands block on the second half of the + * fused command + */ + + /* + * Is the previously submitted command the first half of a + * fused operation? + */ + if (ooa_io != NULL && + NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == NVME_FUSE_FIRST) { + /* + * If this is the second half, enqueue the first half + * and block the second half on the first half. + */ + if (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND) { + /* + * XXX: Do we need to wait for other rtr requests + * to drain so this is truly atomic? + */ + return (CTL_ACTION_FUSED); + } + + /* Abort the first half. */ + ctl_nvme_set_missing_fused_command(&ooa_io->nvmeio); + *aborted_io = ooa_io; + } else { + switch (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse)) { + case NVME_FUSE_FIRST: + /* First half, wait for the second half. */ + return (CTL_ACTION_SKIP); + case NVME_FUSE_SECOND: + /* Second half without a matching first half, abort. */ + ctl_nvme_set_missing_fused_command(&pending_io->nvmeio); + *aborted_io = pending_io; + return (CTL_ACTION_SKIP); + } + } + + /* + * Scan the OOA queue looking for the most recent second half + * of a fused op. + */ + for (; ooa_io != NULL; + ooa_io = (union ctl_io *)LIST_NEXT(&ooa_io->io_hdr, ooa_links)) { + if (NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND) { + *starting_io = ooa_io; + return (CTL_ACTION_BLOCK); + } + } + + *starting_io = NULL; + return (CTL_ACTION_PASS); +} + +static void +ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio) +{ + struct ctl_softc *softc = CTL_SOFTC(ctnio); + struct ctl_lun *lun; + const struct ctl_nvme_cmd_entry *entry; + union ctl_io *bio, *aborted_io; + uint32_t targ_lun; + + lun = NULL; + targ_lun = ctnio->io_hdr.nexus.targ_mapped_lun; + if (targ_lun < ctl_max_luns) + lun = softc->ctl_luns[targ_lun]; + if (lun != NULL) { + /* + * If the LUN is invalid, pretend that it doesn't exist. + * It will go away as soon as all pending I/O has been + * completed. + */ + mtx_lock(&lun->lun_lock); + if (lun->flags & CTL_LUN_DISABLED) { + mtx_unlock(&lun->lun_lock); + lun = NULL; + } + } + CTL_LUN(ctnio) = lun; + if (lun != NULL) { + CTL_BACKEND_LUN(ctnio) = lun->be_lun; + + /* + * Every I/O goes into the OOA queue for a particular LUN, + * and stays there until completion. + */ +#ifdef CTL_TIME_IO + if (LIST_EMPTY(&lun->ooa_queue)) + lun->idle_time += getsbinuptime() - lun->last_busy; +#endif + LIST_INSERT_HEAD(&lun->ooa_queue, &ctnio->io_hdr, ooa_links); + } + + /* Get command entry and return error if it is unsupported. */ + entry = ctl_nvme_validate_command(ctnio); + if (entry == NULL) { + if (lun) + mtx_unlock(&lun->lun_lock); + return; + } + + ctnio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; + ctnio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; + + /* All NVMe commands other than IDENTIFY require a LUN. */ + if (lun == NULL) { + if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) { + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr((union ctl_io *)ctnio); + return; + } + + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + CTL_DEBUG_PRINT(("ctl_nvmeio_precheck: bailing out due to invalid LUN\n")); + return; + } else { + /* + * NVMe namespaces can only be backed by T_DIRECT LUNs. + */ + if (lun->be_lun->lun_type != T_DIRECT) { + mtx_unlock(&lun->lun_lock); + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + return; + } + } + + if (ctl_nvmeio_lun_check(lun, entry, ctnio) != 0) { + mtx_unlock(&lun->lun_lock); + ctl_done((union ctl_io *)ctnio); + return; + } + + bio = (union ctl_io *)LIST_NEXT(&ctnio->io_hdr, ooa_links); + switch (ctl_nvme_check_ooa(lun, (union ctl_io *)ctnio, &bio, + &aborted_io)) { + case CTL_ACTION_PASS: + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr((union ctl_io *)ctnio); + break; + case CTL_ACTION_FUSED: + /* Block the second half on the first half. */ + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + + /* Pass the first half. */ + bio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr(bio); + break; + case CTL_ACTION_SKIP: + mtx_unlock(&lun->lun_lock); + break; + case CTL_ACTION_BLOCK: + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + mtx_unlock(&lun->lun_lock); + break; + default: + __assert_unreachable(); + } + if (aborted_io != NULL) + ctl_done(aborted_io); +} + +static int +ctl_nvmeio(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvmeio %s opc=%02X\n", + ctnio->io_hdr.io_type == CTL_IO_NVME ? "nvm" : "admin", + ctnio->cmd.opc)); + + entry = ctl_nvme_get_cmd_entry(ctnio); + MPASS(entry != NULL); + + /* + * If this I/O has been aborted, just send it straight to + * ctl_done() without executing it. + */ + if (ctnio->io_hdr.flags & CTL_FLAG_ABORT) { + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * All the checks should have been handled by ctl_nvmeio_precheck(). + * We should be clear now to just execute the I/O. + */ + retval = entry->execute(ctnio); + + return (retval); +} + /* * For known CDB types, parse the LBA and length. */ @@ -11016,7 +11747,7 @@ ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, * we know for sure that the blocker I/O does no longer count. */ static void -ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +ctl_scsi_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) { struct ctl_softc *softc = lun->ctl_softc; union ctl_io *bio, *obio; @@ -11111,6 +11842,72 @@ error: } } +static void +ctl_nvme_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + union ctl_io *bio; + const struct ctl_nvme_cmd_entry *entry; + + CTL_IO_ASSERT(io, NVME, NVME_ADMIN); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + if (io->io_hdr.blocker == NULL) + return; + + /* + * If this is the second half of a fused operation, it should + * be the only io on the blocked list. If the first half + * failed, complete the second half with an appropriate error. + */ + bio = io->io_hdr.blocker; + if (NVMEV(NVME_CMD_FUSE, io->nvmeio.cmd.fuse) == NVME_FUSE_SECOND) { + MPASS(io == + (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue)); + MPASS(TAILQ_NEXT(&io->io_hdr, blocked_links) == NULL); + + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + if (bio->io_hdr.status != CTL_SUCCESS) { + ctl_nvme_set_failed_fused_command(&io->nvmeio); + ctl_done(io); + return; + } + } else { + /* + * This must be a command that was blocked on the + * second half of a fused operation. + */ + MPASS(NVMEV(NVME_CMD_FUSE, bio->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND); + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + } + + entry = ctl_nvme_get_cmd_entry(&io->nvmeio); + if (ctl_nvmeio_lun_check(lun, entry, &io->nvmeio) != 0) { + ctl_done(io); + return; + } + + io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr(io); +} + +static void +ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + return (ctl_scsi_try_unblock_io(lun, io, skip)); + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + return (ctl_nvme_try_unblock_io(lun, io, skip)); + default: + __assert_unreachable(); + } +} + /* * Try to unblock I/Os blocked by the specified I/O. * @@ -11824,6 +12621,7 @@ ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id, */ LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) { union ctl_io *xio = (union ctl_io *)xioh; + if ((targ_port == UINT32_MAX || targ_port == xioh->nexus.targ_port) && (init_id == UINT32_MAX || @@ -13196,7 +13994,22 @@ ctl_queue(union ctl_io *io) { struct ctl_port *port = CTL_PORT(io); - CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + case CTL_IO_TASK: + CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + break; + case CTL_IO_NVME: + CTL_DEBUG_PRINT(("ctl_queue nvme nvm cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + case CTL_IO_NVME_ADMIN: + CTL_DEBUG_PRINT(("ctl_queue nvme admin cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + default: + break; + } #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; @@ -13210,6 +14023,8 @@ ctl_queue(union ctl_io *io) switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: if (ctl_debug & CTL_DEBUG_CDB) ctl_io_print(io); ctl_enqueue_incoming(io); @@ -13249,6 +14064,12 @@ ctl_run(union ctl_io *io) ctl_io_print(io); ctl_run_task(io); break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + if (ctl_debug & CTL_DEBUG_CDB) + ctl_io_print(io); + ctl_nvmeio_precheck(&io->nvmeio); + break; default: printf("ctl_run: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); @@ -13418,19 +14239,41 @@ ctl_work_thread(void *arg) if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->incoming_queue, links); mtx_unlock(&thr->queue_lock); - if (io->io_hdr.io_type == CTL_IO_TASK) + switch (io->io_hdr.io_type) { + case CTL_IO_TASK: ctl_run_task(io); - else + break; + case CTL_IO_SCSI: ctl_scsiio_precheck(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + ctl_nvmeio_precheck(&io->nvmeio); + break; + default: + __assert_unreachable(); + } continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->rtr_queue, links); mtx_unlock(&thr->queue_lock); - retval = ctl_scsiio(&io->scsiio); - if (retval != CTL_RETVAL_COMPLETE) - CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + retval = ctl_scsiio(&io->scsiio); + if (retval != CTL_RETVAL_COMPLETE) + CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + retval = ctl_nvmeio(&io->nvmeio); + if (retval != CTL_RETVAL_COMPLETE) + CTL_DEBUG_PRINT(("ctl_nvmeio failed\n")); + break; + default: + __assert_unreachable(); + } continue; } diff --git a/sys/cam/ctl/ctl_nvme_cmd_table.c b/sys/cam/ctl/ctl_nvme_cmd_table.c new file mode 100644 index 000000000000..39e60de62b7f --- /dev/null +++ b/sys/cam/ctl/ctl_nvme_cmd_table.c @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + */ + +#include + +#include +#include +#include +#include +#include + +/* Administrative Command Set (CTL_IO_NVME_ADMIN). */ +const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256] = +{ + [NVME_OPC_IDENTIFY] = { ctl_nvme_identify, CTL_FLAG_DATA_IN | + CTL_CMD_FLAG_OK_ON_NO_LUN }, +}; + +/* NVM Command Set (CTL_IO_NVME). */ +const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256] = +{ + [NVME_OPC_FLUSH] = { ctl_nvme_flush, CTL_FLAG_DATA_NONE }, + [NVME_OPC_WRITE] = { ctl_nvme_read_write, CTL_FLAG_DATA_OUT }, + [NVME_OPC_READ] = { ctl_nvme_read_write, CTL_FLAG_DATA_IN }, + [NVME_OPC_WRITE_UNCORRECTABLE] = { ctl_nvme_write_uncorrectable, + CTL_FLAG_DATA_NONE }, + [NVME_OPC_COMPARE] = { ctl_nvme_compare, CTL_FLAG_DATA_OUT }, + [NVME_OPC_WRITE_ZEROES] = { ctl_nvme_write_zeroes, CTL_FLAG_DATA_NONE }, + [NVME_OPC_DATASET_MANAGEMENT] = { ctl_nvme_dataset_management, + CTL_FLAG_DATA_OUT }, + [NVME_OPC_VERIFY] = { ctl_nvme_verify, CTL_FLAG_DATA_NONE }, +}; diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h index 9dfe979bcb7f..cd7e499c60a6 100644 --- a/sys/cam/ctl/ctl_private.h +++ b/sys/cam/ctl/ctl_private.h @@ -78,7 +78,8 @@ typedef enum { CTL_ACTION_SKIP, CTL_ACTION_BLOCK, CTL_ACTION_OVERLAP, - CTL_ACTION_OVERLAP_TAG + CTL_ACTION_OVERLAP_TAG, + CTL_ACTION_FUSED, } ctl_action; /* @@ -139,6 +140,12 @@ struct ctl_cmd_entry { * after the opcode byte. */ }; +/* Only data flags are currently used for NVMe commands. */ +struct ctl_nvme_cmd_entry { + int (*execute)(struct ctl_nvmeio *); + ctl_io_flags flags; +}; + typedef enum { CTL_LUN_NONE = 0x000, CTL_LUN_CONTROL = 0x001, @@ -412,6 +419,8 @@ struct ctl_softc { #ifdef _KERNEL extern const struct ctl_cmd_entry ctl_cmd_table[256]; +extern const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256]; +extern const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256]; uint32_t ctl_get_initindex(struct ctl_nexus *nexus); int ctl_lun_map_init(struct ctl_port *port); @@ -459,6 +468,15 @@ int ctl_report_supported_tmf(struct ctl_scsiio *ctsio); int ctl_report_timestamp(struct ctl_scsiio *ctsio); int ctl_get_lba_status(struct ctl_scsiio *ctsio); +int ctl_nvme_identify(struct ctl_nvmeio *ctnio); +int ctl_nvme_flush(struct ctl_nvmeio *ctnio); +int ctl_nvme_read_write(struct ctl_nvmeio *ctnio); +int ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio); +int ctl_nvme_compare(struct ctl_nvmeio *ctnio); +int ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio); +int ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio); +int ctl_nvme_verify(struct ctl_nvmeio *ctnio); + void ctl_tpc_init(struct ctl_softc *softc); void ctl_tpc_shutdown(struct ctl_softc *softc); void ctl_tpc_lun_init(struct ctl_lun *lun); diff --git a/sys/conf/files b/sys/conf/files index 266018c5c70d..b23ec357a302 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -109,6 +109,7 @@ cam/ctl/ctl_frontend_ioctl.c optional ctl cam/ctl/ctl_frontend_iscsi.c optional ctl cfiscsi cam/ctl/ctl_ha.c optional ctl cam/ctl/ctl_nvme_all.c optional ctl +cam/ctl/ctl_nvme_cmd_table.c optional ctl cam/ctl/ctl_scsi_all.c optional ctl cam/ctl/ctl_tpc.c optional ctl cam/ctl/ctl_tpc_local.c optional ctl diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile index 32f150b41300..9fb94ddc9bca 100644 --- a/sys/modules/ctl/Makefile +++ b/sys/modules/ctl/Makefile @@ -13,6 +13,7 @@ SRCS+= ctl_frontend_cam_sim.c SRCS+= ctl_frontend_ioctl.c SRCS+= ctl_ha.c SRCS+= ctl_nvme_all.c +SRCS+= ctl_nvme_cmd_table.c SRCS+= ctl_scsi_all.c SRCS+= ctl_tpc.c SRCS+= ctl_tpc_local.c