diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c index 59e8cba1b671..039670c17529 100644 --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include @@ -447,6 +448,8 @@ static int ctl_scsiio_lun_check(struct ctl_lun *lun, static void ctl_failover_lun(union ctl_io *io); static void ctl_scsiio_precheck(struct ctl_scsiio *ctsio); static int ctl_scsiio(struct ctl_scsiio *ctsio); +static void ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio); +static int ctl_nvmeio(struct ctl_nvmeio *ctnio); static int ctl_target_reset(union ctl_io *io); static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx, @@ -4963,7 +4966,6 @@ ctl_config_move_done(union ctl_io *io, bool samethr) int retval; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); - CTL_IO_ASSERT(io, SCSI); if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); @@ -4998,7 +5000,17 @@ ctl_config_move_done(union ctl_io *io, bool samethr) * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. */ - retval = ctl_scsiio(&io->scsiio); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + retval = ctl_scsiio(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + retval = ctl_nvmeio(&io->nvmeio); + break; + default: + __assert_unreachable(); + } } return (retval); } @@ -10598,6 +10610,725 @@ ctl_read_toc(struct ctl_scsiio *ctsio) return (CTL_RETVAL_COMPLETE); } +/* + * For NVMe commands, parse the LBA and length. + */ +static bool +ctl_nvme_get_lba_len(struct ctl_nvmeio *ctnio, uint64_t *lba, uint32_t *len) +{ + CTL_IO_ASSERT(ctnio, NVME); + + switch (ctnio->cmd.opc) { + case NVME_OPC_WRITE: + case NVME_OPC_READ: + case NVME_OPC_WRITE_UNCORRECTABLE: + case NVME_OPC_COMPARE: + case NVME_OPC_WRITE_ZEROES: + case NVME_OPC_VERIFY: + *lba = (uint64_t)le32toh(ctnio->cmd.cdw11) << 32 | + le32toh(ctnio->cmd.cdw10); + *len = (le32toh(ctnio->cmd.cdw12) & 0xffff) + 1; + return (true); + default: + *lba = 0; + *len = 0; + return (false); + } +} + +static bool +ctl_nvme_fua(struct ctl_nvmeio *ctnio) +{ + return ((le32toh(ctnio->cmd.cdw12) & (1U << 30)) != 0); +} + +int +ctl_nvme_identify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + size_t len; + int retval; + uint8_t cns; + + CTL_DEBUG_PRINT(("ctl_nvme_identify\n")); + + CTL_IO_ASSERT(ctnio, NVME_ADMIN); + MPASS(ctnio->cmd.opc == NVME_OPC_IDENTIFY); + + /* + * The data buffer for Identify is always 4096 bytes, see + * 5.51.1 in NVMe base specification 1.4. + */ + len = 4096; + + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + + ctl_nvme_set_success(ctnio); + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + + /* + * If we don't have a LUN, return an empty result for CNS == 0. + */ + if (lun == NULL) { + cns = le32toh(ctnio->cmd.cdw10) & 0xff; + switch (cns) { + case 0: + memset(ctnio->kern_data_ptr, 0, len); + ctl_datamove((union ctl_io *)ctnio); + break; + default: + ctl_nvme_set_invalid_field(ctnio); + break; + } + return (CTL_RETVAL_COMPLETE); + } + + retval = lun->backend->config_read((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_flush(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_flush\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_FLUSH); + + /* + * NVMe flushes always flush the entire namespace, not an LBA + * range. + */ + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_read_write(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags, retval; + bool isread; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: command: %#x\n", + ctnio->cmd.opc)); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE || + ctnio->cmd.opc == NVME_OPC_READ); + + flags = 0; + isread = ctnio->cmd.opc == NVME_OPC_READ; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * Set FUA and/or DPO if caches are disabled. + * + * For a read this may not be quite correct for the block + * backend as any earlier writes to the LBA range should be + * flushed to backing store as part of the read. + */ + if (ctl_nvme_fua(ctnio)) { + flags |= CTL_LLF_FUA; + if (isread) + flags |= CTL_LLF_DPO; + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags; + + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_read_write: calling data_submit()\n")); + + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_uncorrectable\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_UNCORRECTABLE); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_compare(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_compare\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_COMPARE); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_COMPARE | flags; + ctnio->kern_total_len = num_blocks * lun->be_lun->blocksize; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_compare: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_write_zeroes\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_WRITE_ZEROES); + + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = 0; + retval = lun->backend->config_write((union ctl_io *)ctnio); + + return (retval); +} + +int +ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct nvme_dsm_range *r; + uint64_t lba; + uint32_t len, num_blocks; + u_int i, ranges; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_DATASET_MANAGEMENT); + + ranges = le32toh(ctnio->cmd.cdw10) & 0xff; + len = ranges * sizeof(struct nvme_dsm_range); + + /* + * If we've got a kernel request that hasn't been malloced yet, + * malloc it and tell the caller the data buffer is here. + */ + if ((ctnio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { + ctnio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); + ctnio->kern_data_len = len; + ctnio->kern_total_len = len; + ctnio->kern_rel_offset = 0; + ctnio->kern_sg_entries = 0; + ctnio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctnio->be_move_done = ctl_config_move_done; + ctl_datamove((union ctl_io *)ctnio); + + return (CTL_RETVAL_COMPLETE); + } + + /* + * Require a flat buffer of the correct size. + */ + if (ctnio->kern_sg_entries > 0 || + ctnio->kern_total_len - ctnio->kern_data_resid != len) + return (CTL_RETVAL_ERROR); + + /* + * Verify that none of the ranges are out of bounds. + */ + r = (struct nvme_dsm_range *)ctnio->kern_data_ptr; + for (i = 0; i < ranges; i++) { + lba = le64toh(r[i].starting_lba); + num_blocks = le32toh(r[i].length); + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + } + + CTL_DEBUG_PRINT(("ctl_nvme_dataset_management: calling config_write()\n")); + retval = lun->backend->config_write((union ctl_io *)ctnio); + return (retval); +} + +int +ctl_nvme_verify(struct ctl_nvmeio *ctnio) +{ + struct ctl_lun *lun = CTL_LUN(ctnio); + struct ctl_lba_len_flags *lbalen; + uint64_t lba; + uint32_t num_blocks; + int flags; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvme_verify\n")); + + CTL_IO_ASSERT(ctnio, NVME); + MPASS(ctnio->cmd.opc == NVME_OPC_VERIFY); + + flags = 0; + ctl_nvme_get_lba_len(ctnio, &lba, &num_blocks); + if (ctl_nvme_fua(ctnio)) + flags |= CTL_LLF_FUA; + + /* + * The first check is to make sure we're in bounds, the second + * check is to catch wrap-around problems. If the lba + num blocks + * is less than the lba, then we've wrapped around and the block + * range is invalid anyway. + */ + if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) + || ((lba + num_blocks) < lba)) { + ctl_nvme_set_lba_out_of_range(ctnio); + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + lbalen = (struct ctl_lba_len_flags *) + &ctnio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; + lbalen->lba = lba; + lbalen->len = num_blocks; + lbalen->flags = CTL_LLF_VERIFY | flags; + ctnio->kern_total_len = 0; + ctnio->kern_rel_offset = 0; + + CTL_DEBUG_PRINT(("ctl_nvme_verify: calling data_submit()\n")); + retval = lun->backend->data_submit((union ctl_io *)ctnio); + return (retval); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_get_cmd_entry(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + switch (ctnio->io_hdr.io_type) { + case CTL_IO_NVME: + entry = &nvme_nvm_cmd_table[ctnio->cmd.opc]; + break; + case CTL_IO_NVME_ADMIN: + entry = &nvme_admin_cmd_table[ctnio->cmd.opc]; + break; + default: + __assert_unreachable(); + } + return (entry); +} + +static const struct ctl_nvme_cmd_entry * +ctl_nvme_validate_command(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + + entry = ctl_nvme_get_cmd_entry(ctnio); + if (entry->execute == NULL) { + ctl_nvme_set_invalid_opcode(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + /* Validate fused commands. */ + switch (NVMEV(NVME_CMD_FUSE, ctnio->cmd.fuse)) { + case NVME_FUSE_NORMAL: + break; + case NVME_FUSE_FIRST: + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + case NVME_FUSE_SECOND: + if (ctnio->io_hdr.io_type != CTL_IO_NVME || + ctnio->cmd.opc != NVME_OPC_COMPARE) { + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + break; + default: + ctl_nvme_set_invalid_field(ctnio); + ctl_done((union ctl_io *)ctnio); + return (NULL); + } + + return (entry); +} + +/* + * This is a simpler version of ctl_scsiio_lun_check that fails + * requests on a LUN without active media. + * + * Returns true if the command has been completed with an error. + */ +static bool +ctl_nvmeio_lun_check(struct ctl_lun *lun, + const struct ctl_nvme_cmd_entry *entry, struct ctl_nvmeio *ctnio) +{ + mtx_assert(&lun->lun_lock, MA_OWNED); + + if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) == 0) { + if ((lun->flags & (CTL_LUN_EJECTED | CTL_LUN_NO_MEDIA | + CTL_LUN_STOPPED)) != 0) { + ctl_nvme_set_namespace_not_ready(ctnio); + return (true); + } + } + + return (false); +} + +/* + * Check for blockage against the OOA (Order Of Arrival) queue. + * Assumptions: + * - pending_io is generally either incoming, or on the blocked queue + * - starting I/O is the I/O we want to start the check with. + */ +static ctl_action +ctl_nvme_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, + union ctl_io **starting_io, union ctl_io **aborted_io) +{ + union ctl_io *ooa_io = *starting_io; + + CTL_IO_ASSERT(pending_io, NVME, NVME_ADMIN); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + *aborted_io = NULL; + + /* + * Aborted commands are not going to be executed and may even + * not report completion, so we don't care about their order. + * Let them complete ASAP to clean the OOA queue. + */ + if (__predict_false(pending_io->io_hdr.flags & CTL_FLAG_ABORT)) + return (CTL_ACTION_PASS); + + /* + * NVMe has rather simple command ordering requirements. In + * particular, there is no requirement on the controller to + * enforce a specific order for overlapping LBAs. The only + * constraint is that fused operations (Compare and Write), + * must be completed as a unit. + * + * To support fused operations, the following strategy is used: + * - the first half of a fused command is not enqueued to rtr + * until the second half is enqueued + * - the second half of a fused command blocks on the first + * half of a fuse command + * - subsequent commands block on the second half of the + * fused command + */ + + /* + * Is the previously submitted command the first half of a + * fused operation? + */ + if (ooa_io != NULL && + NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == NVME_FUSE_FIRST) { + /* + * If this is the second half, enqueue the first half + * and block the second half on the first half. + */ + if (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND) { + /* + * XXX: Do we need to wait for other rtr requests + * to drain so this is truly atomic? + */ + return (CTL_ACTION_FUSED); + } + + /* Abort the first half. */ + ctl_nvme_set_missing_fused_command(&ooa_io->nvmeio); + *aborted_io = ooa_io; + } else { + switch (NVMEV(NVME_CMD_FUSE, pending_io->nvmeio.cmd.fuse)) { + case NVME_FUSE_FIRST: + /* First half, wait for the second half. */ + return (CTL_ACTION_SKIP); + case NVME_FUSE_SECOND: + /* Second half without a matching first half, abort. */ + ctl_nvme_set_missing_fused_command(&pending_io->nvmeio); + *aborted_io = pending_io; + return (CTL_ACTION_SKIP); + } + } + + /* + * Scan the OOA queue looking for the most recent second half + * of a fused op. + */ + for (; ooa_io != NULL; + ooa_io = (union ctl_io *)LIST_NEXT(&ooa_io->io_hdr, ooa_links)) { + if (NVMEV(NVME_CMD_FUSE, ooa_io->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND) { + *starting_io = ooa_io; + return (CTL_ACTION_BLOCK); + } + } + + *starting_io = NULL; + return (CTL_ACTION_PASS); +} + +static void +ctl_nvmeio_precheck(struct ctl_nvmeio *ctnio) +{ + struct ctl_softc *softc = CTL_SOFTC(ctnio); + struct ctl_lun *lun; + const struct ctl_nvme_cmd_entry *entry; + union ctl_io *bio, *aborted_io; + uint32_t targ_lun; + + lun = NULL; + targ_lun = ctnio->io_hdr.nexus.targ_mapped_lun; + if (targ_lun < ctl_max_luns) + lun = softc->ctl_luns[targ_lun]; + if (lun != NULL) { + /* + * If the LUN is invalid, pretend that it doesn't exist. + * It will go away as soon as all pending I/O has been + * completed. + */ + mtx_lock(&lun->lun_lock); + if (lun->flags & CTL_LUN_DISABLED) { + mtx_unlock(&lun->lun_lock); + lun = NULL; + } + } + CTL_LUN(ctnio) = lun; + if (lun != NULL) { + CTL_BACKEND_LUN(ctnio) = lun->be_lun; + + /* + * Every I/O goes into the OOA queue for a particular LUN, + * and stays there until completion. + */ +#ifdef CTL_TIME_IO + if (LIST_EMPTY(&lun->ooa_queue)) + lun->idle_time += getsbinuptime() - lun->last_busy; +#endif + LIST_INSERT_HEAD(&lun->ooa_queue, &ctnio->io_hdr, ooa_links); + } + + /* Get command entry and return error if it is unsupported. */ + entry = ctl_nvme_validate_command(ctnio); + if (entry == NULL) { + if (lun) + mtx_unlock(&lun->lun_lock); + return; + } + + ctnio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; + ctnio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; + + /* All NVMe commands other than IDENTIFY require a LUN. */ + if (lun == NULL) { + if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) { + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr((union ctl_io *)ctnio); + return; + } + + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + CTL_DEBUG_PRINT(("ctl_nvmeio_precheck: bailing out due to invalid LUN\n")); + return; + } else { + /* + * NVMe namespaces can only be backed by T_DIRECT LUNs. + */ + if (lun->be_lun->lun_type != T_DIRECT) { + mtx_unlock(&lun->lun_lock); + ctl_nvme_set_invalid_namespace(ctnio); + ctl_done((union ctl_io *)ctnio); + return; + } + } + + if (ctl_nvmeio_lun_check(lun, entry, ctnio) != 0) { + mtx_unlock(&lun->lun_lock); + ctl_done((union ctl_io *)ctnio); + return; + } + + bio = (union ctl_io *)LIST_NEXT(&ctnio->io_hdr, ooa_links); + switch (ctl_nvme_check_ooa(lun, (union ctl_io *)ctnio, &bio, + &aborted_io)) { + case CTL_ACTION_PASS: + ctnio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr((union ctl_io *)ctnio); + break; + case CTL_ACTION_FUSED: + /* Block the second half on the first half. */ + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + + /* Pass the first half. */ + bio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + mtx_unlock(&lun->lun_lock); + ctl_enqueue_rtr(bio); + break; + case CTL_ACTION_SKIP: + mtx_unlock(&lun->lun_lock); + break; + case CTL_ACTION_BLOCK: + ctnio->io_hdr.blocker = bio; + TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctnio->io_hdr, + blocked_links); + mtx_unlock(&lun->lun_lock); + break; + default: + __assert_unreachable(); + } + if (aborted_io != NULL) + ctl_done(aborted_io); +} + +static int +ctl_nvmeio(struct ctl_nvmeio *ctnio) +{ + const struct ctl_nvme_cmd_entry *entry; + int retval; + + CTL_DEBUG_PRINT(("ctl_nvmeio %s opc=%02X\n", + ctnio->io_hdr.io_type == CTL_IO_NVME ? "nvm" : "admin", + ctnio->cmd.opc)); + + entry = ctl_nvme_get_cmd_entry(ctnio); + MPASS(entry != NULL); + + /* + * If this I/O has been aborted, just send it straight to + * ctl_done() without executing it. + */ + if (ctnio->io_hdr.flags & CTL_FLAG_ABORT) { + ctl_done((union ctl_io *)ctnio); + return (CTL_RETVAL_COMPLETE); + } + + /* + * All the checks should have been handled by ctl_nvmeio_precheck(). + * We should be clear now to just execute the I/O. + */ + retval = entry->execute(ctnio); + + return (retval); +} + /* * For known CDB types, parse the LBA and length. */ @@ -11016,7 +11747,7 @@ ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, * we know for sure that the blocker I/O does no longer count. */ static void -ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +ctl_scsi_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) { struct ctl_softc *softc = lun->ctl_softc; union ctl_io *bio, *obio; @@ -11111,6 +11842,72 @@ error: } } +static void +ctl_nvme_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + union ctl_io *bio; + const struct ctl_nvme_cmd_entry *entry; + + CTL_IO_ASSERT(io, NVME, NVME_ADMIN); + + mtx_assert(&lun->lun_lock, MA_OWNED); + + if (io->io_hdr.blocker == NULL) + return; + + /* + * If this is the second half of a fused operation, it should + * be the only io on the blocked list. If the first half + * failed, complete the second half with an appropriate error. + */ + bio = io->io_hdr.blocker; + if (NVMEV(NVME_CMD_FUSE, io->nvmeio.cmd.fuse) == NVME_FUSE_SECOND) { + MPASS(io == + (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue)); + MPASS(TAILQ_NEXT(&io->io_hdr, blocked_links) == NULL); + + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + if (bio->io_hdr.status != CTL_SUCCESS) { + ctl_nvme_set_failed_fused_command(&io->nvmeio); + ctl_done(io); + return; + } + } else { + /* + * This must be a command that was blocked on the + * second half of a fused operation. + */ + MPASS(NVMEV(NVME_CMD_FUSE, bio->nvmeio.cmd.fuse) == + NVME_FUSE_SECOND); + TAILQ_REMOVE(&bio->io_hdr.blocked_queue, &io->io_hdr, + blocked_links); + } + + entry = ctl_nvme_get_cmd_entry(&io->nvmeio); + if (ctl_nvmeio_lun_check(lun, entry, &io->nvmeio) != 0) { + ctl_done(io); + return; + } + + io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; + ctl_enqueue_rtr(io); +} + +static void +ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) +{ + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + return (ctl_scsi_try_unblock_io(lun, io, skip)); + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + return (ctl_nvme_try_unblock_io(lun, io, skip)); + default: + __assert_unreachable(); + } +} + /* * Try to unblock I/Os blocked by the specified I/O. * @@ -11824,6 +12621,7 @@ ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id, */ LIST_FOREACH(xioh, &lun->ooa_queue, ooa_links) { union ctl_io *xio = (union ctl_io *)xioh; + if ((targ_port == UINT32_MAX || targ_port == xioh->nexus.targ_port) && (init_id == UINT32_MAX || @@ -13196,7 +13994,22 @@ ctl_queue(union ctl_io *io) { struct ctl_port *port = CTL_PORT(io); - CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + case CTL_IO_TASK: + CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); + break; + case CTL_IO_NVME: + CTL_DEBUG_PRINT(("ctl_queue nvme nvm cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + case CTL_IO_NVME_ADMIN: + CTL_DEBUG_PRINT(("ctl_queue nvme admin cmd=%02X\n", + io->nvmeio.cmd.opc)); + break; + default: + break; + } #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; @@ -13210,6 +14023,8 @@ ctl_queue(union ctl_io *io) switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: if (ctl_debug & CTL_DEBUG_CDB) ctl_io_print(io); ctl_enqueue_incoming(io); @@ -13249,6 +14064,12 @@ ctl_run(union ctl_io *io) ctl_io_print(io); ctl_run_task(io); break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + if (ctl_debug & CTL_DEBUG_CDB) + ctl_io_print(io); + ctl_nvmeio_precheck(&io->nvmeio); + break; default: printf("ctl_run: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); @@ -13418,19 +14239,41 @@ ctl_work_thread(void *arg) if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->incoming_queue, links); mtx_unlock(&thr->queue_lock); - if (io->io_hdr.io_type == CTL_IO_TASK) + switch (io->io_hdr.io_type) { + case CTL_IO_TASK: ctl_run_task(io); - else + break; + case CTL_IO_SCSI: ctl_scsiio_precheck(&io->scsiio); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + ctl_nvmeio_precheck(&io->nvmeio); + break; + default: + __assert_unreachable(); + } continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->rtr_queue, links); mtx_unlock(&thr->queue_lock); - retval = ctl_scsiio(&io->scsiio); - if (retval != CTL_RETVAL_COMPLETE) - CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); + switch (io->io_hdr.io_type) { + case CTL_IO_SCSI: + retval = ctl_scsiio(&io->scsiio); + if (retval != CTL_RETVAL_COMPLETE) + CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); + break; + case CTL_IO_NVME: + case CTL_IO_NVME_ADMIN: + retval = ctl_nvmeio(&io->nvmeio); + if (retval != CTL_RETVAL_COMPLETE) + CTL_DEBUG_PRINT(("ctl_nvmeio failed\n")); + break; + default: + __assert_unreachable(); + } continue; } diff --git a/sys/cam/ctl/ctl_nvme_cmd_table.c b/sys/cam/ctl/ctl_nvme_cmd_table.c new file mode 100644 index 000000000000..39e60de62b7f --- /dev/null +++ b/sys/cam/ctl/ctl_nvme_cmd_table.c @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Chelsio Communications, Inc. + */ + +#include + +#include +#include +#include +#include +#include + +/* Administrative Command Set (CTL_IO_NVME_ADMIN). */ +const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256] = +{ + [NVME_OPC_IDENTIFY] = { ctl_nvme_identify, CTL_FLAG_DATA_IN | + CTL_CMD_FLAG_OK_ON_NO_LUN }, +}; + +/* NVM Command Set (CTL_IO_NVME). */ +const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256] = +{ + [NVME_OPC_FLUSH] = { ctl_nvme_flush, CTL_FLAG_DATA_NONE }, + [NVME_OPC_WRITE] = { ctl_nvme_read_write, CTL_FLAG_DATA_OUT }, + [NVME_OPC_READ] = { ctl_nvme_read_write, CTL_FLAG_DATA_IN }, + [NVME_OPC_WRITE_UNCORRECTABLE] = { ctl_nvme_write_uncorrectable, + CTL_FLAG_DATA_NONE }, + [NVME_OPC_COMPARE] = { ctl_nvme_compare, CTL_FLAG_DATA_OUT }, + [NVME_OPC_WRITE_ZEROES] = { ctl_nvme_write_zeroes, CTL_FLAG_DATA_NONE }, + [NVME_OPC_DATASET_MANAGEMENT] = { ctl_nvme_dataset_management, + CTL_FLAG_DATA_OUT }, + [NVME_OPC_VERIFY] = { ctl_nvme_verify, CTL_FLAG_DATA_NONE }, +}; diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h index 9dfe979bcb7f..cd7e499c60a6 100644 --- a/sys/cam/ctl/ctl_private.h +++ b/sys/cam/ctl/ctl_private.h @@ -78,7 +78,8 @@ typedef enum { CTL_ACTION_SKIP, CTL_ACTION_BLOCK, CTL_ACTION_OVERLAP, - CTL_ACTION_OVERLAP_TAG + CTL_ACTION_OVERLAP_TAG, + CTL_ACTION_FUSED, } ctl_action; /* @@ -139,6 +140,12 @@ struct ctl_cmd_entry { * after the opcode byte. */ }; +/* Only data flags are currently used for NVMe commands. */ +struct ctl_nvme_cmd_entry { + int (*execute)(struct ctl_nvmeio *); + ctl_io_flags flags; +}; + typedef enum { CTL_LUN_NONE = 0x000, CTL_LUN_CONTROL = 0x001, @@ -412,6 +419,8 @@ struct ctl_softc { #ifdef _KERNEL extern const struct ctl_cmd_entry ctl_cmd_table[256]; +extern const struct ctl_nvme_cmd_entry nvme_admin_cmd_table[256]; +extern const struct ctl_nvme_cmd_entry nvme_nvm_cmd_table[256]; uint32_t ctl_get_initindex(struct ctl_nexus *nexus); int ctl_lun_map_init(struct ctl_port *port); @@ -459,6 +468,15 @@ int ctl_report_supported_tmf(struct ctl_scsiio *ctsio); int ctl_report_timestamp(struct ctl_scsiio *ctsio); int ctl_get_lba_status(struct ctl_scsiio *ctsio); +int ctl_nvme_identify(struct ctl_nvmeio *ctnio); +int ctl_nvme_flush(struct ctl_nvmeio *ctnio); +int ctl_nvme_read_write(struct ctl_nvmeio *ctnio); +int ctl_nvme_write_uncorrectable(struct ctl_nvmeio *ctnio); +int ctl_nvme_compare(struct ctl_nvmeio *ctnio); +int ctl_nvme_write_zeroes(struct ctl_nvmeio *ctnio); +int ctl_nvme_dataset_management(struct ctl_nvmeio *ctnio); +int ctl_nvme_verify(struct ctl_nvmeio *ctnio); + void ctl_tpc_init(struct ctl_softc *softc); void ctl_tpc_shutdown(struct ctl_softc *softc); void ctl_tpc_lun_init(struct ctl_lun *lun); diff --git a/sys/conf/files b/sys/conf/files index 266018c5c70d..b23ec357a302 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -109,6 +109,7 @@ cam/ctl/ctl_frontend_ioctl.c optional ctl cam/ctl/ctl_frontend_iscsi.c optional ctl cfiscsi cam/ctl/ctl_ha.c optional ctl cam/ctl/ctl_nvme_all.c optional ctl +cam/ctl/ctl_nvme_cmd_table.c optional ctl cam/ctl/ctl_scsi_all.c optional ctl cam/ctl/ctl_tpc.c optional ctl cam/ctl/ctl_tpc_local.c optional ctl diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile index 32f150b41300..9fb94ddc9bca 100644 --- a/sys/modules/ctl/Makefile +++ b/sys/modules/ctl/Makefile @@ -13,6 +13,7 @@ SRCS+= ctl_frontend_cam_sim.c SRCS+= ctl_frontend_ioctl.c SRCS+= ctl_ha.c SRCS+= ctl_nvme_all.c +SRCS+= ctl_nvme_cmd_table.c SRCS+= ctl_scsi_all.c SRCS+= ctl_tpc.c SRCS+= ctl_tpc_local.c