@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
struct NvmeNamespace *ns;
BlockAIOCB *aiocb;
uint16_t status;
+ void *opaque;
NvmeCqe cqe;
NvmeCmd cmd;
BlockAcctCookie acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+ case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
default: return "NVME_NVM_CMD_UNKNOWN";
}
}
@@ -28,10 +28,14 @@
#include "nvme.h"
#include "nvme-ns.h"
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
+ BlockDriverInfo bdi;
NvmeIdNs *id_ns = &ns->id_ns;
int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+ int npdg, ret;
ns->id_ns.dlfeat = 0x9;
@@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
id_ns->ncap = id_ns->nsze;
id_ns->nuse = id_ns->ncap;
- /* support DULBE */
- id_ns->nsfeat |= 0x4;
+ /* support DULBE and I/O optimization fields */
+ id_ns->nsfeat |= (0x4 | 0x10);
+
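+ /*
+ * The preferred deallocate granularity is expressed in logical blocks;
+ * start from the configured discard granularity and switch to the image
+ * cluster size below if that turns out to be larger.
+ */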
+ npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+ ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not get block driver info");
+ return ret;
+ }
+
+ if (bdi.cluster_size &&
+ bdi.cluster_size > ns->blkconf.discard_granularity) {
+ npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+ }
+
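+ /* NPDG and NPDA are 0's based values */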
+ id_ns->npda = id_ns->npdg = npdg - 1;
+
+ return 0;
}
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1;
}
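+ /*
+ * A discard_granularity of -1 means it was not set explicitly; default
+ * to the logical block size, but never less than 4 KiB.
+ */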
+ if (ns->blkconf.discard_granularity == -1) {
+ ns->blkconf.discard_granularity =
+ MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+ }
+
ns->size = blk_getlength(ns->blkconf.blk);
if (ns->size < 0) {
error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1;
}
- nvme_ns_init(ns);
+ if (nvme_ns_init(ns, errp)) {
+ return -1;
+ }
if (nvme_register_namespace(n, ns, errp)) {
return -1;
@@ -959,6 +959,104 @@ static void nvme_rw_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+ NvmeRequest *req = opaque;
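+ /* req->opaque counts the discards still in flight for this request */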
+ uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+ trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+ if (ret) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret),
+ req->status);
+ }
+
+ (*discards)--;
+
+ if (*discards) {
+ return;
+ }
+
+ req->opaque = NULL;
+
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+
+ uint32_t attr = le32_to_cpu(dsm->attributes);
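+ /* the number of ranges (NR) is a 0's based value */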
+ uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+ uint16_t status = NVME_SUCCESS;
+
+ trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
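+ /* only act on ranges marked for deallocation (AD) */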
+ if (attr & NVME_DSMGMT_AD) {
+ int64_t offset;
+ size_t len;
+ NvmeDsmRange range[nr];
+ uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+ status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+ DMA_DIRECTION_TO_DEVICE, req);
+ if (status) {
+ return status;
+ }
+
+ /*
+ * AIO callbacks may be called immediately, so initialize discards to 1
+ * to make sure the callback does not complete the request before
+ * all discards have been issued.
+ */
+ *discards = 1;
+
+ for (int i = 0; i < nr; i++) {
+ uint64_t slba = le64_to_cpu(range[i].slba);
+ uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+ if (nvme_check_bounds(n, ns, slba, nlb)) {
+ trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+ ns->id_ns.nsze);
+ continue;
+ }
+
+ trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+ nlb);
+
+ offset = nvme_l2b(ns, slba);
+ len = nvme_l2b(ns, nlb);
+
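+ /*
+ * Issue the discard in chunks no larger than what the block layer
+ * accepts in a single request (BDRV_REQUEST_MAX_BYTES).
+ */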
+ while (len) {
+ size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+ (*discards)++;
+
+ blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+ nvme_aio_discard_cb, req);
+
+ offset += bytes;
+ len -= bytes;
+ }
+ }
+
+ /* account for the 1-initialization */
+ (*discards)--;
+
+ if (*discards) {
+ status = NVME_NO_COMPLETE;
+ } else {
+ req->opaque = NULL;
+ status = req->status;
+ }
+ }
+
+ return status;
+}
+
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1186,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
case NVME_CMD_WRITE:
case NVME_CMD_READ:
return nvme_rw(n, req);
+ case NVME_CMD_DSM:
+ return nvme_dsm(n, req);
default:
trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2813,7 +2913,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->cqes = (0x4 << 4) | 0x4;
id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
- NVME_ONCS_FEATURES);
+ NVME_ONCS_FEATURES | NVME_ONCS_DSM);
id->vwc = 0x1;
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |