Message ID: 20201022184959.240505-3-its@irrelevant.dk
State: New
Series: hw/block/nvme: dulbe and dsm support
Hi Klaus,

On 10/22/20 8:49 PM, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
>
> Add support for the Dataset Management command and the Deallocate
> attribute. Deallocation results in discards being sent to the underlying
> block device. Whether or not the blocks are actually deallocated is
> affected by the same factors as Write Zeroes (see previous commit).
>
>   format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)

Please use B/KiB units which are unambiguous (kb is for kbits)
(if you queue this yourself, you can fix when applying, no need
to repost).

>   ------------------------------------------------------
>   qcow2  | ignore  |     n           n          n
>   qcow2  | unmap   |     n           n          y
>   raw    | ignore  |     n           n          n
>   raw    | unmap   |     n           y          y
>
> Again, a raw format and 4kb LBAs are preferable.
>
> In order to set the Namespace Preferred Deallocate Granularity and
> Alignment fields (NPDG and NPDA), choose a sane minimum discard
> granularity of 4kb. If we are using a passthru device supporting discard
> at a 512b granularity, user should set the discard_granularity property

Ditto.

> explicitly. NPDG and NPDA will also account for the cluster_size of the
> block driver if required (i.e. for QCOW2).
>
> See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  hw/block/nvme.h      |   2 +
>  include/block/nvme.h |   7 ++-
>  hw/block/nvme-ns.c   |  36 +++++++++++++--
>  hw/block/nvme.c      | 101 ++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 140 insertions(+), 6 deletions(-)
>
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index e080a2318a50..574333caa3f9 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -28,6 +28,7 @@ typedef struct NvmeRequest {
>      struct NvmeNamespace    *ns;
>      BlockAIOCB              *aiocb;
>      uint16_t                status;
> +    void                    *opaque;
>      NvmeCqe                 cqe;
>      NvmeCmd                 cmd;
>      BlockAcctCookie         acct;
> @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
>      case NVME_CMD_WRITE:        return "NVME_NVM_CMD_WRITE";
>      case NVME_CMD_READ:         return "NVME_NVM_CMD_READ";
>      case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
> +    case NVME_CMD_DSM:          return "NVME_NVM_CMD_DSM";
>      default:                    return "NVME_NVM_CMD_UNKNOWN";
>      }
>  }
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 966c3bb304bd..e95ff6ca9b37 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
>      uint16_t    nabspf;
>      uint16_t    noiob;
>      uint8_t     nvmcap[16];
> -    uint8_t     rsvd64[40];
> +    uint16_t    npwg;
> +    uint16_t    npwa;
> +    uint16_t    npdg;
> +    uint16_t    npda;
> +    uint16_t    nows;
> +    uint8_t     rsvd74[30];
>      uint8_t     nguid[16];
>      uint64_t    eui64;
>      NvmeLBAF    lbaf[16];

If you consider "block/nvme.h" shared by 2 different subsystems,
it is better to add the changes in a separate patch. That way
the changes can be acked individually.

> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index f1cc734c60f5..840651db7256 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -28,10 +28,14 @@
>  #include "nvme.h"
>  #include "nvme-ns.h"
>
> -static void nvme_ns_init(NvmeNamespace *ns)
> +#define MIN_DISCARD_GRANULARITY (4 * KiB)
> +
> +static int nvme_ns_init(NvmeNamespace *ns, Error **errp)

Hmm the Error* argument could be squashed in "hw/block/nvme:
support multiple namespaces". Else better split patch in dumb
units IMHO (maybe a reviewer's taste).
> {
> +    BlockDriverInfo bdi;
>      NvmeIdNs *id_ns = &ns->id_ns;
>      int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    int npdg, ret;
>
>      ns->id_ns.dlfeat = 0x9;
>
> @@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
>      id_ns->ncap = id_ns->nsze;
>      id_ns->nuse = id_ns->ncap;
>
> -    /* support DULBE */
> -    id_ns->nsfeat |= 0x4;
> +    /* support DULBE and I/O optimization fields */
> +    id_ns->nsfeat |= (0x4 | 0x10);

The comment helps, but isn't needed if you use explicit definitions
for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
This is why I personally prefer the registerfields API (see
"hw/registerfields.h").

> +
> +    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
> +
> +    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret, "could not get block driver info");
> +        return ret;
> +    }
> +
> +    if (bdi.cluster_size &&
> +        bdi.cluster_size > ns->blkconf.discard_granularity) {
> +        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
> +    }
> +
> +    id_ns->npda = id_ns->npdg = npdg - 1;
> +
> +    return 0;
>  }
>
>  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>          return -1;
>      }
>
> +    if (ns->blkconf.discard_granularity == -1) {
> +        ns->blkconf.discard_granularity =
> +            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
> +    }
> +
>      ns->size = blk_getlength(ns->blkconf.blk);
>      if (ns->size < 0) {
>          error_setg_errno(errp, -ns->size, "could not get blockdev size");
> @@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>          return -1;
>      }
>
> -    nvme_ns_init(ns);
> +    if (nvme_ns_init(ns, errp)) {
> +        return -1;
> +    }
>
>      if (nvme_register_namespace(n, ns, errp)) {
>          return -1;
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4ab0705f5a92..7acb9e9dc38a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -959,6 +959,103 @@ static void nvme_rw_cb(void *opaque, int ret)
>      nvme_enqueue_req_completion(nvme_cq(req), req);
>  }
>
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {

This line is hard to read.

> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;

g_autofree?
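(g_autofree is GLib's scoped-cleanup attribute, available since GLib 2.44:
the annotated pointer is handed to g_free() automatically whenever it goes
out of scope, on every return path. A self-contained toy example — the
Range/consume names are made up for illustration, not from the patch:

    #include <glib.h>

    typedef struct {
        guint64 slba;
        guint32 nlb;
    } Range;

    static int consume(guint32 nr)
    {
        /* freed automatically when 'range' goes out of scope */
        g_autofree Range *range = g_new(Range, nr);

        if (nr > 256) {
            return -1;      /* early return: range is freed here too */
        }

        range[0].slba = 0;
        range[0].nlb = 1;
        return 0;           /* normal return: no explicit g_free() needed */
    }

    int main(void)
    {
        return consume(8);
    }

In nvme_dsm() this would make the "goto out"/g_free(range) unwinding at the
end of the function unnecessary.)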
> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new(int, 1);
> +        *discards = 1;
> +        req->opaque = discards;

If opaque is a pointer, why not instead using it as an uintptr_t
discards directly without using the heap?

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                (*discards)++;
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (--(*discards)) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +            status = req->status;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}

[...]
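(To make the registerfields suggestion above concrete, a rough sketch —
the NVME_NSFEAT field names here are invented for illustration; bit 2 is
DULBE and bit 4 is the "I/O optimization fields defined" bit:

    #include "hw/registerfields.h"

    /* hypothetical field definitions for the NSFEAT identify byte */
    FIELD(NVME_NSFEAT, DULBE,   2, 1)
    FIELD(NVME_NSFEAT, OPTPERF, 4, 1)

    /* replaces: id_ns->nsfeat |= (0x4 | 0x10); */
    id_ns->nsfeat = FIELD_DP8(id_ns->nsfeat, NVME_NSFEAT, DULBE, 1);
    id_ns->nsfeat = FIELD_DP8(id_ns->nsfeat, NVME_NSFEAT, OPTPERF, 1);

FIELD() defines the shift/length constants that the generic FIELD_EX8()
and FIELD_DP8() accessors use, so the same definition serves both for
setting and for extracting the bits.)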
On Thu, Oct 22, 2020 at 11:02:04PM +0200, Philippe Mathieu-Daudé wrote:
> On 10/22/20 8:49 PM, Klaus Jensen wrote:
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > +    NvmeDsmRange *range = NULL;
>
> g_autofree?

Or just allocate the array on the stack. The upper limit is just 512
entries.
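(As a sketch, that suggestion amounts to the following change in
nvme_dsm() — the NR field is masked to 8 bits in the patch, so nr is
bounded at 256, and with sizeof(NvmeDsmRange) = 16 the array needs at
most 4 KiB of stack:

    -    NvmeDsmRange *range = NULL;
    +    NvmeDsmRange range[256];    /* nr = (dsm->nr & 0xff) + 1 <= 256 */

with the g_new()/g_free(range) pair and the "goto out" unwinding dropped.)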
On Oct 22 23:02, Philippe Mathieu-Daudé wrote:
> Hi Klaus,
>

Hi Philippe,

Thanks for your comments!

> On 10/22/20 8:49 PM, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> >
> > Add support for the Dataset Management command and the Deallocate
> > attribute. Deallocation results in discards being sent to the underlying
> > block device. Whether or not the blocks are actually deallocated is
> > affected by the same factors as Write Zeroes (see previous commit).
> >
> >   format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)
>
> Please use B/KiB units which are unambiguous (kb is for kbits)
> (if you queue this yourself, you can fix when applying, no need
> to repost).
>

Thanks, I'll change it.

> >   ------------------------------------------------------
> >   qcow2  | ignore  |     n           n          n
> >   qcow2  | unmap   |     n           n          y
> >   raw    | ignore  |     n           n          n
> >   raw    | unmap   |     n           y          y
> >
> > Again, a raw format and 4kb LBAs are preferable.
> >
> > In order to set the Namespace Preferred Deallocate Granularity and
> > Alignment fields (NPDG and NPDA), choose a sane minimum discard
> > granularity of 4kb. If we are using a passthru device supporting discard
> > at a 512b granularity, user should set the discard_granularity property
>
> Ditto.
>
> > explicitly. NPDG and NPDA will also account for the cluster_size of the
> > block driver if required (i.e. for QCOW2).
> >
> > See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
> >
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > ---
> >  hw/block/nvme.h      |   2 +
> >  include/block/nvme.h |   7 ++-
> >  hw/block/nvme-ns.c   |  36 +++++++++++++--
> >  hw/block/nvme.c      | 101 ++++++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 140 insertions(+), 6 deletions(-)
> >
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index e080a2318a50..574333caa3f9 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -28,6 +28,7 @@ typedef struct NvmeRequest {
> >      struct NvmeNamespace    *ns;
> >      BlockAIOCB              *aiocb;
> >      uint16_t                status;
> > +    void                    *opaque;
> >      NvmeCqe                 cqe;
> >      NvmeCmd                 cmd;
> >      BlockAcctCookie         acct;
> > @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
> >      case NVME_CMD_WRITE:        return "NVME_NVM_CMD_WRITE";
> >      case NVME_CMD_READ:         return "NVME_NVM_CMD_READ";
> >      case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
> > +    case NVME_CMD_DSM:          return "NVME_NVM_CMD_DSM";
> >      default:                    return "NVME_NVM_CMD_UNKNOWN";
> >      }
> >  }
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 966c3bb304bd..e95ff6ca9b37 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
> >      uint16_t    nabspf;
> >      uint16_t    noiob;
> >      uint8_t     nvmcap[16];
> > -    uint8_t     rsvd64[40];
> > +    uint16_t    npwg;
> > +    uint16_t    npwa;
> > +    uint16_t    npdg;
> > +    uint16_t    npda;
> > +    uint16_t    nows;
> > +    uint8_t     rsvd74[30];
> >      uint8_t     nguid[16];
> >      uint64_t    eui64;
> >      NvmeLBAF    lbaf[16];
>
> If you consider "block/nvme.h" shared by 2 different subsystems,
> it is better to add the changes in a separate patch. That way
> the changes can be acked individually.
>

Sure. Some other stuff here warrants a v6 I think, so I will split it.
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index f1cc734c60f5..840651db7256 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -28,10 +28,14 @@
> >  #include "nvme.h"
> >  #include "nvme-ns.h"
> >
> > -static void nvme_ns_init(NvmeNamespace *ns)
> > +#define MIN_DISCARD_GRANULARITY (4 * KiB)
> > +
> > +static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
>
> Hmm the Error* argument could be squashed in "hw/block/nvme:
> support multiple namespaces". Else better split patch in dumb
> units IMHO (maybe a reviewer's taste).
>

Yeah, I guess I can squash that in.

> > {
> > +    BlockDriverInfo bdi;
> >      NvmeIdNs *id_ns = &ns->id_ns;
> >      int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> > +    int npdg, ret;
> >
> >      ns->id_ns.dlfeat = 0x9;
> >
> > @@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >      id_ns->ncap = id_ns->nsze;
> >      id_ns->nuse = id_ns->ncap;
> >
> > -    /* support DULBE */
> > -    id_ns->nsfeat |= 0x4;
> > +    /* support DULBE and I/O optimization fields */
> > +    id_ns->nsfeat |= (0x4 | 0x10);
>
> The comment helps, but isn't needed if you use explicit definitions
> for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
> and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
> This is why I personally prefer the registerfields API (see
> "hw/registerfields.h").
>

I've been wanting to fix those constants - but the convention that they
only extract bits pre-dates the nvme device and is from when the nvme
block driver was introduced - I have just been following the precedent
by defining them like that.

> > +
> > +    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
> > +
> > +    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
> > +    if (ret < 0) {
> > +        error_setg_errno(errp, -ret, "could not get block driver info");
> > +        return ret;
> > +    }
> > +
> > +    if (bdi.cluster_size &&
> > +        bdi.cluster_size > ns->blkconf.discard_granularity) {
> > +        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
> > +    }
> > +
> > +    id_ns->npda = id_ns->npdg = npdg - 1;
> > +
> > +    return 0;
> >  }
> >
> >  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> > @@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> >          return -1;
> >      }
> >
> > +    if (ns->blkconf.discard_granularity == -1) {
> > +        ns->blkconf.discard_granularity =
> > +            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
> > +    }
> > +
> >      ns->size = blk_getlength(ns->blkconf.blk);
> >      if (ns->size < 0) {
> >          error_setg_errno(errp, -ns->size, "could not get blockdev size");
> > @@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> >          return -1;
> >      }
> >
> > -    nvme_ns_init(ns);
> > +    if (nvme_ns_init(ns, errp)) {
> > +        return -1;
> > +    }
> >
> >      if (nvme_register_namespace(n, ns, errp)) {
> >          return -1;
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 4ab0705f5a92..7acb9e9dc38a 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -959,6 +959,103 @@ static void nvme_rw_cb(void *opaque, int ret)
> >      nvme_enqueue_req_completion(nvme_cq(req), req);
> >  }
> >
> > +static void nvme_aio_discard_cb(void *opaque, int ret)
> > +{
> > +    NvmeRequest *req = opaque;
> > +    int *discards = req->opaque;
> > +
> > +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> > +
> > +    if (ret) {
> > +        req->status = NVME_INTERNAL_DEV_ERROR;
> > +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> > +                               req->status);
> > +    }
> > +
> > +    if (discards && --(*discards) > 0) {
>
> This line is hard to read.
>

Yes. Probably too clever for my own good. I'll fix it up.

> > +        return;
> > +    }
> > +
> > +    g_free(req->opaque);
> > +    req->opaque = NULL;
> > +
> > +    nvme_enqueue_req_completion(nvme_cq(req), req);
> > +}
> > +
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > +    NvmeDsmRange *range = NULL;
>
> g_autofree?
>

What sorcery is this?! I think I'll just replace it with a stack
allocation like Keith suggested.

> > +    int *discards = NULL;
> > +
> > +    uint32_t attr = le32_to_cpu(dsm->attributes);
> > +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > +
> > +    uint16_t status = NVME_SUCCESS;
> > +
> > +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> > +
> > +    if (attr & NVME_DSMGMT_AD) {
> > +        int64_t offset;
> > +        size_t len;
> > +
> > +        range = g_new(NvmeDsmRange, nr);
> > +
> > +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> > +                          DMA_DIRECTION_TO_DEVICE, req);
> > +        if (status) {
> > +            goto out;
> > +        }
> > +
> > +        discards = g_new(int, 1);
> > +        *discards = 1;
> > +        req->opaque = discards;
>
> If opaque is a pointer, why not instead using it as an uintptr_t
> discards directly without using the heap?
>

It was a "keep it simple, stupid" thing to just do the allocation. DSM
is typically not on the fast path ;) But there is really no reason not
to use that here.
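(A sketch of that heap-free variant: the outstanding-discard count lives
directly in the pointer-sized req->opaque field, assuming the submit path
seeds it with (void *)(uintptr_t)1 and increments it with the same casts
before each blk_aio_pdiscard():

    static void nvme_aio_discard_cb(void *opaque, int ret)
    {
        NvmeRequest *req = opaque;
        uintptr_t discards = (uintptr_t)req->opaque;

        trace_pci_nvme_aio_discard_cb(nvme_cid(req));

        if (ret) {
            req->status = NVME_INTERNAL_DEV_ERROR;
            /* ret is a negative errno from the block layer */
            trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
        }

        /* drop one reference; more discards are still in flight */
        if (--discards > 0) {
            req->opaque = (void *)discards;
            return;
        }

        req->opaque = NULL;
        nvme_enqueue_req_completion(nvme_cq(req), req);
    }

This also splits the hard-to-read "discards && --(*discards) > 0"
condition into separate, readable steps.)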
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:        return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:         return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:          return "NVME_NVM_CMD_DSM";
     default:                    return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 966c3bb304bd..e95ff6ca9b37 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
     uint16_t    nabspf;
     uint16_t    noiob;
     uint8_t     nvmcap[16];
-    uint8_t     rsvd64[40];
+    uint16_t    npwg;
+    uint16_t    npwa;
+    uint16_t    npdg;
+    uint16_t    npda;
+    uint16_t    nows;
+    uint8_t     rsvd74[30];
     uint8_t     nguid[16];
     uint64_t    eui64;
     NvmeLBAF    lbaf[16];
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f1cc734c60f5..840651db7256 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
 #include "nvme.h"
 #include "nvme-ns.h"

-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+    BlockDriverInfo bdi;
     NvmeIdNs *id_ns = &ns->id_ns;
     int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    int npdg, ret;

     ns->id_ns.dlfeat = 0x9;

@@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;

-    /* support DULBE */
-    id_ns->nsfeat |= 0x4;
+    /* support DULBE and I/O optimization fields */
+    id_ns->nsfeat |= (0x4 | 0x10);
+
+    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not get block driver info");
+        return ret;
+    }
+
+    if (bdi.cluster_size &&
+        bdi.cluster_size > ns->blkconf.discard_granularity) {
+        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+    }
+
+    id_ns->npda = id_ns->npdg = npdg - 1;
+
+    return 0;
 }

 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }

+    if (ns->blkconf.discard_granularity == -1) {
+        ns->blkconf.discard_granularity =
+            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+    }
+
     ns->size = blk_getlength(ns->blkconf.blk);
     if (ns->size < 0) {
         error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }

-    nvme_ns_init(ns);
+    if (nvme_ns_init(ns, errp)) {
+        return -1;
+    }

     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4ab0705f5a92..7acb9e9dc38a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6 +959,103 @@ static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }

+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    int *discards = req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
+                               req->status);
+    }
+
+    if (discards && --(*discards) > 0) {
+        return;
+    }
+
+    g_free(req->opaque);
+    req->opaque = NULL;
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+    NvmeDsmRange *range = NULL;
+    int *discards = NULL;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+
+        range = g_new(NvmeDsmRange, nr);
+
+        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            goto out;
+        }
+
+        discards = g_new(int, 1);
+        *discards = 1;
+        req->opaque = discards;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(n, ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                (*discards)++;
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        if (--(*discards)) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            g_free(discards);
+            req->opaque = NULL;
+            status = req->status;
+        }
+    }
+
+out:
+    g_free(range);
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1185,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2813,7 +2912,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
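(As the commit message notes, a passthru device that already supports
discard at a 512b granularity needs the property set explicitly. Assuming
the standard block device properties are wired up on the nvme-ns device —
the patch's check against the -1 default suggests they are — that would
look something like:

    -drive file=/dev/nvme0n1,if=none,id=nvm,format=raw \
    -device nvme,serial=deadbeef \
    -device nvme-ns,drive=nvm,discard_granularity=512

Otherwise the default of MAX(logical_block_size, 4 KiB) from
nvme_ns_init_blk() applies.)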