| Message ID | 20201021221736.100779-3-its@irrelevant.dk |
|---|---|
| State | New |
| Series | hw/block/nvme: dulbe and dsm support |

On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {
> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;
> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new0(int, 1);
> +        req->opaque = discards;

I think you need to initialize discards to 1 so that this function is
holding a reference on it while the asynchronous part is running.
Otherwise, the callback may see 'discards' at 0 and free it while we're
trying to discard the next range.

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                (*discards)++;

The increment ought to be before the aio call so that the _cb can't see
the value before it's accounted for.

> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (*discards) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}
> +
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
>      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, req);
> +    case NVME_CMD_DSM:
> +        return nvme_dsm(n, req);
>      default:
>          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> -                           NVME_ONCS_FEATURES);
> +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);

I think we should set ID_NS.NDPG and NDPA since there really is a
preferred granularity and alignment.

> 
>      id->vwc = 0x1;
>      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> --
> 2.28.0
> 
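
For illustration, here is a small standalone C program (not QEMU code; every name in it is made up for the example) that models the reference-counting pattern suggested above: the submitter starts the counter at 1 to hold its own reference, bumps the counter before issuing each asynchronous discard, and drops its own reference only after the submission loop, so a callback that completes immediately can never free the counter while ranges are still being submitted.

```c
#include <stdio.h>
#include <stdlib.h>

struct request {
    int *discards;   /* outstanding discards plus one reference for the submitter */
    int  completed;
};

static void complete_request(struct request *req)
{
    free(req->discards);
    req->discards = NULL;
    req->completed = 1;
    printf("request completed\n");
}

/* Models nvme_aio_discard_cb(): runs once per discard, possibly immediately. */
static void discard_cb(struct request *req)
{
    if (--(*req->discards) > 0) {
        return;
    }
    complete_request(req);
}

/* Models blk_aio_pdiscard() completing synchronously (the worst case). */
static void submit_discard(struct request *req)
{
    discard_cb(req);
}

int main(void)
{
    struct request req = { .discards = calloc(1, sizeof(int)) };
    int nr_ranges = 3;

    *req.discards = 1;                /* the submitter's own reference */

    for (int i = 0; i < nr_ranges; i++) {
        (*req.discards)++;            /* account for the op before issuing it */
        submit_discard(&req);         /* may "complete" before this returns */
    }

    if (--(*req.discards) == 0) {     /* drop the submitter's reference last */
        complete_request(&req);
    }

    return req.completed ? 0 : 1;
}
```

Without the initial reference, the first immediately-completing callback would drop the counter to zero and free it while the loop is still submitting the remaining ranges, which is exactly the use-after-free described above.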

On Oct 21 17:59, Keith Busch wrote:
> On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> > +static void nvme_aio_discard_cb(void *opaque, int ret)
> > +{
> > +    NvmeRequest *req = opaque;
> > +    int *discards = req->opaque;
> > +
> > +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> > +
> > +    if (ret) {
> > +        req->status = NVME_INTERNAL_DEV_ERROR;
> > +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> > +                               req->status);
> > +    }
> > +
> > +    if (discards && --(*discards) > 0) {
> > +        return;
> > +    }
> > +
> > +    g_free(req->opaque);
> > +    req->opaque = NULL;
> > +
> > +    nvme_enqueue_req_completion(nvme_cq(req), req);
> > +}
> > +
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > +    NvmeDsmRange *range = NULL;
> > +    int *discards = NULL;
> > +
> > +    uint32_t attr = le32_to_cpu(dsm->attributes);
> > +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > +
> > +    uint16_t status = NVME_SUCCESS;
> > +
> > +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> > +
> > +    if (attr & NVME_DSMGMT_AD) {
> > +        int64_t offset;
> > +        size_t len;
> > +
> > +        range = g_new(NvmeDsmRange, nr);
> > +
> > +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> > +                          DMA_DIRECTION_TO_DEVICE, req);
> > +        if (status) {
> > +            goto out;
> > +        }
> > +
> > +        discards = g_new0(int, 1);
> > +        req->opaque = discards;
> 
> I think you need to initialize discards to 1 so that this function is
> holding a reference on it while the asynchronous part is running.
> Otherwise, the callback may see 'discards' at 0 and free it while we're
> trying to discard the next range.
> 

Yes - you are right. The callback might actually get called immediately
in some cases.

> > +
> > +        for (int i = 0; i < nr; i++) {
> > +            uint64_t slba = le64_to_cpu(range[i].slba);
> > +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> > +
> > +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> > +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> > +                                                     ns->id_ns.nsze);
> > +                continue;
> > +            }
> > +
> > +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> > +                                          nlb);
> > +
> > +            offset = nvme_l2b(ns, slba);
> > +            len = nvme_l2b(ns, nlb);
> > +
> > +            while (len) {
> > +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> > +
> > +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> > +                                 nvme_aio_discard_cb, req);
> > +
> > +                (*discards)++;
> 
> The increment ought to be before the aio call so that the _cb can't see
> the value before it's accounted for.
> 

Right. Same issue as above.

> > +
> > +                offset += bytes;
> > +                len -= bytes;
> > +            }
> > +        }
> > +
> > +        if (*discards) {

And then we decrement here before checking for 0.

Thanks, looks better now.

> > +            status = NVME_NO_COMPLETE;
> > +        } else {
> > +            g_free(discards);
> > +            req->opaque = NULL;
> > +        }
> > +    }
> > +
> > +out:
> > +    g_free(range);
> > +
> > +    return status;
> > +}
> > +
> >  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> > @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
> >      case NVME_CMD_WRITE:
> >      case NVME_CMD_READ:
> >          return nvme_rw(n, req);
> > +    case NVME_CMD_DSM:
> > +        return nvme_dsm(n, req);
> >      default:
> >          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
> >          return NVME_INVALID_OPCODE | NVME_DNR;
> > @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> >      id->cqes = (0x4 << 4) | 0x4;
> >      id->nn = cpu_to_le32(n->num_namespaces);
> >      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> > -                           NVME_ONCS_FEATURES);
> > +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
> 
> I think we should set ID_NS.NDPG and NDPA since there really is a
> preferred granularity and alignment.
> 

Yeah. I had some issues with doing this reliably. But I think I nailed
it, see my v4.

> > 
> >      id->vwc = 0x1;
> >      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> > --
> > 2.28.0
> > 

-- 
One of us - No more doubt, silence or taboo about mental illness.
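
On the NDPG/NDPA point, a rough sketch of how a preferred deallocate granularity and alignment could be derived from the backing image is shown below. This is a hypothetical helper, not the actual v4 patch; it assumes NvmeIdNs already carries the NVMe 1.4 npdg/npda fields and that the namespace exposes a discard_granularity block property.

```c
/*
 * Hypothetical sketch only (not the v4 code). Assumes NvmeIdNs has the
 * NVMe 1.4 npdg/npda fields and blkconf.discard_granularity is valid.
 */
static void nvme_ns_init_dealloc_hint(NvmeNamespace *ns)
{
    NvmeIdNs *id_ns = &ns->id_ns;
    BlockDriverInfo bdi;
    uint32_t lbasz = ns->blkconf.logical_block_size;
    uint32_t npdg = ns->blkconf.discard_granularity / lbasz;

    /*
     * If the image format has a coarser allocation unit (e.g. a qcow2
     * cluster), prefer that as the deallocation granularity.
     */
    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
        bdi.cluster_size > ns->blkconf.discard_granularity) {
        npdg = bdi.cluster_size / lbasz;
    }

    /* NPDG and NPDA are 0's based. */
    id_ns->npdg = id_ns->npda = cpu_to_le16(npdg - 1);
}
```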

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1758cfed965c..a6dd8ae8e220 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6 +959,101 @@ static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    int *discards = req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
+                               req->status);
+    }
+
+    if (discards && --(*discards) > 0) {
+        return;
+    }
+
+    g_free(req->opaque);
+    req->opaque = NULL;
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+    NvmeDsmRange *range = NULL;
+    int *discards = NULL;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+
+        range = g_new(NvmeDsmRange, nr);
+
+        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            goto out;
+        }
+
+        discards = g_new0(int, 1);
+        req->opaque = discards;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(n, ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                (*discards)++;
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        if (*discards) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            g_free(discards);
+            req->opaque = NULL;
+        }
+    }
+
+out:
+    g_free(range);
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |