diff mbox series

[08/17] hw/block/nvme: refactor aio submission

Message ID 20200904141956.576630-9-its@irrelevant.dk
State New
Headers show
Series hw/block/nvme: multiple namespaces support | expand

Commit Message

Klaus Jensen Sept. 4, 2020, 2:19 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

This pulls block layer aio submission to a common function an introduces
the NvmeAIO structure that encapsulates this.

This adds more code with no immediate benefit, but is in preparation for
supporting multiple aios per request.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 191 ++++++++++++++++++++++++++++++++----------
 hw/block/nvme.h       |  51 +++++++++++
 hw/block/trace-events |   3 +
 3 files changed, 203 insertions(+), 42 deletions(-)

Comments

Keith Busch Sept. 4, 2020, 7:47 p.m. UTC | #1
On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index bfac3385cb64..3e32f39c7c1d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
>  };
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);

You don't need the forward declaration here. Just move the
implementation above where it's used. It looks safe: nvme_aio_cb()
doesn't have any circular dependencies.
Klaus Jensen Sept. 4, 2020, 8:38 p.m. UTC | #2
On Sep  4 12:47, Keith Busch wrote:
> On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index bfac3385cb64..3e32f39c7c1d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> >  };
> >  
> >  static void nvme_process_sq(void *opaque);
> > +static void nvme_aio_cb(void *opaque, int ret);
> 
> You don't need the forward declaration here. Just move the
> implementation above where it's used. It looks safe: nvme_aio_cb()
> doesn't have any circular dependencies.

You are right, of course. But it is getting a circular dependency in a
later patch. I left it there to reduce code movement later.
Keith Busch Sept. 4, 2020, 9:15 p.m. UTC | #3
On Fri, Sep 04, 2020 at 10:38:39PM +0200, Klaus Jensen wrote:
> On Sep  4 12:47, Keith Busch wrote:
> > On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index bfac3385cb64..3e32f39c7c1d 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> > >  };
> > >  
> > >  static void nvme_process_sq(void *opaque);
> > > +static void nvme_aio_cb(void *opaque, int ret);
> > 
> > You don't need the forward declaration here. Just move the
> > implementation above where it's used. It looks safe: nvme_aio_cb()
> > doesn't have any circular dependencies.
> 
> You are right, of course. But it is getting a circular dependency in a
> later patch. I left it there to reduce code movement later.

Is that coming in a future patch? Not finding it in this series.

About the whole patch in general, are multiple aio's per-request coming
in later patch as well? I didn't see any use for it here, and the
overhead of dynamic allocation and zeroing a new struct in the IO path
is a bit concerning for performance. I'd like to see your intended use
for this.
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index bfac3385cb64..3e32f39c7c1d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -110,6 +110,7 @@  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 };
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static uint16_t nvme_cid(NvmeRequest *req)
 {
@@ -611,39 +612,151 @@  static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
     return NVME_SUCCESS;
 }
 
-static void nvme_rw_cb(void *opaque, int ret)
+static NvmeAIO *nvme_aio_new(NvmeAIOOp opc, BlockBackend *blk, int64_t offset)
 {
-    NvmeRequest *req = opaque;
-    NvmeSQueue *sq = req->sq;
-    NvmeCtrl *n = sq->ctrl;
-    NvmeCQueue *cq = n->cq[sq->cqid];
+    NvmeAIO *aio = g_new0(NvmeAIO, 1);
 
-    trace_pci_nvme_rw_cb(nvme_cid(req));
+    aio->opc = opc;
+    aio->blk = blk;
+    aio->offset = offset;
 
-    if (!ret) {
-        block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
-        req->status = NVME_SUCCESS;
-    } else {
-        block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
-        req->status = NVME_INTERNAL_DEV_ERROR;
+    return aio;
+}
+
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+    g_free(aio);
+}
+
+static uint16_t nvme_do_aio(NvmeAIO *aio)
+{
+    NvmeRequest *req = aio->req;
+
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &req->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    bool is_write;
+
+    switch (aio->opc) {
+    case NVME_AIO_OPC_FLUSH:
+        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+        req->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+        break;
+
+    case NVME_AIO_OPC_WRITE_ZEROES:
+        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+        req->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+                                           BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
+                                           aio);
+        break;
+
+    case NVME_AIO_OPC_READ:
+    case NVME_AIO_OPC_WRITE:
+        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+        block_acct_start(stats, acct, aio->len,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        if (aio->flags & NVME_AIO_DMA) {
+            QEMUSGList *qsg = (QEMUSGList *)aio->payload;
+
+            if (is_write) {
+                req->aiocb = dma_blk_write(blk, qsg, aio->offset,
+                                           BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            } else {
+                req->aiocb = dma_blk_read(blk, qsg, aio->offset,
+                                          BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            }
+        } else {
+            QEMUIOVector *iov = (QEMUIOVector *)aio->payload;
+
+            if (is_write) {
+                req->aiocb = blk_aio_pwritev(blk, aio->offset, iov, 0,
+                                             nvme_aio_cb, aio);
+            } else {
+                req->aiocb = blk_aio_preadv(blk, aio->offset, iov, 0,
+                                            nvme_aio_cb, aio);
+            }
+        }
+
+        break;
     }
 
-    nvme_enqueue_req_completion(cq, req);
+    return NVME_NO_COMPLETE;
+}
+
+static uint16_t nvme_aio_add(NvmeRequest *req, NvmeAIO *aio)
+{
+    aio->req = req;
+
+    trace_pci_nvme_aio_add(nvme_cid(req), aio, blk_name(aio->blk),
+                           aio->offset, aio->len, nvme_aio_opc_str(aio),
+                           req);
+
+    return nvme_do_aio(aio);
+}
+
+static void nvme_aio_cb(void *opaque, int ret)
+{
+    NvmeAIO *aio = opaque;
+    NvmeRequest *req = aio->req;
+
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &req->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    Error *local_err = NULL;
+
+    trace_pci_nvme_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset,
+                          aio->len, nvme_aio_opc_str(aio), req);
+
+    if (!ret) {
+        block_acct_done(stats, acct);
+        req->status = NVME_SUCCESS;
+    } else {
+        uint16_t status;
+
+        block_acct_failed(stats, acct);
+
+        switch (aio->opc) {
+        case NVME_AIO_OPC_READ:
+            status = NVME_UNRECOVERED_READ;
+            break;
+        case NVME_AIO_OPC_FLUSH:
+        case NVME_AIO_OPC_WRITE:
+        case NVME_AIO_OPC_WRITE_ZEROES:
+            status = NVME_WRITE_FAULT;
+            break;
+        default:
+            status = NVME_INTERNAL_DEV_ERROR;
+            break;
+        }
+
+        trace_pci_nvme_err_aio(nvme_cid(req), aio, blk_name(blk),
+                               aio->offset, nvme_aio_opc_str(aio), req,
+                               status);
+
+        error_setg_errno(&local_err, -ret, "aio failed");
+        error_report_err(local_err);
+
+        req->status = status;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+    nvme_aio_destroy(aio);
 }
 
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
-    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
-                     BLOCK_ACCT_FLUSH);
-    req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
-
-    return NVME_NO_COMPLETE;
+    return nvme_aio_add(req, nvme_aio_new(NVME_AIO_OPC_FLUSH, n->conf.blk, 0));
 }
 
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
+    NvmeAIO *aio;
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t offset = nvme_l2b(ns, slba);
@@ -658,24 +771,23 @@  static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
         return status;
     }
 
-    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
-                     BLOCK_ACCT_WRITE);
-    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
-                                       BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-    return NVME_NO_COMPLETE;
+    aio = nvme_aio_new(NVME_AIO_OPC_WRITE_ZEROES, n->conf.blk, offset);
+    aio->len = count;
+
+    return nvme_aio_add(req, aio);
 }
 
 static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
+    NvmeAIO *aio;
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t slba = le64_to_cpu(rw->slba);
 
     uint64_t data_size = nvme_l2b(ns, nlb);
     uint64_t data_offset = nvme_l2b(ns, slba);
-    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
-    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+    bool is_write = nvme_req_is_write(req);
     uint16_t status;
 
     trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb,
@@ -698,28 +810,23 @@  static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    if (req->qsg.nsg > 0) {
-        block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
-                         acct);
-        req->aiocb = is_write ?
-            dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
-                          nvme_rw_cb, req) :
-            dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
-                         nvme_rw_cb, req);
+    aio = nvme_aio_new(is_write ? NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ,
+                       n->conf.blk, data_offset);
+
+    if (req->qsg.sg) {
+        aio->payload = &req->qsg;
+        aio->len = req->qsg.size;
+        aio->flags |= NVME_AIO_DMA;
     } else {
-        block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
-                         acct);
-        req->aiocb = is_write ?
-            blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                            req) :
-            blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                           req);
+        aio->payload = &req->iov;
+        aio->len = req->iov.size;
     }
 
-    return NVME_NO_COMPLETE;
+    return nvme_aio_add(req, aio);
 
 invalid:
-    block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+    block_acct_invalid(blk_get_stats(n->conf.blk),
+                       is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
     return status;
 }
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index ce9e931420d7..7a11b0b37317 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -32,6 +32,17 @@  typedef struct NvmeRequest {
     QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
 
+static inline bool nvme_req_is_write(NvmeRequest *req)
+{
+    switch (req->cmd.opcode) {
+    case NVME_CMD_WRITE:
+    case NVME_CMD_WRITE_ZEROES:
+        return true;
+    default:
+        return false;
+    }
+}
+
 static inline const char *nvme_adm_opc_str(uint8_t opc)
 {
     switch (opc) {
@@ -60,6 +71,38 @@  static inline const char *nvme_io_opc_str(uint8_t opc)
     }
 }
 
+typedef enum NvmeAIOOp {
+    NVME_AIO_OPC_FLUSH        = 0x1,
+    NVME_AIO_OPC_READ         = 0x2,
+    NVME_AIO_OPC_WRITE        = 0x3,
+    NVME_AIO_OPC_WRITE_ZEROES = 0x4,
+} NvmeAIOOp;
+
+typedef enum NvmeAIOFlags {
+    NVME_AIO_DMA = 1 << 0,
+} NvmeAIOFlags;
+
+typedef struct NvmeAIO {
+    NvmeAIOOp       opc;
+    NvmeRequest     *req;
+    BlockBackend    *blk;
+    int64_t         offset;
+    size_t          len;
+    int             flags;
+    void            *payload;
+} NvmeAIO;
+
+static inline const char *nvme_aio_opc_str(NvmeAIO *aio)
+{
+    switch (aio->opc) {
+    case NVME_AIO_OPC_FLUSH:        return "NVME_AIO_OPC_FLUSH";
+    case NVME_AIO_OPC_READ:         return "NVME_AIO_OPC_READ";
+    case NVME_AIO_OPC_WRITE:        return "NVME_AIO_OPC_WRITE";
+    case NVME_AIO_OPC_WRITE_ZEROES: return "NVME_AIO_OPC_WRITE_ZEROES";
+    default:                        return "NVME_AIO_OPC_UNKNOWN";
+    }
+}
+
 typedef struct NvmeSQueue {
     struct NvmeCtrl *ctrl;
     uint16_t    sqid;
@@ -171,4 +214,12 @@  static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
     return n->ns_size >> nvme_ns_lbads(ns);
 }
 
+static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
+{
+    NvmeSQueue *sq = req->sq;
+    NvmeCtrl *n = sq->ctrl;
+
+    return n->cq[sq->cqid];
+}
+
 #endif /* HW_NVME_H */
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 0823d0fb47c5..fb3bf1be5e07 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -39,6 +39,8 @@  pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2,
 pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname \"%s\""
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname \"%s\""
 pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" \"%s\" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_aio_add(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t len, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" len %"PRIu64" opc \"%s\" req %p"
+pci_nvme_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t len, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" len %"PRIu64" opc \"%s\" req %p"
 pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
@@ -88,6 +90,7 @@  pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
 pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
+pci_nvme_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
 pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""