@@ -12,6 +12,13 @@ nvme-ns Options
namespace. It is specified in terms of a power of two. Only values between
9 and 12 (both inclusive) are supported.
+ `pstate`; This parameter specifies another blockdev to be used for storing
+ persistent state such as logical block allocation tracking. Adding this
+ parameter enables optional features of the device that depend on that
+ state, such as Deallocated or Unwritten Logical Block Error (DULBE)
+ reporting.
+
+ -drive id=pstate,file=pstate.img,format=raw
+ -device nvme-ns,pstate=pstate,...
+
Reference Specifications
------------------------
@@ -31,7 +31,20 @@ typedef struct NvmeNamespace {
int64_t size;
NvmeIdNs id_ns;
+ struct {
+ BlockBackend *blk;
+
+ struct {
+ unsigned long *map;
+ int64_t offset;
+ } utilization;
+ } pstate;
+
NvmeNamespaceParams params;
+
+ struct {
+ uint32_t err_rec;
+ } features;
} NvmeNamespace;
static inline uint32_t nvme_nsid(NvmeNamespace *ns)
@@ -683,6 +683,7 @@ enum NvmeStatusCodes {
NVME_E2E_REF_ERROR = 0x0284,
NVME_CMP_FAILURE = 0x0285,
NVME_ACCESS_DENIED = 0x0286,
+ NVME_DULB = 0x0287,
NVME_MORE = 0x2000,
NVME_DNR = 0x4000,
NVME_NO_COMPLETE = 0xffff,
@@ -898,6 +899,9 @@ enum NvmeIdCtrlLpa {
#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1)
#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
+/*
+ * Accessors for the Error Recovery feature value (as kept in
+ * NvmeNamespace.features.err_rec): the low 16 bits hold TLER and bit 16 is
+ * the DULBE enable bit. The argument is parenthesized so that expression
+ * arguments (e.g. "a | b") are masked as a whole.
+ */
+#define NVME_ERR_REC_TLER(err_rec) ((err_rec) & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) ((err_rec) & 0x10000)
+
enum NvmeFeatureIds {
NVME_ARBITRATION = 0x1,
NVME_POWER_MANAGEMENT = 0x2,
@@ -1018,6 +1022,7 @@ enum NvmeNsIdentifierType {
#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
+/* NSFEAT bit 2: Deallocated or Unwritten Logical Block Error supported */
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
@@ -25,9 +25,36 @@
#include "hw/qdev-properties.h"
#include "hw/qdev-core.h"
+#include "trace.h"
+
#include "nvme.h"
#include "nvme-ns.h"
+/*
+ * Resize @blk to @len bytes.
+ *
+ * BLK_PERM_RESIZE is added to the permissions of @blk only for the duration
+ * of the truncate; the permissions that were in effect on entry are put back
+ * afterwards, also when the truncate itself fails.
+ *
+ * Returns 0 on success and a negative value on failure; errors are reported
+ * through @errp.
+ */
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+    int ret;
+    uint64_t perm, shared_perm;
+
+    blk_get_perm(blk, &perm, &shared_perm);
+
+    ret = blk_set_perm(blk, perm | BLK_PERM_RESIZE, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+    if (ret < 0) {
+        /*
+         * Best effort: @errp already describes the truncate failure, so an
+         * error from dropping the extra permission is deliberately ignored.
+         */
+        blk_set_perm(blk, perm, shared_perm, NULL);
+        return ret;
+    }
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
static void nvme_ns_init(NvmeNamespace *ns)
{
NvmeIdNs *id_ns = &ns->id_ns;
@@ -45,6 +72,67 @@ static void nvme_ns_init(NvmeNamespace *ns)
id_ns->nuse = id_ns->ncap;
}
+/*
+ * Set up the persistent state ("pstate") backend of @ns.
+ *
+ * The pstate image holds the logical block utilization bitmap (one bit per
+ * logical block, little-endian on disk), padded to a multiple of
+ * BDRV_SECTOR_SIZE. A zero-length image is grown to the required size and the
+ * namespace starts out with an all-zero bitmap. An image of any other size
+ * must match the expected size exactly and is loaded as the initial bitmap.
+ *
+ * Returns 0 on success and a non-zero value on failure; errors are reported
+ * through @errp.
+ */
+static int nvme_ns_setup_blk_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->pstate.blk;
+    uint64_t perm, shared_perm;
+    unsigned long *map, *le_map;
+    ssize_t len;
+    size_t pstate_len;
+    long nbits;
+    int ret;
+
+    perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+    shared_perm = BLK_PERM_ALL;
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret) {
+        return ret;
+    }
+
+    pstate_len = ROUND_UP(DIV_ROUND_UP(nvme_ns_nlbas(ns), 8),
+                          BDRV_SECTOR_SIZE);
+
+    len = blk_getlength(blk);
+    if (len < 0) {
+        error_setg_errno(errp, -len, "could not determine pstate size");
+        return len;
+    }
+
+    ns->pstate.utilization.offset = 0;
+
+    /*
+     * The in-memory bitmap covers the whole padded image, not just the
+     * number of logical blocks: pstate_len bytes are read into it below, so
+     * a bitmap sized for the block count alone could be overrun.
+     */
+    nbits = pstate_len * 8;
+
+    if (!len) {
+        ret = nvme_blk_truncate(blk, pstate_len, errp);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ns->pstate.utilization.map = bitmap_new(nbits);
+        return 0;
+    }
+
+    if ((size_t)len != pstate_len) {
+        error_setg(errp, "pstate size mismatch "
+                   "(expected %zu bytes; was %zd bytes)",
+                   pstate_len, len);
+        return -1;
+    }
+
+    /* read the raw image into a bounce buffer, then convert to host order */
+    le_map = bitmap_new(nbits);
+
+    ret = blk_pread(blk, 0, le_map, pstate_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read pstate");
+        g_free(le_map);
+        return ret;
+    }
+
+    map = bitmap_new(nbits);
+    bitmap_from_le(map, le_map, nbits);
+    g_free(le_map);
+
+    ns->pstate.utilization.map = map;
+
+    return 0;
+}
+
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -96,6 +184,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
}
nvme_ns_init(ns);
+
+ if (ns->pstate.blk) {
+ if (nvme_ns_setup_blk_pstate(ns, errp)) {
+ return -1;
+ }
+
+ /*
+ * With a pstate file in place we can enable the Deallocated or
+ * Unwritten Logical Block Error feature.
+ */
+ ns->id_ns.nsfeat |= 0x4;
+ }
+
if (nvme_register_namespace(n, ns, errp)) {
return -1;
}
@@ -106,11 +207,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+/*
+ * Drain the block backend holding the namespace data and, if a pstate
+ * backend is configured for the namespace, that backend as well.
+ */
void nvme_ns_drain(NvmeNamespace *ns)
{
blk_drain(ns->blkconf.blk);
+
+ /* the pstate backend is optional; only touch it when it was configured */
+ if (ns->pstate.blk) {
+ blk_drain(ns->pstate.blk);
+ }
}
+/*
+ * Flush the block backend holding the namespace data and, if a pstate
+ * backend is configured for the namespace, that backend as well.
+ */
void nvme_ns_flush(NvmeNamespace *ns)
{
blk_flush(ns->blkconf.blk);
+
+ /* the pstate backend is optional; only touch it when it was configured */
+ if (ns->pstate.blk) {
+ blk_flush(ns->pstate.blk);
+ }
}
static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -131,6 +240,7 @@ static Property nvme_ns_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
DEFINE_PROP_UINT8("lbads", NvmeNamespace, params.lbads, BDRV_SECTOR_BITS),
+ DEFINE_PROP_DRIVE("pstate", NvmeNamespace, pstate.blk),
DEFINE_PROP_END_OF_LIST(),
};
@@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
[NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
+ [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
[NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
[NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
[NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
@@ -888,6 +889,61 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
return NVME_SUCCESS;
}
+/*
+ * Check whether any logical block in [@slba, @slba + @nlb) has its bit
+ * cleared in the utilization bitmap of @ns.
+ *
+ * Returns NVME_DULB if at least one such block exists and NVME_SUCCESS
+ * otherwise. A namespace without a utilization bitmap (no pstate configured)
+ * has no allocation information to check against; NVME_SUCCESS is returned
+ * for it instead of dereferencing a NULL bitmap.
+ */
+static inline uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+                                        uint32_t nlb)
+{
+    uint64_t elba = slba + nlb;
+
+    if (!ns->pstate.utilization.map) {
+        return NVME_SUCCESS;
+    }
+
+    /* search [slba, elba) for the first cleared bit */
+    if (find_next_zero_bit(ns->pstate.utilization.map, elba, slba) < elba) {
+        return NVME_DULB;
+    }
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Mark the logical blocks [@slba, @slba + @nlb) as allocated in the
+ * utilization bitmap of @ns and write the affected part of the bitmap to the
+ * pstate backend.
+ *
+ * Nothing is done when the namespace has no pstate backend or when every
+ * block in the range is already marked as allocated.
+ *
+ * Returns 0 when there was nothing to do, otherwise the result of
+ * blk_pwrite(); callers treat a negative value as failure.
+ */
+static int nvme_allocate(NvmeNamespace *ns, uint64_t slba, uint32_t nlb)
+{
+    uint64_t first, last;
+    size_t nlongs;
+    int64_t offset;
+    unsigned long *map, *src;
+    int ret;
+
+    if (!(ns->pstate.blk && nvme_check_dulbe(ns, slba, nlb))) {
+        return 0;
+    }
+
+    trace_pci_nvme_allocate(nvme_nsid(ns), slba, nlb);
+
+    bitmap_set(ns->pstate.utilization.map, slba, nlb);
+
+    /*
+     * Write back exactly the longs that hold bits of the range, so that
+     * neither the in-memory bitmap nor the pstate image is accessed past its
+     * end. nlb is non-zero here because nvme_check_dulbe() found a cleared
+     * bit inside the range.
+     */
+    first = slba / BITS_PER_LONG;
+    last = (slba + nlb - 1) / BITS_PER_LONG;
+    nlongs = last - first + 1;
+    offset = ns->pstate.utilization.offset + first * sizeof(unsigned long);
+    src = &ns->pstate.utilization.map[first];
+
+#ifdef HOST_WORDS_BIGENDIAN
+    /* the on-disk bitmap is little-endian; byte swap into a bounce buffer */
+    map = g_new(unsigned long, nlongs);
+    for (size_t i = 0; i < nlongs; i++) {
+# if HOST_LONG_BITS == 64
+        map[i] = bswap64(src[i]);
+# else
+        map[i] = bswap32(src[i]);
+# endif
+    }
+#else
+    map = src;
+#endif
+
+    ret = blk_pwrite(ns->pstate.blk, offset, map,
+                     nlongs * sizeof(unsigned long), 0);
+
+#ifdef HOST_WORDS_BIGENDIAN
+    g_free(map);
+#endif
+    return ret;
+}
+
+
static void nvme_rw_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
@@ -1006,6 +1062,7 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
size_t len = nvme_l2b(ns, nlb);
+ bool is_write = nvme_req_is_write(req);
uint16_t status;
trace_pci_nvme_rwz(nvme_cid(req), nvme_io_opc_str(rw->opcode),
@@ -1017,6 +1074,16 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
goto invalid;
}
+ if (!is_write) {
+ if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+ status = nvme_check_dulbe(ns, slba, nlb);
+ if (status) {
+ trace_pci_nvme_err_dulbe(nvme_cid(req), slba, nlb);
+ goto invalid;
+ }
+ }
+ }
+
if (req->cmd.opcode & NVME_CMD_OPCODE_DATA_TRANSFER_MASK) {
status = nvme_check_mdts(n, len);
if (status) {
@@ -1030,12 +1097,18 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
}
}
+ if (is_write) {
+ if (nvme_allocate(ns, slba, nlb) < 0) {
+ status = NVME_INTERNAL_DEV_ERROR;
+ goto invalid;
+ }
+ }
+
return nvme_do_aio(ns->blkconf.blk, nvme_l2b(ns, slba), len, req);
invalid:
block_acct_invalid(blk_get_stats(ns->blkconf.blk),
- nvme_req_is_write(req) ? BLOCK_ACCT_WRITE :
- BLOCK_ACCT_READ);
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
return status;
}
@@ -1638,6 +1711,8 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
+ NvmeNamespace *ns;
+
NvmeCmd *cmd = &req->cmd;
uint32_t dw10 = le32_to_cpu(cmd->cdw10);
uint32_t dw11 = le32_to_cpu(cmd->cdw11);
@@ -1708,6 +1783,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
}
return NVME_INVALID_FIELD | NVME_DNR;
+ case NVME_ERROR_RECOVERY:
+ if (!nvme_nsid_valid(n, nsid)) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ result = ns->features.err_rec;
+ goto out;
case NVME_VOLATILE_WRITE_CACHE:
result = n->features.vwc;
trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
@@ -1780,7 +1867,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeNamespace *ns;
+ NvmeNamespace *ns = NULL;
NvmeCmd *cmd = &req->cmd;
uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -1847,6 +1934,26 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
NVME_LOG_SMART_INFO);
}
+ break;
+ case NVME_ERROR_RECOVERY:
+ if (nsid == NVME_NSID_BROADCAST) {
+ for (int i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+
+ if (!ns) {
+ continue;
+ }
+
+ if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+ ns->features.err_rec = dw11;
+ }
+ }
+
+ break;
+ }
+
+ assert(ns);
+ ns->features.err_rec = dw11;
break;
case NVME_VOLATILE_WRITE_CACHE:
n->features.vwc = dw11 & 0x1;
@@ -42,6 +42,7 @@ pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, cons
pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_rwz(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t len, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" len %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
+pci_nvme_allocate(uint32_t ns, uint64_t slba, uint32_t nlb) "nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_do_aio(uint16_t cid, uint8_t opc, const char *opname, const char *blkname, int64_t offset, size_t len) "cid %"PRIu16" opc 0x%"PRIx8" opname '%s' blk '%s' offset %"PRId64" len %zu"
pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
@@ -89,6 +90,7 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
# nvme traces for error conditions
pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
pci_nvme_err_req_status(uint16_t cid, uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8""
+pci_nvme_err_dulbe(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_err_cfs(void) "controller fatal status"