Message ID | 20210427093110.16461-1-mwilck@suse.com |
---|---|
State | Superseded |
Headers | show |
Series | [v4] nvme: rdma/tcp: fix list corruption with anatt timer | expand |
Martin, can you give this patch a spin and check if this solves your issue? diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0d0de3433f37..68f4d9d0ce58 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -780,6 +780,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; + size_t ana_log_size; int error; /* check if multipath is enabled and we have the capability */ @@ -787,47 +789,45 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0; + if (!ctrl->identified) { + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + } + ctrl->anacap = id->anacap; ctrl->anatt = id->anatt; ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); - mutex_init(&ctrl->ana_lock); - timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); - ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + - ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); - ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); - - if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + + ctrl->max_namespaces * sizeof(__le32); + if (ana_log_size > max_transfer_size) { dev_err(ctrl->device, - "ANA log page size (%zd) larger than MDTS (%d).\n", - ctrl->ana_log_size, - ctrl->max_hw_sectors << SECTOR_SHIFT); + "ANA log page size (%zd) larger than MDTS (%zd).\n", + ana_log_size, max_transfer_size); dev_err(ctrl->device, "disabling ANA support.\n"); return 0; } - INIT_WORK(&ctrl->ana_work, nvme_ana_work); - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); - if 
(!ctrl->ana_log_buf) { - error = -ENOMEM; - goto out; + if (ana_log_size > ctrl->ana_log_size) { + nvme_mpath_uninit(ctrl); + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + return -ENOMEM; + ctrl->ana_log_size = ana_log_size; } error = nvme_read_ana_log(ctrl); if (error) - goto out_free_ana_log_buf; - return 0; -out_free_ana_log_buf: - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = NULL; -out: + nvme_mpath_uninit(ctrl); return error; } void nvme_mpath_uninit(struct nvme_ctrl *ctrl) { + nvme_mpath_stop(ctrl); kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; }
On Thu, Apr 29, 2021 at 02:24:33PM +0200, Christoph Hellwig wrote: > Martin, > > can you give this patch a spin and check if this solves your issue? ping? > > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c > index 0d0de3433f37..68f4d9d0ce58 100644 > --- a/drivers/nvme/host/multipath.c > +++ b/drivers/nvme/host/multipath.c > @@ -780,6 +780,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) > > int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) > { > + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; > + size_t ana_log_size; > int error; > > /* check if multipath is enabled and we have the capability */ > @@ -787,47 +789,45 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) > !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) > return 0; > > + if (!ctrl->identified) { > + mutex_init(&ctrl->ana_lock); > + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); > + INIT_WORK(&ctrl->ana_work, nvme_ana_work); > + } > + > ctrl->anacap = id->anacap; > ctrl->anatt = id->anatt; > ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); > ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); > > - mutex_init(&ctrl->ana_lock); > - timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); > - ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + > - ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); > - ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); > - > - if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { > + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + > + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + > + ctrl->max_namespaces * sizeof(__le32); > + if (ana_log_size > max_transfer_size) { > dev_err(ctrl->device, > - "ANA log page size (%zd) larger than MDTS (%d).\n", > - ctrl->ana_log_size, > - ctrl->max_hw_sectors << SECTOR_SHIFT); > + "ANA log page size (%zd) larger than MDTS (%zd).\n", > + ana_log_size, max_transfer_size); > dev_err(ctrl->device, "disabling ANA 
support.\n"); > return 0; > } > > - INIT_WORK(&ctrl->ana_work, nvme_ana_work); > - kfree(ctrl->ana_log_buf); > - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); > - if (!ctrl->ana_log_buf) { > - error = -ENOMEM; > - goto out; > + if (ana_log_size > ctrl->ana_log_size) { > + nvme_mpath_uninit(ctrl); > + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); > + if (!ctrl->ana_log_buf) > + return -ENOMEM; > + ctrl->ana_log_size = ana_log_size; > } > > error = nvme_read_ana_log(ctrl); > if (error) > - goto out_free_ana_log_buf; > - return 0; > -out_free_ana_log_buf: > - kfree(ctrl->ana_log_buf); > - ctrl->ana_log_buf = NULL; > -out: > + nvme_mpath_uninit(ctrl); > return error; > } > > void nvme_mpath_uninit(struct nvme_ctrl *ctrl) > { > + nvme_mpath_stop(ctrl); > kfree(ctrl->ana_log_buf); > ctrl->ana_log_buf = NULL; > } ---end quoted text---
Hello Christoph, On Tue, 2021-05-04 at 09:52 +0200, Christoph Hellwig wrote: > On Thu, Apr 29, 2021 at 02:24:33PM +0200, Christoph Hellwig wrote: > > Martin, > > > > can you give this patch a spin and check if this solves your issue? > > ping? I've provided a test kernel with your patch to the SUSE partner in question, but the issue is hard to reproduce, so the test will take time. Regards Martin
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a1d476e1ac02..c63dd5dfa7ff 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -586,6 +586,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) del_timer_sync(&ctrl->anatt_timer); cancel_work_sync(&ctrl->ana_work); } +EXPORT_SYMBOL_GPL(nvme_mpath_stop); #define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ struct device_attribute subsys_attr_##_name = \ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index be905d4fdb47..fc07a7b0dc1d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1202,6 +1202,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) return; } + nvme_mpath_stop(&ctrl->ctrl); nvme_rdma_reconnect_or_remove(ctrl); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a0f00cb8f9f3..46287b4f4d10 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2068,6 +2068,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) return; } + nvme_mpath_stop(ctrl); nvme_tcp_reconnect_or_remove(ctrl); }