@@ -103,6 +103,8 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
* @dev_data: Device data passed to get_queue_affinity()
* @fallback: If true, fallback to default blk-mq mapping in case of
* any failure
+ * @managed_irq: Pass true if the driver is likely to use managed irqs,
+ * false otherwise.
*
* Generic function to setup each queue mapping in @qmap. It will query
* each queue's affinity via @get_queue_affinity and built queue mapping
@@ -113,7 +115,7 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
*/
int blk_mq_dev_map_queues(struct blk_mq_queue_map *qmap, void *dev_data,
int dev_off, get_queue_affinty_fn *get_queue_affinity,
- bool fallback)
+ bool fallback, bool managed_irq)
{
const struct cpumask *mask;
unsigned int queue, cpu;
@@ -136,6 +138,8 @@ int blk_mq_dev_map_queues(struct blk_mq_queue_map *qmap, void *dev_data,
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
+ qmap->use_managed_irq = managed_irq;
+
return 0;
fallback:
@@ -192,7 +192,8 @@ struct blk_mq_hw_ctx {
struct blk_mq_queue_map {
unsigned int *mq_map;
unsigned int nr_queues;
- unsigned int queue_offset;
+ unsigned int queue_offset:31;
+ unsigned int use_managed_irq:1;
};
/**
@@ -558,7 +559,7 @@ typedef const struct cpumask * (get_queue_affinty_fn)(void *dev_data,
int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
int blk_mq_dev_map_queues(struct blk_mq_queue_map *qmap, void *dev_data,
int dev_off, get_queue_affinty_fn *get_queue_affinity,
- bool fallback);
+ bool fallback, bool managed_irq);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
void blk_mq_quiesce_queue_nowait(struct request_queue *q);
A managed irq is special because the genirq core shuts it down when all CPUs in its affinity mask are offline, so blk-mq has to drain requests and prevent new allocations on the hw queue before its managed irq is shut down. In the current implementation, we drain every hctx when the last CPU in hctx->cpumask is about to go offline. However, we need to avoid draining hw queues that don't use managed irqs. One such user is nvme fc/rdma/tcp, because these controllers require that a connection request can be submitted successfully even when all CPUs in hctx->cpumask are offline. We also have many kernel panic reports in blk_mq_alloc_request_hctx(). Once we know whether a qmap uses managed irqs, we no longer need to drain requests for an hctx that doesn't use a managed irq, and we can allow requests to be allocated on an hctx in which all CPUs in hctx->cpumask are offline. This not only fixes the kernel panic in blk_mq_alloc_request_hctx(), but also meets the requirement of nvme fc/rdma/tcp. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- block/blk-mq-map.c | 6 +++++- include/linux/blk-mq.h | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-)