diff mbox series

[v4,1/9] lib/group_cpus: let group_cpu_evenly return number of groups

Message ID 20241217-isolcpus-io-queues-v4-1-5d355fbb1e14@kernel.org
State New
Headers show
Series blk: honor isolcpus configuration | expand

Commit Message

Daniel Wagner Dec. 17, 2024, 6:29 p.m. UTC
group_cpu_evenly might allocated less groups then the requested:

group_cpu_evenly
  __group_cpus_evenly
    alloc_nodes_groups
      # allocated total groups may be less than numgrps when
      # active total CPU number is less then numgrps

In this case, the caller will do an out of bound access because the
caller assumes the masks returned has numgrps.

Return the number of groups created so the caller can limit the access
range accordingly.

Signed-off-by: Daniel Wagner <wagi@kernel.org>
---
 block/blk-mq-cpumap.c        |  7 ++++---
 drivers/virtio/virtio_vdpa.c |  2 +-
 fs/fuse/virtio_fs.c          |  7 ++++---
 include/linux/group_cpus.h   |  2 +-
 kernel/irq/affinity.c        |  2 +-
 lib/group_cpus.c             | 23 +++++++++++++----------
 6 files changed, 24 insertions(+), 19 deletions(-)

Comments

Hannes Reinecke Jan. 7, 2025, 7:51 a.m. UTC | #1
On 12/17/24 19:29, Daniel Wagner wrote:
> group_cpu_evenly might allocated less groups then the requested:
> 
> group_cpu_evenly
>    __group_cpus_evenly
>      alloc_nodes_groups
>        # allocated total groups may be less than numgrps when
>        # active total CPU number is less then numgrps
> 
> In this case, the caller will do an out of bound access because the
> caller assumes the masks returned has numgrps.
> 
> Return the number of groups created so the caller can limit the access
> range accordingly.
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>   block/blk-mq-cpumap.c        |  7 ++++---
>   drivers/virtio/virtio_vdpa.c |  2 +-
>   fs/fuse/virtio_fs.c          |  7 ++++---
>   include/linux/group_cpus.h   |  2 +-
>   kernel/irq/affinity.c        |  2 +-
>   lib/group_cpus.c             | 23 +++++++++++++----------
>   6 files changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index ad8d6a363f24ae11968b42f7bcfd6a719a0499b7..85c0a7073bd8bff5d34aad1729d45d89da4c4bd1 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -19,9 +19,10 @@
>   void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>   {
>   	const struct cpumask *masks;
> -	unsigned int queue, cpu;
> +	unsigned int queue, cpu, nr_masks;
>   
> -	masks = group_cpus_evenly(qmap->nr_queues);
> +	nr_masks = qmap->nr_queues;
> +	masks = group_cpus_evenly(&nr_masks);

Hmph. I am a big fan of separating input and output paramenters;
most ABI definitions will be doing that anyway.
Makes it also really hard to track whether the output parameters
had been set at all. Care to split it up?

Cheers,

Hannes
Daniel Wagner Jan. 7, 2025, 8:20 a.m. UTC | #2
On Tue, Jan 07, 2025 at 08:51:57AM +0100, Hannes Reinecke wrote:
> >   void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
> >   {
> >   	const struct cpumask *masks;
> > -	unsigned int queue, cpu;
> > +	unsigned int queue, cpu, nr_masks;
> > -	masks = group_cpus_evenly(qmap->nr_queues);
> > +	nr_masks = qmap->nr_queues;
> > +	masks = group_cpus_evenly(&nr_masks);
> 
> Hmph. I am a big fan of separating input and output paramenters;
> most ABI definitions will be doing that anyway.
> Makes it also really hard to track whether the output parameters
> had been set at all. Care to split it up?

What API do you have in mind?
Hannes Reinecke Jan. 7, 2025, 10:35 a.m. UTC | #3
On 1/7/25 09:20, Daniel Wagner wrote:
> On Tue, Jan 07, 2025 at 08:51:57AM +0100, Hannes Reinecke wrote:
>>>    void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>>>    {
>>>    	const struct cpumask *masks;
>>> -	unsigned int queue, cpu;
>>> +	unsigned int queue, cpu, nr_masks;
>>> -	masks = group_cpus_evenly(qmap->nr_queues);
>>> +	nr_masks = qmap->nr_queues;
>>> +	masks = group_cpus_evenly(&nr_masks);
>>
>> Hmph. I am a big fan of separating input and output paramenters;
>> most ABI definitions will be doing that anyway.
>> Makes it also really hard to track whether the output parameters
>> had been set at all. Care to split it up?
> 
> What API do you have in mind?

ABI, not API.
x86 ABI has 'rdi' as the first input register, and 'rax' as the first 
output register. So there is no difference for the compiler whether
you have 'x(&a); b = a;' or 'x(a, &b)'.
But you gain on usability, as in the second case you can check whether
'b' had been set by the function.

Cheers,

Hannes
Daniel Wagner Jan. 7, 2025, 10:46 a.m. UTC | #4
On Tue, Jan 07, 2025 at 11:35:10AM +0100, Hannes Reinecke wrote:
> On 1/7/25 09:20, Daniel Wagner wrote:
> > On Tue, Jan 07, 2025 at 08:51:57AM +0100, Hannes Reinecke wrote:
> > > >    void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
> > > >    {
> > > >    	const struct cpumask *masks;
> > > > -	unsigned int queue, cpu;
> > > > +	unsigned int queue, cpu, nr_masks;
> > > > -	masks = group_cpus_evenly(qmap->nr_queues);
> > > > +	nr_masks = qmap->nr_queues;
> > > > +	masks = group_cpus_evenly(&nr_masks);
> > > 
> > > Hmph. I am a big fan of separating input and output paramenters;
> > > most ABI definitions will be doing that anyway.
> > > Makes it also really hard to track whether the output parameters
> > > had been set at all. Care to split it up?
> > 
> > What API do you have in mind?
> 
> ABI, not API.

I got that, still what C API do you want to see?
Hannes Reinecke Jan. 7, 2025, 11:35 a.m. UTC | #5
On 1/7/25 11:46, Daniel Wagner wrote:
> On Tue, Jan 07, 2025 at 11:35:10AM +0100, Hannes Reinecke wrote:
>> On 1/7/25 09:20, Daniel Wagner wrote:
>>> On Tue, Jan 07, 2025 at 08:51:57AM +0100, Hannes Reinecke wrote:
>>>>>     void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>>>>>     {
>>>>>     	const struct cpumask *masks;
>>>>> -	unsigned int queue, cpu;
>>>>> +	unsigned int queue, cpu, nr_masks;
>>>>> -	masks = group_cpus_evenly(qmap->nr_queues);
>>>>> +	nr_masks = qmap->nr_queues;
>>>>> +	masks = group_cpus_evenly(&nr_masks);
>>>>
>>>> Hmph. I am a big fan of separating input and output paramenters;
>>>> most ABI definitions will be doing that anyway.
>>>> Makes it also really hard to track whether the output parameters
>>>> had been set at all. Care to split it up?
>>>
>>> What API do you have in mind?
>>
>> ABI, not API.
> 
> I got that, still what C API do you want to see?
> 

masks = group_cpus_evenly(qmap->nr_queues, &nr_queues);

maybe?

Cheers,

Hannes
diff mbox series

Patch

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index ad8d6a363f24ae11968b42f7bcfd6a719a0499b7..85c0a7073bd8bff5d34aad1729d45d89da4c4bd1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -19,9 +19,10 @@ 
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
 	const struct cpumask *masks;
-	unsigned int queue, cpu;
+	unsigned int queue, cpu, nr_masks;
 
-	masks = group_cpus_evenly(qmap->nr_queues);
+	nr_masks = qmap->nr_queues;
+	masks = group_cpus_evenly(&nr_masks);
 	if (!masks) {
 		for_each_possible_cpu(cpu)
 			qmap->mq_map[cpu] = qmap->queue_offset;
@@ -29,7 +30,7 @@  void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 	}
 
 	for (queue = 0; queue < qmap->nr_queues; queue++) {
-		for_each_cpu(cpu, &masks[queue])
+		for_each_cpu(cpu, &masks[queue % nr_masks])
 			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 	kfree(masks);
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 1f60c9d5cb1810a6f208c24bb2ac640d537391a0..c478cccf5fd68b9c9c01332046c24316573d97cd 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -330,7 +330,7 @@  create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
 		unsigned int this_vecs = affd->set_size[i];
 		int j;
-		struct cpumask *result = group_cpus_evenly(this_vecs);
+		struct cpumask *result = group_cpus_evenly(&this_vecs);
 
 		if (!result) {
 			kfree(masks);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 82afe78ec542358e2db6f4d955d521652ae363ec..5acd875f1e9c9840dd9d2f3245665c91230f57a8 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -862,7 +862,7 @@  static void virtio_fs_requests_done_work(struct work_struct *work)
 static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
 {
 	const struct cpumask *mask, *masks;
-	unsigned int q, cpu;
+	unsigned int q, cpu, nr_masks;
 
 	/* First attempt to map using existing transport layer affinities
 	 * e.g. PCIe MSI-X
@@ -882,7 +882,8 @@  static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
 	return;
 fallback:
 	/* Attempt to map evenly in groups over the CPUs */
-	masks = group_cpus_evenly(fs->num_request_queues);
+	nr_masks = fs->num_request_queues;
+	masks = group_cpus_evenly(&nr_masks);
 	/* If even this fails we default to all CPUs use first request queue */
 	if (!masks) {
 		for_each_possible_cpu(cpu)
@@ -891,7 +892,7 @@  static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
 	}
 
 	for (q = 0; q < fs->num_request_queues; q++) {
-		for_each_cpu(cpu, &masks[q])
+		for_each_cpu(cpu, &masks[q % nr_masks])
 			fs->mq_map[cpu] = q + VQ_REQUEST;
 	}
 	kfree(masks);
diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h
index e42807ec61f6e8cf3787af7daa0d8686edfef0a3..8659534a3423e92746738ac57e713b7416e05271 100644
--- a/include/linux/group_cpus.h
+++ b/include/linux/group_cpus.h
@@ -9,6 +9,6 @@ 
 #include <linux/kernel.h>
 #include <linux/cpu.h>
 
-struct cpumask *group_cpus_evenly(unsigned int numgrps);
+struct cpumask *group_cpus_evenly(unsigned int *numgrps);
 
 #endif
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315cc098ecfa366ca1d88483641b12a..0188e133f1a508a623e33f08a0fca2e1f2cbf4e4 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -71,7 +71,7 @@  irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
 		unsigned int this_vecs = affd->set_size[i];
 		int j;
-		struct cpumask *result = group_cpus_evenly(this_vecs);
+		struct cpumask *result = group_cpus_evenly(&this_vecs);
 
 		if (!result) {
 			kfree(masks);
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index ee272c4cefcc13907ce9f211f479615d2e3c9154..73da83ca2c45347a3a443d42d4f16801a47effd5 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -334,7 +334,8 @@  static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
  * @numgrps: number of groups
  *
  * Return: cpumask array if successful, NULL otherwise. And each element
- * includes CPUs assigned to this group
+ * includes CPUs assigned to this group. numgrps will be updated to the
+ * actuall allocated number of masks.
  *
  * Try to put close CPUs from viewpoint of CPU and NUMA locality into
  * same group, and run two-stage grouping:
@@ -344,9 +345,9 @@  static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
  * We guarantee in the resulted grouping that all CPUs are covered, and
  * no same CPU is assigned to multiple groups
  */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int *numgrps)
 {
-	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+	unsigned int curgrp = 0, nr_present = 0, nr_others = 0, nr_grps;
 	cpumask_var_t *node_to_cpumask;
 	cpumask_var_t nmsk, npresmsk;
 	int ret = -ENOMEM;
@@ -362,7 +363,8 @@  struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	if (!node_to_cpumask)
 		goto fail_npresmsk;
 
-	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+	nr_grps = *numgrps;
+	masks = kcalloc(nr_grps, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
 		goto fail_node_to_cpumask;
 
@@ -383,7 +385,7 @@  struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	cpumask_copy(npresmsk, data_race(cpu_present_mask));
 
 	/* grouping present CPUs first */
-	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+	ret = __group_cpus_evenly(curgrp, nr_grps, node_to_cpumask,
 				  npresmsk, nmsk, masks);
 	if (ret < 0)
 		goto fail_build_affinity;
@@ -395,19 +397,19 @@  struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	 * group space, assign the non present CPUs to the already
 	 * allocated out groups.
 	 */
-	if (nr_present >= numgrps)
+	if (nr_present >= nr_grps)
 		curgrp = 0;
 	else
 		curgrp = nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
-	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+	ret = __group_cpus_evenly(curgrp, nr_grps, node_to_cpumask,
 				  npresmsk, nmsk, masks);
 	if (ret >= 0)
 		nr_others = ret;
 
  fail_build_affinity:
 	if (ret >= 0)
-		WARN_ON(nr_present + nr_others < numgrps);
+		WARN_ON(nr_present + nr_others < nr_grps);
 
  fail_node_to_cpumask:
 	free_node_to_cpumask(node_to_cpumask);
@@ -421,12 +423,13 @@  struct cpumask *group_cpus_evenly(unsigned int numgrps)
 		kfree(masks);
 		return NULL;
 	}
+	*numgrps = nr_present + nr_others;
 	return masks;
 }
 #else /* CONFIG_SMP */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int *numgrps)
 {
-	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+	struct cpumask *masks = kcalloc(*numgrps, sizeof(*masks), GFP_KERNEL);
 
 	if (!masks)
 		return NULL;