| Message ID | 20210807072057.184698-1-clg@kaod.org |
|---|---|
| State | Accepted |
| Commit | cbc06f051c524dcfe52ef0d1f30647828e226d30 |
| Series | [v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs |
On 07/08/2021 09:20, Cédric Le Goater wrote:
> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
>
> We can not directly allocate and request the IPI on demand because
> bringup_up() is called under the IRQ sparse lock. The alternative is
> to allocate the IPIs for all possible nodes at startup and to request
> the mapping on demand when the first CPU of a node is brought up.
>
> Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
> Cc: stable@vger.kernel.org # v5.13
> Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> Cc: Laurent Vivier <lvivier@redhat.com>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> Message-Id: <20210629131542.743888-1-clg@kaod.org>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/sysdev/xive/common.c | 35 +++++++++++++++++++++----------
>  1 file changed, 24 insertions(+), 11 deletions(-)
>
> [...]

Tested-by: Laurent Vivier <lvivier@redhat.com>
* Cédric Le Goater <clg@kaod.org> [2021-08-07 09:20:57]:

> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
>
> We can not directly allocate and request the IPI on demand because
> bringup_up() is called under the IRQ sparse lock. The alternative is
> to allocate the IPIs for all possible nodes at startup and to request
> the mapping on demand when the first CPU of a node is brought up.

Thank you, this version too works for me.

Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

> [...]

--
Thanks and Regards
Srikar Dronamraju
On 8/7/21 9:20 AM, Cédric Le Goater wrote:
> [...]
> ---
>  arch/powerpc/sysdev/xive/common.c | 35 +++++++++++++++++++++----------
>  1 file changed, 24 insertions(+), 11 deletions(-)

I forgot to add that this version does not break irqbalance anymore, since
Linux is not mapping interrupts of CPU-less nodes. Anyhow, irqbalance is
now fixed:

  https://github.com/Irqbalance/irqbalance/commit/a7f81483a95a94d6a62ca7fb999a090e01c0de9b

So v1 (plus the irqbalance patch above) or v2 are safe to use. I do prefer v2.

Thanks to Laurent and Srikar for the extra tests,

C.

> [...]
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index dbdbbc2f1dc5..943fd30095af 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain;
 static struct xive_ipi_desc {
 	unsigned int irq;
 	char name[16];
+	atomic_t started;
 } *xive_ipis;
 
 /*
@@ -1120,7 +1121,7 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
 	.alloc = xive_ipi_irq_domain_alloc,
 };
 
-static int __init xive_request_ipi(void)
+static int __init xive_init_ipis(void)
 {
 	struct fwnode_handle *fwnode;
 	struct irq_domain *ipi_domain;
@@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
 		struct xive_ipi_desc *xid = &xive_ipis[node];
 		struct xive_ipi_alloc_info info = { node };
 
-		/* Skip nodes without CPUs */
-		if (cpumask_empty(cpumask_of_node(node)))
-			continue;
-
 		/*
 		 * Map one IPI interrupt per node for all cpus of that node.
 		 * Since the HW interrupt number doesn't have any meaning,
@@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
 		xid->irq = ret;
 
 		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
-
-		ret = request_irq(xid->irq, xive_muxed_ipi_action,
-				  IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL);
-
-		WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
 	}
 
 	return ret;
@@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void)
 	return ret;
 }
 
+static int __init xive_request_ipi(unsigned int cpu)
+{
+	struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
+	int ret;
+
+	if (atomic_inc_return(&xid->started) > 1)
+		return 0;
+
+	ret = request_irq(xid->irq, xive_muxed_ipi_action,
+			  IRQF_PERCPU | IRQF_NO_THREAD,
+			  xid->name, NULL);
+
+	WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
+	return ret;
+}
+
 static int xive_setup_cpu_ipi(unsigned int cpu)
 {
 	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
@@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
 	if (xc->hw_ipi != XIVE_BAD_IRQ)
 		return 0;
 
+	/* Register the IPI */
+	xive_request_ipi(cpu);
+
 	/* Grab an IPI from the backend, this will populate xc->hw_ipi */
 	if (xive_ops->get_ipi(cpu, xc))
 		return -EIO;
@@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
 	if (xc->hw_ipi == XIVE_BAD_IRQ)
 		return;
 
+	/* TODO: clear IPI mapping */
+
 	/* Mask the IPI */
 	xive_do_source_set_mask(&xc->ipi_data, true);
 
@@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
 	smp_ops->cause_ipi = xive_cause_ipi;
 
 	/* Register the IPI */
-	xive_request_ipi();
+	xive_init_ipis();
 
 	/* Allocate and setup IPI for the boot CPU */
 	xive_setup_cpu_ipi(smp_processor_id());
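The core mechanism of the patch is the per-node `atomic_t started` counter: the first CPU of a node to be brought up performs the `request_irq()` for that node's IPI exactly once, while every later CPU of the same node sees the counter already raised and returns immediately. Below is a minimal userspace C sketch of that once-per-node guard, for illustration only. It is not kernel code: the node count and the `register_node_ipi()` helper are hypothetical stand-ins for the real XIVE plumbing, and C11 `atomic_fetch_add(...) > 0` plays the role of the kernel's `atomic_inc_return(&xid->started) > 1` test.

```c
/* Sketch of the once-per-node registration guard used by the patch. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES 4			/* hypothetical number of NUMA nodes */

struct ipi_desc {
	atomic_int started;		/* mirrors the atomic_t 'started' field */
	char name[16];
};

static struct ipi_desc ipis[NR_NODES];

/* Hypothetical stand-in for request_irq() in the real code. */
static int register_node_ipi(struct ipi_desc *xid)
{
	printf("registering %s\n", xid->name);
	return 0;
}

/* Called for every CPU as it is brought up; only the first CPU of a
 * node actually registers the node's IPI. */
static int request_node_ipi(int node)
{
	struct ipi_desc *xid = &ipis[node];

	/* atomic_fetch_add() returns the previous value, so any caller
	 * other than the very first one for this node bails out here. */
	if (atomic_fetch_add(&xid->started, 1) > 0)
		return 0;

	snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
	return register_node_ipi(xid);
}

int main(void)
{
	/* Two CPUs of node 0 coming up: only one registration happens. */
	request_node_ipi(0);
	request_node_ipi(0);
	request_node_ipi(1);
	return 0;
}
```

Because the guard is evaluated at CPU bring-up rather than at boot, nodes that are CPU-less today still get their IPI allocated up front (in `xive_init_ipis()`), but the interrupt is only requested, and therefore only visible to tools such as irqbalance, once the node actually gains a CPU.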