diff mbox series

[v2,2/6] spapr_numa: forbid asymmetrical NUMA setups

Message ID 20200924195058.362984-3-danielhb413@gmail.com
State Superseded
Headers show
Series pseries NUMA distance calculation | expand

Commit Message

Daniel Henrique Barboza Sept. 24, 2020, 7:50 p.m. UTC
The pSeries machine does not support asymmetrical NUMA
configurations. This doesn't make much of a different
since we're not using user input for pSeries NUMA setup,
but this will change in the next patches.

To avoid breaking existing setups, gate this change by
checking for legacy NUMA support.

Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
---
 hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

Comments

David Gibson Sept. 25, 2020, 2:36 a.m. UTC | #1
On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:
> The pSeries machine does not support asymmetrical NUMA

> configurations. This doesn't make much of a different

> since we're not using user input for pSeries NUMA setup,

> but this will change in the next patches.

> 

> To avoid breaking existing setups, gate this change by

> checking for legacy NUMA support.

> 

> Reviewed-by: Greg Kurz <groug@kaod.org>

> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>


Reviewed-by: David Gibson <david@gibson.dropbear.id.au>


> ---

>  hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++

>  1 file changed, 34 insertions(+)

> 

> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c

> index 64fe567f5d..fe395e80a3 100644

> --- a/hw/ppc/spapr_numa.c

> +++ b/hw/ppc/spapr_numa.c

> @@ -19,6 +19,24 @@

>  /* Moved from hw/ppc/spapr_pci_nvlink2.c */

>  #define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))

>  

> +static bool spapr_numa_is_symmetrical(MachineState *ms)

> +{

> +    int src, dst;

> +    int nb_numa_nodes = ms->numa_state->num_nodes;

> +    NodeInfo *numa_info = ms->numa_state->nodes;

> +

> +    for (src = 0; src < nb_numa_nodes; src++) {

> +        for (dst = src; dst < nb_numa_nodes; dst++) {

> +            if (numa_info[src].distance[dst] !=

> +                numa_info[dst].distance[src]) {

> +                return false;

> +            }

> +        }

> +    }

> +

> +    return true;

> +}

> +

>  void spapr_numa_associativity_init(SpaprMachineState *spapr,

>                                     MachineState *machine)

>  {

> @@ -61,6 +79,22 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,

>  

>          spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);

>      }

> +

> +    /*

> +     * Legacy NUMA guests (pseries-5.1 and older, or guests with only

> +     * 1 NUMA node) will not benefit from anything we're going to do

> +     * after this point.

> +     */

> +    if (spapr_machine_using_legacy_numa(spapr)) {

> +        return;

> +    }

> +

> +    if (!spapr_numa_is_symmetrical(machine)) {

> +        error_report("Asymmetrical NUMA topologies aren't supported "

> +                     "in the pSeries machine");

> +        exit(EXIT_FAILURE);

> +    }

> +

>  }

>  

>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
David Gibson Sept. 25, 2020, 3:48 a.m. UTC | #2
On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:
> The pSeries machine does not support asymmetrical NUMA

> configurations. This doesn't make much of a different

> since we're not using user input for pSeries NUMA setup,

> but this will change in the next patches.

> 

> To avoid breaking existing setups, gate this change by

> checking for legacy NUMA support.

> 

> Reviewed-by: Greg Kurz <groug@kaod.org>

> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>


Having read the rest of the series, I realized there's another type of
configuration that PAPR can't represent, so possibly we should add
logic to catch that as well.  That's what I'm going to call
"non-transitive" configurations, e.g.

Node	0	1	2
0	10	20	40
1	20	10	20
2	40	20	10	

Basically the closeness of 0 to 1 and 1 to 2 forces them all to be in
the same domain at every PAPR level, even though 0-2 is supposed to be
more expensive.

> ---

>  hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++

>  1 file changed, 34 insertions(+)

> 

> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c

> index 64fe567f5d..fe395e80a3 100644

> --- a/hw/ppc/spapr_numa.c

> +++ b/hw/ppc/spapr_numa.c

> @@ -19,6 +19,24 @@

>  /* Moved from hw/ppc/spapr_pci_nvlink2.c */

>  #define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))

>  

> +static bool spapr_numa_is_symmetrical(MachineState *ms)

> +{

> +    int src, dst;

> +    int nb_numa_nodes = ms->numa_state->num_nodes;

> +    NodeInfo *numa_info = ms->numa_state->nodes;

> +

> +    for (src = 0; src < nb_numa_nodes; src++) {

> +        for (dst = src; dst < nb_numa_nodes; dst++) {

> +            if (numa_info[src].distance[dst] !=

> +                numa_info[dst].distance[src]) {

> +                return false;

> +            }

> +        }

> +    }

> +

> +    return true;

> +}

> +

>  void spapr_numa_associativity_init(SpaprMachineState *spapr,

>                                     MachineState *machine)

>  {

> @@ -61,6 +79,22 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,

>  

>          spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);

>      }

> +

> +    /*

> +     * Legacy NUMA guests (pseries-5.1 and older, or guests with only

> +     * 1 NUMA node) will not benefit from anything we're going to do

> +     * after this point.

> +     */

> +    if (spapr_machine_using_legacy_numa(spapr)) {

> +        return;

> +    }

> +

> +    if (!spapr_numa_is_symmetrical(machine)) {

> +        error_report("Asymmetrical NUMA topologies aren't supported "

> +                     "in the pSeries machine");

> +        exit(EXIT_FAILURE);

> +    }

> +

>  }

>  

>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Daniel Henrique Barboza Sept. 25, 2020, 12:41 p.m. UTC | #3
On 9/25/20 12:48 AM, David Gibson wrote:
> On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:

>> The pSeries machine does not support asymmetrical NUMA

>> configurations. This doesn't make much of a different

>> since we're not using user input for pSeries NUMA setup,

>> but this will change in the next patches.

>>

>> To avoid breaking existing setups, gate this change by

>> checking for legacy NUMA support.

>>

>> Reviewed-by: Greg Kurz <groug@kaod.org>

>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>

> 

> Having read the rest of the series, I realized there's another type of

> configuration that PAPR can't represent, so possibly we should add

> logic to catch that as well.  That's what I'm going to call

> "non-transitive" configurations, e.g.

> 

> Node	0	1	2

> 0	10	20	40

> 1	20	10	20

> 2	40	20	10	

> 

> Basically the closeness of 0 to 1 and 1 to 2 forces them all to be in

> the same domain at every PAPR level, even though 0-2 is supposed to be

> more expensive.



Yes, this is correct. I'm not sure how to proceed in this case
though. Should we error out?


DHB

> 

>> ---

>>   hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++

>>   1 file changed, 34 insertions(+)

>>

>> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c

>> index 64fe567f5d..fe395e80a3 100644

>> --- a/hw/ppc/spapr_numa.c

>> +++ b/hw/ppc/spapr_numa.c

>> @@ -19,6 +19,24 @@

>>   /* Moved from hw/ppc/spapr_pci_nvlink2.c */

>>   #define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))

>>   

>> +static bool spapr_numa_is_symmetrical(MachineState *ms)

>> +{

>> +    int src, dst;

>> +    int nb_numa_nodes = ms->numa_state->num_nodes;

>> +    NodeInfo *numa_info = ms->numa_state->nodes;

>> +

>> +    for (src = 0; src < nb_numa_nodes; src++) {

>> +        for (dst = src; dst < nb_numa_nodes; dst++) {

>> +            if (numa_info[src].distance[dst] !=

>> +                numa_info[dst].distance[src]) {

>> +                return false;

>> +            }

>> +        }

>> +    }

>> +

>> +    return true;

>> +}

>> +

>>   void spapr_numa_associativity_init(SpaprMachineState *spapr,

>>                                      MachineState *machine)

>>   {

>> @@ -61,6 +79,22 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,

>>   

>>           spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);

>>       }

>> +

>> +    /*

>> +     * Legacy NUMA guests (pseries-5.1 and older, or guests with only

>> +     * 1 NUMA node) will not benefit from anything we're going to do

>> +     * after this point.

>> +     */

>> +    if (spapr_machine_using_legacy_numa(spapr)) {

>> +        return;

>> +    }

>> +

>> +    if (!spapr_numa_is_symmetrical(machine)) {

>> +        error_report("Asymmetrical NUMA topologies aren't supported "

>> +                     "in the pSeries machine");

>> +        exit(EXIT_FAILURE);

>> +    }

>> +

>>   }

>>   

>>   void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,

>
David Gibson Sept. 26, 2020, 7:49 a.m. UTC | #4
On Fri, Sep 25, 2020 at 09:41:02AM -0300, Daniel Henrique Barboza wrote:
> 

> 

> On 9/25/20 12:48 AM, David Gibson wrote:

> > On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:

> > > The pSeries machine does not support asymmetrical NUMA

> > > configurations. This doesn't make much of a different

> > > since we're not using user input for pSeries NUMA setup,

> > > but this will change in the next patches.

> > > 

> > > To avoid breaking existing setups, gate this change by

> > > checking for legacy NUMA support.

> > > 

> > > Reviewed-by: Greg Kurz <groug@kaod.org>

> > > Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>

> > 

> > Having read the rest of the series, I realized there's another type of

> > configuration that PAPR can't represent, so possibly we should add

> > logic to catch that as well.  That's what I'm going to call

> > "non-transitive" configurations, e.g.

> > 

> > Node	0	1	2

> > 0	10	20	40

> > 1	20	10	20

> > 2	40	20	10	

> > 

> > Basically the closeness of 0 to 1 and 1 to 2 forces them all to be in

> > the same domain at every PAPR level, even though 0-2 is supposed to be

> > more expensive.

> 

> Yes, this is correct. I'm not sure how to proceed in this case

> though. Should we error out?


Given that we're already erroring on asymmetric configurations, I
think it makes sense to error for these as well.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Daniel Henrique Barboza Sept. 27, 2020, 11:41 a.m. UTC | #5
On 9/26/20 4:49 AM, David Gibson wrote:
> On Fri, Sep 25, 2020 at 09:41:02AM -0300, Daniel Henrique Barboza wrote:

>>

>>

>> On 9/25/20 12:48 AM, David Gibson wrote:

>>> On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:

>>>> The pSeries machine does not support asymmetrical NUMA

>>>> configurations. This doesn't make much of a different

>>>> since we're not using user input for pSeries NUMA setup,

>>>> but this will change in the next patches.

>>>>

>>>> To avoid breaking existing setups, gate this change by

>>>> checking for legacy NUMA support.

>>>>

>>>> Reviewed-by: Greg Kurz <groug@kaod.org>

>>>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>

>>>

>>> Having read the rest of the series, I realized there's another type of

>>> configuration that PAPR can't represent, so possibly we should add

>>> logic to catch that as well.  That's what I'm going to call

>>> "non-transitive" configurations, e.g.

>>>

>>> Node	0	1	2

>>> 0	10	20	40

>>> 1	20	10	20

>>> 2	40	20	10	

>>>

>>> Basically the closeness of 0 to 1 and 1 to 2 forces them all to be in

>>> the same domain at every PAPR level, even though 0-2 is supposed to be

>>> more expensive.

>>

>> Yes, this is correct. I'm not sure how to proceed in this case

>> though. Should we error out?

> 

> Given that we're already erroring on asymmetric configurations, I

> think it makes sense to error for these as well.


Thing is that asymmetrical configurations is an easy concept to enforce
to the user - distance from A to B can't be different from B to A.

In the example you gave above, with 3 NUMA nodes, is easy to spot where
the non-transitivity rule would hit. I'm afraid that if we add 2-3 more
NUMA nodes in the mix this will stop being straightforward, with more and
more combinations hitting the 'non-transitivity' rule, and erroring out
will end up being frustrating to the user.

I'd say that we should report this in the documentation as one more
limitation of the implementation (and PAPR). I wouldn't oppose with
throwing a warning message either, letting the user know that the
approximation will be less precise than it already would be in this case.


Thanks,


DHB



>
David Gibson Sept. 28, 2020, 6:25 a.m. UTC | #6
On Sun, Sep 27, 2020 at 08:41:30AM -0300, Daniel Henrique Barboza wrote:
> 
> 
> On 9/26/20 4:49 AM, David Gibson wrote:
> > On Fri, Sep 25, 2020 at 09:41:02AM -0300, Daniel Henrique Barboza wrote:
> > > 
> > > 
> > > On 9/25/20 12:48 AM, David Gibson wrote:
> > > > On Thu, Sep 24, 2020 at 04:50:54PM -0300, Daniel Henrique Barboza wrote:
> > > > > The pSeries machine does not support asymmetrical NUMA
> > > > > configurations. This doesn't make much of a different
> > > > > since we're not using user input for pSeries NUMA setup,
> > > > > but this will change in the next patches.
> > > > > 
> > > > > To avoid breaking existing setups, gate this change by
> > > > > checking for legacy NUMA support.
> > > > > 
> > > > > Reviewed-by: Greg Kurz <groug@kaod.org>
> > > > > Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> > > > 
> > > > Having read the rest of the series, I realized there's another type of
> > > > configuration that PAPR can't represent, so possibly we should add
> > > > logic to catch that as well.  That's what I'm going to call
> > > > "non-transitive" configurations, e.g.
> > > > 
> > > > Node	0	1	2
> > > > 0	10	20	40
> > > > 1	20	10	20
> > > > 2	40	20	10	
> > > > 
> > > > Basically the closeness of 0 to 1 and 1 to 2 forces them all to be in
> > > > the same domain at every PAPR level, even though 0-2 is supposed to be
> > > > more expensive.
> > > 
> > > Yes, this is correct. I'm not sure how to proceed in this case
> > > though. Should we error out?
> > 
> > Given that we're already erroring on asymmetric configurations, I
> > think it makes sense to error for these as well.
> 
> Thing is that asymmetrical configurations is an easy concept to enforce
> to the user - distance from A to B can't be different from B to A.
> 
> In the example you gave above, with 3 NUMA nodes, is easy to spot where
> the non-transitivity rule would hit. I'm afraid that if we add 2-3 more
> NUMA nodes in the mix this will stop being straightforward, with more and
> more combinations hitting the 'non-transitivity' rule, and erroring out
> will end up being frustrating to the user.
> 
> I'd say that we should report this in the documentation as one more
> limitation of the implementation (and PAPR). I wouldn't oppose with
> throwing a warning message either, letting the user know that the
> approximation will be less precise than it already would be in this
> case.

Hmm... yes, I see your point.  Ok, let's go with your suggestion.
diff mbox series

Patch

diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index 64fe567f5d..fe395e80a3 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -19,6 +19,24 @@ 
 /* Moved from hw/ppc/spapr_pci_nvlink2.c */
 #define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))
 
+static bool spapr_numa_is_symmetrical(MachineState *ms)
+{
+    int src, dst;
+    int nb_numa_nodes = ms->numa_state->num_nodes;
+    NodeInfo *numa_info = ms->numa_state->nodes;
+
+    for (src = 0; src < nb_numa_nodes; src++) {
+        for (dst = src; dst < nb_numa_nodes; dst++) {
+            if (numa_info[src].distance[dst] !=
+                numa_info[dst].distance[src]) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
 void spapr_numa_associativity_init(SpaprMachineState *spapr,
                                    MachineState *machine)
 {
@@ -61,6 +79,22 @@  void spapr_numa_associativity_init(SpaprMachineState *spapr,
 
         spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
     }
+
+    /*
+     * Legacy NUMA guests (pseries-5.1 and older, or guests with only
+     * 1 NUMA node) will not benefit from anything we're going to do
+     * after this point.
+     */
+    if (spapr_machine_using_legacy_numa(spapr)) {
+        return;
+    }
+
+    if (!spapr_numa_is_symmetrical(machine)) {
+        error_report("Asymmetrical NUMA topologies aren't supported "
+                     "in the pSeries machine");
+        exit(EXIT_FAILURE);
+    }
+
 }
 
 void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,