Message ID | 20220818211619.4193362-2-jeremy.linton@arm.com |
---|---|
State | Superseded |
Headers | show |
Series | Disable FIE on machines with slow counters | expand |
On Thu, Aug 18, 2022 at 11:24 PM Jeremy Linton <jeremy.linton@arm.com> wrote: > > PCC regions utilize a mailbox to set/retrieve register values used by > the CPPC code. This is fine as long as the operations are > infrequent. With the FIE code enabled though the overhead can range > from 2-11% of system CPU overhead (ex: as measured by top) on Arm > based machines. > > So, before enabling FIE assure none of the registers used by > cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also > enable a module parameter which can also disable it at boot or module > reload. > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> > --- > drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ > drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- > include/acpi/cppc_acpi.h | 5 +++++ > 3 files changed, 73 insertions(+), 4 deletions(-) > > diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c > index 1e15a9f25ae9..c840bf606b30 100644 > --- a/drivers/acpi/cppc_acpi.c > +++ b/drivers/acpi/cppc_acpi.c > @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) > } > EXPORT_SYMBOL_GPL(cppc_get_perf_caps); > > +/** > + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. > + * > + * CPPC has flexibility about how counters describing CPU perf are delivered. "CPU performance counters are accessed" > + * One of the choices is PCC regions, which can have a high access latency. This > + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. 
> + * > + * Return: true if any of the counters are in PCC regions, false otherwise > + */ > +bool cppc_perf_ctrs_in_pcc(void) > +{ > + int cpu; > + > + for_each_present_cpu(cpu) { > + struct cpc_register_resource *ref_perf_reg; > + struct cpc_desc *cpc_desc; > + > + cpc_desc = per_cpu(cpc_desc_ptr, cpu); > + > + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) > + return true; > + > + > + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; > + > + /* > + * If reference perf register is not supported then we should > + * use the nominal perf value > + */ > + if (!CPC_SUPPORTED(ref_perf_reg)) > + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; > + > + if (CPC_IN_PCC(ref_perf_reg)) > + return true; > + } > + return false; > +} > +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); > + > /** > * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. > * @cpunum: CPU from which to read counters. 
> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > index 24eaf0ec344d..32fcb0bf74a4 100644 > --- a/drivers/cpufreq/cppc_cpufreq.c > +++ b/drivers/cpufreq/cppc_cpufreq.c > @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { > > static struct cpufreq_driver cppc_cpufreq_driver; > > +static enum { > + FIE_UNSET = -1, > + FIE_ENABLED, > + FIE_DISABLED > +} fie_disabled = FIE_UNSET; > + > #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE > +module_param(fie_disabled, int, 0444); > +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); > > /* Frequency invariance support */ > struct cppc_freq_invariance { > @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu, ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > for_each_cpu(cpu, policy->cpus) { > @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > /* policy->cpus will be empty here, use related_cpus instead */ > @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) > }; > int ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + switch (fie_disabled) { > + /* honor user request */ > + case FIE_DISABLED: > + case FIE_ENABLED: > + break; > + case FIE_UNSET: > + default: Would be more straightforward to do if (fie_disabled == FIE_UNSET) { here. 
> + fie_disabled = FIE_ENABLED; > + if (cppc_perf_ctrs_in_pcc()) { > + pr_info("FIE not enabled on systems with registers in PCC\n"); > + fie_disabled = FIE_DISABLED; > + } > + break; > + } > + if (fie_disabled) > return; > > kworker_fie = kthread_create_worker(0, "cppc_fie"); > @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) > > static void cppc_freq_invariance_exit(void) > { > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > kthread_destroy_worker(kworker_fie); > @@ -936,6 +958,7 @@ static void cppc_check_hisi_workaround(void) > wa_info[i].oem_revision == tbl->oem_revision) { > /* Overwrite the get() callback */ > cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; > + fie_disabled = FIE_DISABLED; > break; > } > } > diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h > index f73d357ecdf5..c5614444031f 100644 > --- a/include/acpi/cppc_acpi.h > +++ b/include/acpi/cppc_acpi.h > @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); > extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); > extern int cppc_set_enable(int cpu, bool enable); > extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); > +extern bool cppc_perf_ctrs_in_pcc(void); > extern bool acpi_cpc_valid(void); > extern bool cppc_allow_fast_switch(void); > extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); > @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) > { > return -ENOTSUPP; > } > +static inline bool cppc_perf_ctrs_in_pcc(void) > +{ > + return false; > +} > static inline bool acpi_cpc_valid(void) > { > return false; > -- Apart from the above it looks fine to me, but I would like to get an ACK from Viresh on the second patch. Thanks!
Hi, On 8/23/22 12:10, Rafael J. Wysocki wrote: > On Thu, Aug 18, 2022 at 11:24 PM Jeremy Linton <jeremy.linton@arm.com> wrote: >> >> PCC regions utilize a mailbox to set/retrieve register values used by >> the CPPC code. This is fine as long as the operations are >> infrequent. With the FIE code enabled though the overhead can range >> from 2-11% of system CPU overhead (ex: as measured by top) on Arm >> based machines. >> >> So, before enabling FIE assure none of the registers used by >> cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also >> enable a module parameter which can also disable it at boot or module >> reload. >> >> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> >> --- >> drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ >> drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- >> include/acpi/cppc_acpi.h | 5 +++++ >> 3 files changed, 73 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c >> index 1e15a9f25ae9..c840bf606b30 100644 >> --- a/drivers/acpi/cppc_acpi.c >> +++ b/drivers/acpi/cppc_acpi.c >> @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) >> } >> EXPORT_SYMBOL_GPL(cppc_get_perf_caps); >> >> +/** >> + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. >> + * >> + * CPPC has flexibility about how counters describing CPU perf are delivered. > > "CPU performance counters are accessed" Sure, > > >> + * One of the choices is PCC regions, which can have a high access latency. This >> + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. 
>> + * >> + * Return: true if any of the counters are in PCC regions, false otherwise >> + */ >> +bool cppc_perf_ctrs_in_pcc(void) >> +{ >> + int cpu; >> + >> + for_each_present_cpu(cpu) { >> + struct cpc_register_resource *ref_perf_reg; >> + struct cpc_desc *cpc_desc; >> + >> + cpc_desc = per_cpu(cpc_desc_ptr, cpu); >> + >> + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) >> + return true; >> + >> + >> + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; >> + >> + /* >> + * If reference perf register is not supported then we should >> + * use the nominal perf value >> + */ >> + if (!CPC_SUPPORTED(ref_perf_reg)) >> + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; >> + >> + if (CPC_IN_PCC(ref_perf_reg)) >> + return true; >> + } >> + return false; >> +} >> +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); >> + >> /** >> * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. >> * @cpunum: CPU from which to read counters. 
>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c >> index 24eaf0ec344d..32fcb0bf74a4 100644 >> --- a/drivers/cpufreq/cppc_cpufreq.c >> +++ b/drivers/cpufreq/cppc_cpufreq.c >> @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { >> >> static struct cpufreq_driver cppc_cpufreq_driver; >> >> +static enum { >> + FIE_UNSET = -1, >> + FIE_ENABLED, >> + FIE_DISABLED >> +} fie_disabled = FIE_UNSET; >> + >> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE >> +module_param(fie_disabled, int, 0444); >> +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); >> >> /* Frequency invariance support */ >> struct cppc_freq_invariance { >> @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu, ret; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> for_each_cpu(cpu, policy->cpus) { >> @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> /* policy->cpus will be empty here, use related_cpus instead */ >> @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) >> }; >> int ret; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + switch (fie_disabled) { >> + /* honor user request */ >> + case FIE_DISABLED: >> + case FIE_ENABLED: >> + break; >> + case FIE_UNSET: >> + default: > > Would be more straightforward to do > > if (fie_disabled == FIE_UNSET) { > > here. Right, but then it wouldn't catch the other billion+ values that are the result of not being able to export a limit (AFAIK) on the module parameter. I could use an if: if !((fie_disabled == FIE_DISABLE) || (fie_disabled == FIE_ENABLED)) { } if that is preferable. 
I thought the case with the explict default: though made it clearer that it was treating all those other values as unset. > >> + fie_disabled = FIE_ENABLED; >> + if (cppc_perf_ctrs_in_pcc()) { >> + pr_info("FIE not enabled on systems with registers in PCC\n"); >> + fie_disabled = FIE_DISABLED; >> + } >> + break; >> + } >> + if (fie_disabled) >> return; >> >> kworker_fie = kthread_create_worker(0, "cppc_fie"); >> @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) >> >> static void cppc_freq_invariance_exit(void) >> { >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> kthread_destroy_worker(kworker_fie); >> @@ -936,6 +958,7 @@ static void cppc_check_hisi_workaround(void) >> wa_info[i].oem_revision == tbl->oem_revision) { >> /* Overwrite the get() callback */ >> cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; >> + fie_disabled = FIE_DISABLED; >> break; >> } >> } >> diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h >> index f73d357ecdf5..c5614444031f 100644 >> --- a/include/acpi/cppc_acpi.h >> +++ b/include/acpi/cppc_acpi.h >> @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); >> extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); >> extern int cppc_set_enable(int cpu, bool enable); >> extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); >> +extern bool cppc_perf_ctrs_in_pcc(void); >> extern bool acpi_cpc_valid(void); >> extern bool cppc_allow_fast_switch(void); >> extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); >> @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) >> { >> return -ENOTSUPP; >> } >> +static inline bool cppc_perf_ctrs_in_pcc(void) >> +{ >> + return false; >> +} >> static inline bool acpi_cpc_valid(void) >> { >> return false; >> -- > > Apart from the above it looks fine to me, but I would like to get an > 
ACK from Viresh on the second patch. > > Thanks! Thanks for looking at this.
On 18-08-22, 16:16, Jeremy Linton wrote: > diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c > +bool cppc_perf_ctrs_in_pcc(void) > +{ > + int cpu; > + > + for_each_present_cpu(cpu) { > + struct cpc_register_resource *ref_perf_reg; > + struct cpc_desc *cpc_desc; > + > + cpc_desc = per_cpu(cpc_desc_ptr, cpu); > + > + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) > + return true; > + > + > + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; > + > + /* > + * If reference perf register is not supported then we should > + * use the nominal perf value > + */ > + if (!CPC_SUPPORTED(ref_perf_reg)) > + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; > + > + if (CPC_IN_PCC(ref_perf_reg)) > + return true; > + } Add a blank line here please. > + return false; > +} > +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); > + > /** > * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. > * @cpunum: CPU from which to read counters. 
> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > index 24eaf0ec344d..32fcb0bf74a4 100644 > --- a/drivers/cpufreq/cppc_cpufreq.c > +++ b/drivers/cpufreq/cppc_cpufreq.c > @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { > > static struct cpufreq_driver cppc_cpufreq_driver; > > +static enum { > + FIE_UNSET = -1, > + FIE_ENABLED, > + FIE_DISABLED > +} fie_disabled = FIE_UNSET; > + > #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE > +module_param(fie_disabled, int, 0444); > +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); > > /* Frequency invariance support */ > struct cppc_freq_invariance { > @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu, ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > for_each_cpu(cpu, policy->cpus) { > @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > /* policy->cpus will be empty here, use related_cpus instead */ > @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) > }; > int ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + switch (fie_disabled) { > + /* honor user request */ > + case FIE_DISABLED: > + case FIE_ENABLED: > + break; > + case FIE_UNSET: > + default: > + fie_disabled = FIE_ENABLED; > + if (cppc_perf_ctrs_in_pcc()) { > + pr_info("FIE not enabled on systems with registers in PCC\n"); > + fie_disabled = FIE_DISABLED; > + } > + break; > + } here too. 
> + if (fie_disabled) > return; > > kworker_fie = kthread_create_worker(0, "cppc_fie"); > @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) > > static void cppc_freq_invariance_exit(void) > { > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > kthread_destroy_worker(kworker_fie); > @@ -936,6 +958,7 @@ static void cppc_check_hisi_workaround(void) > wa_info[i].oem_revision == tbl->oem_revision) { > /* Overwrite the get() callback */ > cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; > + fie_disabled = FIE_DISABLED; > break; > } > } > diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h > index f73d357ecdf5..c5614444031f 100644 > --- a/include/acpi/cppc_acpi.h > +++ b/include/acpi/cppc_acpi.h > @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); > extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); > extern int cppc_set_enable(int cpu, bool enable); > extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); > +extern bool cppc_perf_ctrs_in_pcc(void); > extern bool acpi_cpc_valid(void); > extern bool cppc_allow_fast_switch(void); > extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); > @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) > { > return -ENOTSUPP; > } > +static inline bool cppc_perf_ctrs_in_pcc(void) > +{ > + return false; > +} > static inline bool acpi_cpc_valid(void) > { > return false; Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
On Tue, Aug 23, 2022 at 8:46 PM Jeremy Linton <jeremy.linton@arm.com> wrote: > > Hi, > > On 8/23/22 12:10, Rafael J. Wysocki wrote: > > On Thu, Aug 18, 2022 at 11:24 PM Jeremy Linton <jeremy.linton@arm.com> wrote: > >> > >> PCC regions utilize a mailbox to set/retrieve register values used by > >> the CPPC code. This is fine as long as the operations are > >> infrequent. With the FIE code enabled though the overhead can range > >> from 2-11% of system CPU overhead (ex: as measured by top) on Arm > >> based machines. > >> > >> So, before enabling FIE assure none of the registers used by > >> cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also > >> enable a module parameter which can also disable it at boot or module > >> reload. > >> > >> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> > >> --- > >> drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ > >> drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- > >> include/acpi/cppc_acpi.h | 5 +++++ > >> 3 files changed, 73 insertions(+), 4 deletions(-) > >> > >> diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c > >> index 1e15a9f25ae9..c840bf606b30 100644 > >> --- a/drivers/acpi/cppc_acpi.c > >> +++ b/drivers/acpi/cppc_acpi.c > >> @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) > >> } > >> EXPORT_SYMBOL_GPL(cppc_get_perf_caps); > >> > >> +/** > >> + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. > >> + * > >> + * CPPC has flexibility about how counters describing CPU perf are delivered. > > > > "CPU performance counters are accessed" > > Sure, > > > > > > >> + * One of the choices is PCC regions, which can have a high access latency. This > >> + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. 
> >> + * > >> + * Return: true if any of the counters are in PCC regions, false otherwise > >> + */ > >> +bool cppc_perf_ctrs_in_pcc(void) > >> +{ > >> + int cpu; > >> + > >> + for_each_present_cpu(cpu) { > >> + struct cpc_register_resource *ref_perf_reg; > >> + struct cpc_desc *cpc_desc; > >> + > >> + cpc_desc = per_cpu(cpc_desc_ptr, cpu); > >> + > >> + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || > >> + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || > >> + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) > >> + return true; > >> + > >> + > >> + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; > >> + > >> + /* > >> + * If reference perf register is not supported then we should > >> + * use the nominal perf value > >> + */ > >> + if (!CPC_SUPPORTED(ref_perf_reg)) > >> + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; > >> + > >> + if (CPC_IN_PCC(ref_perf_reg)) > >> + return true; > >> + } > >> + return false; > >> +} > >> +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); > >> + > >> /** > >> * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. > >> * @cpunum: CPU from which to read counters. 
> >> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > >> index 24eaf0ec344d..32fcb0bf74a4 100644 > >> --- a/drivers/cpufreq/cppc_cpufreq.c > >> +++ b/drivers/cpufreq/cppc_cpufreq.c > >> @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { > >> > >> static struct cpufreq_driver cppc_cpufreq_driver; > >> > >> +static enum { > >> + FIE_UNSET = -1, > >> + FIE_ENABLED, > >> + FIE_DISABLED > >> +} fie_disabled = FIE_UNSET; > >> + > >> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE > >> +module_param(fie_disabled, int, 0444); > >> +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); > >> > >> /* Frequency invariance support */ > >> struct cppc_freq_invariance { > >> @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) > >> struct cppc_freq_invariance *cppc_fi; > >> int cpu, ret; > >> > >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > >> + if (fie_disabled) > >> return; > >> > >> for_each_cpu(cpu, policy->cpus) { > >> @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) > >> struct cppc_freq_invariance *cppc_fi; > >> int cpu; > >> > >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > >> + if (fie_disabled) > >> return; > >> > >> /* policy->cpus will be empty here, use related_cpus instead */ > >> @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) > >> }; > >> int ret; > >> > >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > >> + switch (fie_disabled) { > >> + /* honor user request */ > >> + case FIE_DISABLED: > >> + case FIE_ENABLED: > >> + break; > >> + case FIE_UNSET: > >> + default: > > > > Would be more straightforward to do > > > > if (fie_disabled == FIE_UNSET) { > > > > here. > > Right, but then it wouldn't catch the other billion+ values that are the > result of not being able to export a limit (AFAIK) on the module > parameter. I could use an if: Hmm. 
I've missed the module_param() part. It doesn't even make sense to use enum for the variable type in that case. Also you can always do if (fie_disabled < 0) { ... } > if !((fie_disabled == FIE_DISABLE) || (fie_disabled == FIE_ENABLED)) { > > } > > > if that is preferable. I thought the case with the explict default: > though made it clearer that it was treating all those other values as unset. > > > > >> + fie_disabled = FIE_ENABLED; > >> + if (cppc_perf_ctrs_in_pcc()) { > >> + pr_info("FIE not enabled on systems with registers in PCC\n"); > >> + fie_disabled = FIE_DISABLED; > >> + } > >> + break; > >> + } > >> + if (fie_disabled) > >> return; > >> > >> kworker_fie = kthread_create_worker(0, "cppc_fie"); > >> @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) > >> > >> static void cppc_freq_invariance_exit(void) > >> { > >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > >> + if (fie_disabled) > >> return; > >> > >> kthread_destroy_worker(kworker_fie); > >> @@ -936,6 +958,7 @@ static void cppc_check_hisi_workaround(void) > >> wa_info[i].oem_revision == tbl->oem_revision) { > >> /* Overwrite the get() callback */ > >> cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; > >> + fie_disabled = FIE_DISABLED; > >> break; > >> } > >> } > >> diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h > >> index f73d357ecdf5..c5614444031f 100644 > >> --- a/include/acpi/cppc_acpi.h > >> +++ b/include/acpi/cppc_acpi.h > >> @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); > >> extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); > >> extern int cppc_set_enable(int cpu, bool enable); > >> extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); > >> +extern bool cppc_perf_ctrs_in_pcc(void); > >> extern bool acpi_cpc_valid(void); > >> extern bool cppc_allow_fast_switch(void); > >> extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); 
> >> @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) > >> { > >> return -ENOTSUPP; > >> } > >> +static inline bool cppc_perf_ctrs_in_pcc(void) > >> +{ > >> + return false; > >> +} > >> static inline bool acpi_cpc_valid(void) > >> { > >> return false; > >> -- > > > > Apart from the above it looks fine to me, but I would like to get an > > ACK from Viresh on the second patch. > > > > Thanks! > > Thanks for looking at this. > >
Hi Jeremy, +CC Dietmar, Morten and Souvik On 8/18/22 22:16, Jeremy Linton wrote: > PCC regions utilize a mailbox to set/retrieve register values used by > the CPPC code. This is fine as long as the operations are > infrequent. With the FIE code enabled though the overhead can range > from 2-11% of system CPU overhead (ex: as measured by top) on Arm > based machines. > > So, before enabling FIE assure none of the registers used by > cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also > enable a module parameter which can also disable it at boot or module > reload. > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> > --- > drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ > drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- > include/acpi/cppc_acpi.h | 5 +++++ > 3 files changed, 73 insertions(+), 4 deletions(-) > > diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c > index 1e15a9f25ae9..c840bf606b30 100644 > --- a/drivers/acpi/cppc_acpi.c > +++ b/drivers/acpi/cppc_acpi.c > @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) > } > EXPORT_SYMBOL_GPL(cppc_get_perf_caps); > > +/** > + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. > + * > + * CPPC has flexibility about how counters describing CPU perf are delivered. > + * One of the choices is PCC regions, which can have a high access latency. This > + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. 
> + * > + * Return: true if any of the counters are in PCC regions, false otherwise > + */ > +bool cppc_perf_ctrs_in_pcc(void) > +{ > + int cpu; > + > + for_each_present_cpu(cpu) { > + struct cpc_register_resource *ref_perf_reg; > + struct cpc_desc *cpc_desc; > + > + cpc_desc = per_cpu(cpc_desc_ptr, cpu); > + > + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || > + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) > + return true; > + > + > + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; > + > + /* > + * If reference perf register is not supported then we should > + * use the nominal perf value > + */ > + if (!CPC_SUPPORTED(ref_perf_reg)) > + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; > + > + if (CPC_IN_PCC(ref_perf_reg)) > + return true; > + } Do we have a platform which returns false here? > + return false; > +} > +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); > + > /** > * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. > * @cpunum: CPU from which to read counters. > diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > index 24eaf0ec344d..32fcb0bf74a4 100644 > --- a/drivers/cpufreq/cppc_cpufreq.c > +++ b/drivers/cpufreq/cppc_cpufreq.c > @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { > > static struct cpufreq_driver cppc_cpufreq_driver; > > +static enum { > + FIE_UNSET = -1, > + FIE_ENABLED, > + FIE_DISABLED > +} fie_disabled = FIE_UNSET; > + > #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE > +module_param(fie_disabled, int, 0444); > +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); Why we need the modules support? I would drop this, since the fie_disabled would be set properly when needed. The code would be cleaner (more below). 
> > /* Frequency invariance support */ > struct cppc_freq_invariance { > @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu, ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > for_each_cpu(cpu, policy->cpus) { > @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) > struct cppc_freq_invariance *cppc_fi; > int cpu; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + if (fie_disabled) > return; > > /* policy->cpus will be empty here, use related_cpus instead */ > @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) > }; > int ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > + switch (fie_disabled) { > + /* honor user request */ > + case FIE_DISABLED: > + case FIE_ENABLED: This module's over-write doesn't look 'clean'. Is it OK to allow a user to go with the poor performing system (likely on many platforms)? Or we assume that there are platforms which has a bit faster mailboxes and they already have the FIE issue impacting task's utilization measurements. It looks like we are not sure about the solution. On one hand we implement those checks in the cppc_perf_ctrs_in_pcc() which could set the flag, but on the other hand we allow user to decide. IMO this creates diversity that we are not able to control. It creates another tunable knob in the kernel, which then is forgotten to check. I still haven't seen information that the old FIE was an issue on those servers and had impact on task utilization measurements. This should be a main requirement for this new feature. This would be after we proved that the utilization problem was due to the FIE and not something else (like uArch variation or workload variation). IMO let's revert the ACPI_CPPC_CPUFREQ_FIE. 
When we get data that FIE is an issue on those servers we can come back to this topic. Regards, Lukasz
Hi, On 8/24/22 09:41, Lukasz Luba wrote: > Hi Jeremy, > > +CC Dietmar, Morten and Souvik > > On 8/18/22 22:16, Jeremy Linton wrote: >> PCC regions utilize a mailbox to set/retrieve register values used by >> the CPPC code. This is fine as long as the operations are >> infrequent. With the FIE code enabled though the overhead can range >> from 2-11% of system CPU overhead (ex: as measured by top) on Arm >> based machines. >> >> So, before enabling FIE assure none of the registers used by >> cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also >> enable a module parameter which can also disable it at boot or module >> reload. >> >> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> >> --- >> drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ >> drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- >> include/acpi/cppc_acpi.h | 5 +++++ >> 3 files changed, 73 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c >> index 1e15a9f25ae9..c840bf606b30 100644 >> --- a/drivers/acpi/cppc_acpi.c >> +++ b/drivers/acpi/cppc_acpi.c >> @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct >> cppc_perf_caps *perf_caps) >> } >> EXPORT_SYMBOL_GPL(cppc_get_perf_caps); >> +/** >> + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC >> region. >> + * >> + * CPPC has flexibility about how counters describing CPU perf are >> delivered. >> + * One of the choices is PCC regions, which can have a high access >> latency. This >> + * routine allows callers of cppc_get_perf_ctrs() to know this ahead >> of time. 
>> + * >> + * Return: true if any of the counters are in PCC regions, false >> otherwise >> + */ >> +bool cppc_perf_ctrs_in_pcc(void) >> +{ >> + int cpu; >> + >> + for_each_present_cpu(cpu) { >> + struct cpc_register_resource *ref_perf_reg; >> + struct cpc_desc *cpc_desc; >> + >> + cpc_desc = per_cpu(cpc_desc_ptr, cpu); >> + >> + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) >> + return true; >> + >> + >> + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; >> + >> + /* >> + * If reference perf register is not supported then we should >> + * use the nominal perf value >> + */ >> + if (!CPC_SUPPORTED(ref_perf_reg)) >> + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; >> + >> + if (CPC_IN_PCC(ref_perf_reg)) >> + return true; >> + } > > Do we have a platform which returns false here? I'm not aware of one, but I don't have access to every bit of HW either. > >> + return false; >> +} >> +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); >> + >> /** >> * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. >> * @cpunum: CPU from which to read counters. >> diff --git a/drivers/cpufreq/cppc_cpufreq.c >> b/drivers/cpufreq/cppc_cpufreq.c >> index 24eaf0ec344d..32fcb0bf74a4 100644 >> --- a/drivers/cpufreq/cppc_cpufreq.c >> +++ b/drivers/cpufreq/cppc_cpufreq.c >> @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { >> static struct cpufreq_driver cppc_cpufreq_driver; >> +static enum { >> + FIE_UNSET = -1, >> + FIE_ENABLED, >> + FIE_DISABLED >> +} fie_disabled = FIE_UNSET; >> + >> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE >> +module_param(fie_disabled, int, 0444); >> +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine >> (FIE)"); > > Why we need the modules support? > I would drop this, since the fie_disabled would be set properly when > needed. The code would be cleaner (more below). 
Well the original version was simpler, but I tend to agree with Ionela who proposed this version in a previous review. The module param at this point is a debugging/testing statement since it allows the user to force FIE on or off independent of the PCC decision. Until we have a clear statement about how/when/where this feature is useful, having the ability to make the choice dynamically at runtime is quite useful and less intrusive than having multiple kernels/modules on the machine with the config option flipped, and requiring a reboot. > >> /* Frequency invariance support */ >> struct cppc_freq_invariance { >> @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct >> cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu, ret; >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> for_each_cpu(cpu, policy->cpus) { >> @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct >> cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu; >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> /* policy->cpus will be empty here, use related_cpus instead */ >> @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) >> }; >> int ret; >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + switch (fie_disabled) { >> + /* honor user request */ >> + case FIE_DISABLED: >> + case FIE_ENABLED: > > This module's over-write doesn't look 'clean'. > Is it OK to allow a user to go with the poor performing > system (likely on many platforms)? Or we assume that there are > platforms which has a bit faster mailboxes and they already > have the FIE issue impacting task's utilization measurements. I think with this patch applied we aren't any worse than before, but that is based on the fact that I've not seen a machine that has actual CPPC hardware registers (rather than mailboxes).
So I think you're suggesting that we will then have to revisit the code (to maybe avoid all the cppc_fie task/etc overhead) if a machine appears with hardware registers. And I tend to sorta agree, but that is what the second patch is for :) which will likely be what most distros end up applying on generic kernels. > > It looks like we are not sure about the solution. On one hand > we implement those checks in the cppc_perf_ctrs_in_pcc() > which could set the flag, but on the other hand we allow user > to decide. IMO this creates diversity that we are not able to control. > It creates another tunable knob in the kernel, which then is forgotten > to check. Your average user will never turn this knob, and if they do, it's likely to solve a problem, or test for performance. The fact that we aren't 100% sure of where/when this feature is useful is the argument for making it a tunable. > > I still haven't seen information that the old FIE was an issue on those > servers and had impact on task utilization measurements. This should be > a main requirement for this new feature. This would be after we proved > that the utilization problem was due to the FIE and not something else > (like uArch variation or workload variation). > > IMO let's revert the ACPI_CPPC_CPUFREQ_FIE. When we get data that > FIE is an issue on those servers we can come back to this topic. I don't really have an opinion about this, maybe someone else can comment :) Although, with both of these patches applied we can kick the decision down the road and revisit it in a couple years, and maybe have a clearer view.
Hi, On 8/24/22 01:13, Viresh Kumar wrote: > On 18-08-22, 16:16, Jeremy Linton wrote: >> diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c >> +bool cppc_perf_ctrs_in_pcc(void) >> +{ >> + int cpu; >> + >> + for_each_present_cpu(cpu) { >> + struct cpc_register_resource *ref_perf_reg; >> + struct cpc_desc *cpc_desc; >> + >> + cpc_desc = per_cpu(cpc_desc_ptr, cpu); >> + >> + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || >> + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) >> + return true; >> + >> + >> + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; >> + >> + /* >> + * If reference perf register is not supported then we should >> + * use the nominal perf value >> + */ >> + if (!CPC_SUPPORTED(ref_perf_reg)) >> + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; >> + >> + if (CPC_IN_PCC(ref_perf_reg)) >> + return true; >> + } > > Add a blank line here please. Sure, > >> + return false; >> +} >> +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); >> + >> /** >> * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. >> * @cpunum: CPU from which to read counters. 
>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c >> index 24eaf0ec344d..32fcb0bf74a4 100644 >> --- a/drivers/cpufreq/cppc_cpufreq.c >> +++ b/drivers/cpufreq/cppc_cpufreq.c >> @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { >> >> static struct cpufreq_driver cppc_cpufreq_driver; >> >> +static enum { >> + FIE_UNSET = -1, >> + FIE_ENABLED, >> + FIE_DISABLED >> +} fie_disabled = FIE_UNSET; >> + >> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE >> +module_param(fie_disabled, int, 0444); >> +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); >> >> /* Frequency invariance support */ >> struct cppc_freq_invariance { >> @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu, ret; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> for_each_cpu(cpu, policy->cpus) { >> @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) >> struct cppc_freq_invariance *cppc_fi; >> int cpu; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> /* policy->cpus will be empty here, use related_cpus instead */ >> @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) >> }; >> int ret; >> >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + switch (fie_disabled) { >> + /* honor user request */ >> + case FIE_DISABLED: >> + case FIE_ENABLED: >> + break; >> + case FIE_UNSET: >> + default: >> + fie_disabled = FIE_ENABLED; >> + if (cppc_perf_ctrs_in_pcc()) { >> + pr_info("FIE not enabled on systems with registers in PCC\n"); >> + fie_disabled = FIE_DISABLED; >> + } >> + break; >> + } > > here too. 
Sure, > >> + if (fie_disabled) >> return; >> >> kworker_fie = kthread_create_worker(0, "cppc_fie"); >> @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) >> >> static void cppc_freq_invariance_exit(void) >> { >> - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) >> + if (fie_disabled) >> return; >> >> kthread_destroy_worker(kworker_fie); >> @@ -936,6 +958,7 @@ static void cppc_check_hisi_workaround(void) >> wa_info[i].oem_revision == tbl->oem_revision) { >> /* Overwrite the get() callback */ >> cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; >> + fie_disabled = FIE_DISABLED; >> break; >> } >> } >> diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h >> index f73d357ecdf5..c5614444031f 100644 >> --- a/include/acpi/cppc_acpi.h >> +++ b/include/acpi/cppc_acpi.h >> @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); >> extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); >> extern int cppc_set_enable(int cpu, bool enable); >> extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); >> +extern bool cppc_perf_ctrs_in_pcc(void); >> extern bool acpi_cpc_valid(void); >> extern bool cppc_allow_fast_switch(void); >> extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); >> @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) >> { >> return -ENOTSUPP; >> } >> +static inline bool cppc_perf_ctrs_in_pcc(void) >> +{ >> + return false; >> +} >> static inline bool acpi_cpc_valid(void) >> { >> return false; > > Acked-by: Viresh Kumar <viresh.kumar@linaro.org> > Thanks for looking at this.
+Vincent. On 24-08-22, 15:41, Lukasz Luba wrote: > Hi Jeremy, > > +CC Dietmar, Morten and Souvik > > On 8/18/22 22:16, Jeremy Linton wrote: > > PCC regions utilize a mailbox to set/retrieve register values used by > > the CPPC code. This is fine as long as the operations are > > infrequent. With the FIE code enabled though the overhead can range > > from 2-11% of system CPU overhead (ex: as measured by top) on Arm > > based machines. > > > > So, before enabling FIE assure none of the registers used by > > cppc_get_perf_ctrs() are in the PCC region. Furthermore lets also > > enable a module parameter which can also disable it at boot or module > > reload. > > > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> > > --- > > drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ > > drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- > > include/acpi/cppc_acpi.h | 5 +++++ > > 3 files changed, 73 insertions(+), 4 deletions(-) > > > > diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c > > index 1e15a9f25ae9..c840bf606b30 100644 > > --- a/drivers/acpi/cppc_acpi.c > > +++ b/drivers/acpi/cppc_acpi.c > > @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) > > } > > EXPORT_SYMBOL_GPL(cppc_get_perf_caps); > > +/** > > + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. > > + * > > + * CPPC has flexibility about how counters describing CPU perf are delivered. > > + * One of the choices is PCC regions, which can have a high access latency. This > > + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. 
> > + * > > + * Return: true if any of the counters are in PCC regions, false otherwise > > + */ > > +bool cppc_perf_ctrs_in_pcc(void) > > +{ > > + int cpu; > > + > > + for_each_present_cpu(cpu) { > > + struct cpc_register_resource *ref_perf_reg; > > + struct cpc_desc *cpc_desc; > > + > > + cpc_desc = per_cpu(cpc_desc_ptr, cpu); > > + > > + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || > > + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || > > + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) > > + return true; > > + > > + > > + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; > > + > > + /* > > + * If reference perf register is not supported then we should > > + * use the nominal perf value > > + */ > > + if (!CPC_SUPPORTED(ref_perf_reg)) > > + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; > > + > > + if (CPC_IN_PCC(ref_perf_reg)) > > + return true; > > + } > > Do we have a platform which returns false here? > > > + return false; > > +} > > +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); > > + > > /** > > * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. > > * @cpunum: CPU from which to read counters. > > diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c > > index 24eaf0ec344d..32fcb0bf74a4 100644 > > --- a/drivers/cpufreq/cppc_cpufreq.c > > +++ b/drivers/cpufreq/cppc_cpufreq.c > > @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { > > static struct cpufreq_driver cppc_cpufreq_driver; > > +static enum { > > + FIE_UNSET = -1, > > + FIE_ENABLED, > > + FIE_DISABLED > > +} fie_disabled = FIE_UNSET; > > + > > #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE > > +module_param(fie_disabled, int, 0444); > > +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); > > Why we need the modules support? > I would drop this, since the fie_disabled would be set properly when > needed. The code would be cleaner (more below). 
> > > /* Frequency invariance support */ > > struct cppc_freq_invariance { > > @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) > > struct cppc_freq_invariance *cppc_fi; > > int cpu, ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > > + if (fie_disabled) > > return; > > for_each_cpu(cpu, policy->cpus) { > > @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) > > struct cppc_freq_invariance *cppc_fi; > > int cpu; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > > + if (fie_disabled) > > return; > > /* policy->cpus will be empty here, use related_cpus instead */ > > @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) > > }; > > int ret; > > - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) > > + switch (fie_disabled) { > > + /* honor user request */ > > + case FIE_DISABLED: > > + case FIE_ENABLED: > > This module's over-write doesn't look 'clean'. > Is it OK to allow a user to go with the poor performing > system (likely on many platforms)? Or we assume that there are > platforms which has a bit faster mailboxes and they already > have the FIE issue impacting task's utilization measurements. > > It looks like we are not sure about the solution. On one hand > we implement those checks in the cppc_perf_ctrs_in_pcc() > which could set the flag, but on the other hand we allow user > to decide. IMO this creates diversity that we are not able to control. > It creates another tunable knob in the kernel, which then is forgotten > to check. > > I still haven't seen information that the old FIE was an issue on those > servers and had impact on task utilization measurements. This should be > a main requirement for this new feature. This would be after we proved > that the utilization problem was due to the FIE and not something else (like > uArch variation or workload variation). > > IMO let's revert the ACPI_CPPC_CPUFREQ_FIE. 
When we get data that > FIE is an issue on those servers we can come back to this topic. > > Regards, > Lukasz
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 1e15a9f25ae9..c840bf606b30 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1240,6 +1240,47 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) } EXPORT_SYMBOL_GPL(cppc_get_perf_caps); +/** + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. + * + * CPPC has flexibility about how counters describing CPU perf are delivered. + * One of the choices is PCC regions, which can have a high access latency. This + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. + * + * Return: true if any of the counters are in PCC regions, false otherwise + */ +bool cppc_perf_ctrs_in_pcc(void) +{ + int cpu; + + for_each_present_cpu(cpu) { + struct cpc_register_resource *ref_perf_reg; + struct cpc_desc *cpc_desc; + + cpc_desc = per_cpu(cpc_desc_ptr, cpu); + + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) + return true; + + + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; + + /* + * If reference perf register is not supported then we should + * use the nominal perf value + */ + if (!CPC_SUPPORTED(ref_perf_reg)) + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; + + if (CPC_IN_PCC(ref_perf_reg)) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); + /** * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. * @cpunum: CPU from which to read counters. 
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 24eaf0ec344d..32fcb0bf74a4 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = { static struct cpufreq_driver cppc_cpufreq_driver; +static enum { + FIE_UNSET = -1, + FIE_ENABLED, + FIE_DISABLED +} fie_disabled = FIE_UNSET; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE +module_param(fie_disabled, int, 0444); +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); /* Frequency invariance support */ struct cppc_freq_invariance { @@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu, ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; for_each_cpu(cpu, policy->cpus) { @@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; /* policy->cpus will be empty here, use related_cpus instead */ @@ -229,7 +237,21 @@ static void __init cppc_freq_invariance_init(void) }; int ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + switch (fie_disabled) { + /* honor user request */ + case FIE_DISABLED: + case FIE_ENABLED: + break; + case FIE_UNSET: + default: + fie_disabled = FIE_ENABLED; + if (cppc_perf_ctrs_in_pcc()) { + pr_info("FIE not enabled on systems with registers in PCC\n"); + fie_disabled = FIE_DISABLED; + } + break; + } + if (fie_disabled) return; kworker_fie = kthread_create_worker(0, "cppc_fie"); @@ -247,7 +269,7 @@ static void __init cppc_freq_invariance_init(void) static void cppc_freq_invariance_exit(void) { - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; kthread_destroy_worker(kworker_fie); @@ -936,6 +958,7 @@ static void 
cppc_check_hisi_workaround(void) wa_info[i].oem_revision == tbl->oem_revision) { /* Overwrite the get() callback */ cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; + fie_disabled = FIE_DISABLED; break; } } diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index f73d357ecdf5..c5614444031f 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); +extern bool cppc_perf_ctrs_in_pcc(void); extern bool acpi_cpc_valid(void); extern bool cppc_allow_fast_switch(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); @@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) { return -ENOTSUPP; } +static inline bool cppc_perf_ctrs_in_pcc(void) +{ + return false; +} static inline bool acpi_cpc_valid(void) { return false;
PCC regions utilize a mailbox to set/retrieve register values used by the CPPC code. This is fine as long as the operations are infrequent. With the FIE code enabled though the overhead can range from 2-11% of system CPU overhead (ex: as measured by top) on Arm based machines. So, before enabling FIE, ensure none of the registers used by cppc_get_perf_ctrs() are in the PCC region. Furthermore, let's also enable a module parameter which can also disable it at boot or module reload. Signed-off-by: Jeremy Linton <jeremy.linton@arm.com> --- drivers/acpi/cppc_acpi.c | 41 ++++++++++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 31 +++++++++++++++++++++---- include/acpi/cppc_acpi.h | 5 +++++ 3 files changed, 73 insertions(+), 4 deletions(-)