Message ID | 64eb440fd48d10a55525253bce2e9143db872f67.1714112854.git.perry.yuan@amd.com |
---|---|
State | New |
Headers | show |
Series | AMD Pstate Driver Core Performance Boost | expand |
On 4/26/2024 01:34, Perry Yuan wrote: > From: Perry Yuan <Perry.Yuan@amd.com> > > With this new sysfs entry `cpb_boost`created, user can change CPU boost > state dynamically under `active`, `guided` and `passive` modes. > And the highest perf and frequency will also be updated as the boost > state changing. > > 0): check current boost state > cat /sys/devices/system/cpu/amd_pstate/cpb_boost > > 1): disable CPU boost > sudo bash -c "echo 0 > /sys/devices/system/cpu/amd_pstate/cpb_boost" > > 2): enable CPU boost > sudo bash -c "echo 1 > /sys/devices/system/cpu/amd_pstate/cpb_boost" > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217931 > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217618 > Signed-off-by: Perry Yuan <Perry.Yuan@amd.com> > --- > drivers/cpufreq/amd-pstate.c | 99 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 99 insertions(+) > > diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c > index aa17a3134497..53ef39c6dbfa 100644 > --- a/drivers/cpufreq/amd-pstate.c > +++ b/drivers/cpufreq/amd-pstate.c > @@ -1295,6 +1295,103 @@ static ssize_t prefcore_show(struct device *dev, > return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); > } > > +static int amd_cpu_boost_update(struct amd_cpudata *cpudata, u32 on) > +{ > + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); > + struct cppc_perf_ctrls perf_ctrls; > + u32 highest_perf, nominal_perf, nominal_freq, max_freq; > + int ret; > + > + if (!policy) > + return -ENODATA; > + > + highest_perf = READ_ONCE(cpudata->highest_perf); > + nominal_perf = READ_ONCE(cpudata->nominal_perf); > + nominal_freq = READ_ONCE(cpudata->nominal_freq); > + max_freq = READ_ONCE(cpudata->max_freq); > + > + if (boot_cpu_has(X86_FEATURE_CPPC)) { > + u64 value = READ_ONCE(cpudata->cppc_req_cached); > + > + value &= ~GENMASK_ULL(7, 0); > + value |= on ? highest_perf : nominal_perf; > + WRITE_ONCE(cpudata->cppc_req_cached, value); > + > + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); > + } else { > + perf_ctrls.max_perf = on ? highest_perf : nominal_perf; > + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); > + if (ret) { > + cpufreq_cpu_release(policy); > + pr_debug("failed to set energy perf value (%d)\n", ret); > + return ret; > + } > + } > + > + if (on) > + policy->cpuinfo.max_freq = max_freq; > + else > + policy->cpuinfo.max_freq = nominal_freq * 1000; > + > + policy->max = policy->cpuinfo.max_freq; > + > + if (cppc_state == AMD_PSTATE_PASSIVE) { > + ret = freq_qos_update_request(&cpudata->req[1], > + policy->cpuinfo.max_freq); > + } > + > + cpufreq_cpu_release(policy); > + > + return ret; > +} > + > +static ssize_t cpb_boost_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + return sysfs_emit(buf, "%u\n", amd_pstate_global_params.cpb_boost); > +} > + > +static ssize_t cpb_boost_store(struct device *dev, struct device_attribute *b, > + const char *buf, size_t count) > +{ > + bool new_state; > + ssize_t ret; > + int cpu; > + > + if (!amd_pstate_global_params.cpb_supported) { > + pr_err("Boost mode is not supported by this processor or SBIOS\n"); > + return -EINVAL; > + } As this is information that will be known when you first probe, I feel it would be better to affect the visibility of the file than let a user discover it doesn't work when they try to write it. Thinking down the road software like power-profiles-daemon and tuned will probably adapt to this new file and if their contract is that the file exists means they should write it that's going to turn into needless errors in any such system's log on every boot. > + > + ret = kstrtobool(buf, &new_state); > + if (ret) > + return ret; > + > + mutex_lock(&amd_pstate_driver_lock); > + amd_pstate_global_params.cpb_boost = !!new_state; > + > + for_each_present_cpu(cpu) { It seems to me that by making an attribute for every single CPU this is wrong. It means that writing boost for one CPU's files applies to all other CPUs too. If it's going to be a global attribute that is shared by all CPUs it should be a single file. Otherwise this is going to be a problem if (for example) software tries to turn on boost for only 1 CPU. Think of this sequence: 1) 16 CPUs, cpb_boost is currently turned off. 2) Software tries to turn CPB boost on for the "first" CPU only. 3) It reads the value of the first CPU and sees it's 0. So It changes the value for the first CPU (which causes a global change). 4) It reads the value for the second CPU and sees it's 1. It tries to change this back to zero, which again causes a global change. 5) It checks all the others and they're all 0 and it does nothing. So you see by having a global attribute which is shared with every single CPU you now have a flow problem that userspace can introduce. > + > + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); > + struct amd_cpudata *cpudata = policy->driver_data; > + > + if (!cpudata) { > + pr_err("cpudata is NULL\n"); > + ret = -ENODATA; > + cpufreq_cpu_put(policy); > + goto err_exit; > + } > + > + amd_cpu_boost_update(cpudata, amd_pstate_global_params.cpb_boost); > + refresh_frequency_limits(policy); > + cpufreq_cpu_put(policy); > + } > + > +err_exit: > + mutex_unlock(&amd_pstate_driver_lock); > + return ret < 0 ? ret : count; > +} > + > cpufreq_freq_attr_ro(amd_pstate_max_freq); > cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); > > @@ -1305,6 +1402,7 @@ cpufreq_freq_attr_rw(energy_performance_preference); > cpufreq_freq_attr_ro(energy_performance_available_preferences); > static DEVICE_ATTR_RW(status); > static DEVICE_ATTR_RO(prefcore); > +static DEVICE_ATTR_RW(cpb_boost); > > static struct freq_attr *amd_pstate_attr[] = { > &amd_pstate_max_freq, > @@ -1329,6 +1427,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { > static struct attribute *pstate_global_attributes[] = { > &dev_attr_status.attr, > &dev_attr_prefcore.attr, > + &dev_attr_cpb_boost.attr, > NULL > }; >
On 4/26/2024 09:07, Mario Limonciello wrote: > On 4/26/2024 01:34, Perry Yuan wrote: >> From: Perry Yuan <Perry.Yuan@amd.com> >> >> With this new sysfs entry `cpb_boost`created, user can change CPU boost >> state dynamically under `active`, `guided` and `passive` modes. >> And the highest perf and frequency will also be updated as the boost >> state changing. >> >> 0): check current boost state >> cat /sys/devices/system/cpu/amd_pstate/cpb_boost >> >> 1): disable CPU boost >> sudo bash -c "echo 0 > /sys/devices/system/cpu/amd_pstate/cpb_boost" >> >> 2): enable CPU boost >> sudo bash -c "echo 1 > /sys/devices/system/cpu/amd_pstate/cpb_boost" >> >> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217931 >> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217618 >> Signed-off-by: Perry Yuan <Perry.Yuan@amd.com> >> --- >> drivers/cpufreq/amd-pstate.c | 99 ++++++++++++++++++++++++++++++++++++ >> 1 file changed, 99 insertions(+) >> >> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c >> index aa17a3134497..53ef39c6dbfa 100644 >> --- a/drivers/cpufreq/amd-pstate.c >> +++ b/drivers/cpufreq/amd-pstate.c >> @@ -1295,6 +1295,103 @@ static ssize_t prefcore_show(struct device *dev, >> return sysfs_emit(buf, "%s\n", >> str_enabled_disabled(amd_pstate_prefcore)); >> } >> +static int amd_cpu_boost_update(struct amd_cpudata *cpudata, u32 on) >> +{ >> + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); >> + struct cppc_perf_ctrls perf_ctrls; >> + u32 highest_perf, nominal_perf, nominal_freq, max_freq; >> + int ret; >> + >> + if (!policy) >> + return -ENODATA; >> + >> + highest_perf = READ_ONCE(cpudata->highest_perf); >> + nominal_perf = READ_ONCE(cpudata->nominal_perf); >> + nominal_freq = READ_ONCE(cpudata->nominal_freq); >> + max_freq = READ_ONCE(cpudata->max_freq); >> + >> + if (boot_cpu_has(X86_FEATURE_CPPC)) { >> + u64 value = READ_ONCE(cpudata->cppc_req_cached); >> + >> + value &= ~GENMASK_ULL(7, 0); >> + value |= on ? highest_perf : nominal_perf; >> + WRITE_ONCE(cpudata->cppc_req_cached, value); >> + >> + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); >> + } else { >> + perf_ctrls.max_perf = on ? highest_perf : nominal_perf; >> + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); >> + if (ret) { >> + cpufreq_cpu_release(policy); >> + pr_debug("failed to set energy perf value (%d)\n", ret); >> + return ret; >> + } >> + } >> + >> + if (on) >> + policy->cpuinfo.max_freq = max_freq; >> + else >> + policy->cpuinfo.max_freq = nominal_freq * 1000; >> + >> + policy->max = policy->cpuinfo.max_freq; >> + >> + if (cppc_state == AMD_PSTATE_PASSIVE) { >> + ret = freq_qos_update_request(&cpudata->req[1], >> + policy->cpuinfo.max_freq); >> + } >> + >> + cpufreq_cpu_release(policy); >> + >> + return ret; >> +} >> + >> +static ssize_t cpb_boost_show(struct device *dev, >> + struct device_attribute *attr, char *buf) >> +{ >> + return sysfs_emit(buf, "%u\n", amd_pstate_global_params.cpb_boost); >> +} >> + >> +static ssize_t cpb_boost_store(struct device *dev, struct >> device_attribute *b, >> + const char *buf, size_t count) >> +{ >> + bool new_state; >> + ssize_t ret; >> + int cpu; >> + >> + if (!amd_pstate_global_params.cpb_supported) { >> + pr_err("Boost mode is not supported by this processor or >> SBIOS\n"); >> + return -EINVAL; >> + } > > As this is information that will be known when you first probe, I feel > it would be better to affect the visibility of the file than let a user > discover it doesn't work when they try to write it. > > Thinking down the road software like power-profiles-daemon and tuned > will probably adapt to this new file and if their contract is that the > file exists means they should write it that's going to turn into > needless errors in any such system's log on every boot. > >> + >> + ret = kstrtobool(buf, &new_state); >> + if (ret) >> + return ret; >> + >> + mutex_lock(&amd_pstate_driver_lock); >> + amd_pstate_global_params.cpb_boost = !!new_state; >> + >> + for_each_present_cpu(cpu) { > > It seems to me that by making an attribute for every single CPU this is > wrong. It means that writing boost for one CPU's files applies to all > other CPUs too. > > If it's going to be a global attribute that is shared by all CPUs it > should be a single file. > > Otherwise this is going to be a problem if (for example) software tries > to turn on boost for only 1 CPU. Think of this sequence: > > 1) 16 CPUs, cpb_boost is currently turned off. > 2) Software tries to turn CPB boost on for the "first" CPU only. > 3) It reads the value of the first CPU and sees it's 0. So It changes > the value for the first CPU (which causes a global change). > 4) It reads the value for the second CPU and sees it's 1. It tries to > change this back to zero, which again causes a global change. > 5) It checks all the others and they're all 0 and it does nothing. > > So you see by having a global attribute which is shared with every > single CPU you now have a flow problem that userspace can introduce. > Hi, Sorry, I can see now that it current IS a global attribute in this implementation (IE it's DEVICE_ATTR_RW not cpufreq_freq_attr_rw). However as you're already actually applying it to each CPU individually, I do think it is better to make it a per CPU attribute. There could be cases that software only wants some of the cores to be boosted, so could you please instead consider to convert to use cpufreq_freq_attr_rw for the attribute and only apply one CPU at a time? Thanks! >> + >> + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); >> + struct amd_cpudata *cpudata = policy->driver_data; >> + >> + if (!cpudata) { >> + pr_err("cpudata is NULL\n"); >> + ret = -ENODATA; >> + cpufreq_cpu_put(policy); >> + goto err_exit; >> + } >> + >> + amd_cpu_boost_update(cpudata, >> amd_pstate_global_params.cpb_boost); >> + refresh_frequency_limits(policy); >> + cpufreq_cpu_put(policy); >> + } >> + >> +err_exit: >> + mutex_unlock(&amd_pstate_driver_lock); >> + return ret < 0 ? ret : count; >> +} >> + >> cpufreq_freq_attr_ro(amd_pstate_max_freq); >> cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); >> @@ -1305,6 +1402,7 @@ >> cpufreq_freq_attr_rw(energy_performance_preference); >> cpufreq_freq_attr_ro(energy_performance_available_preferences); >> static DEVICE_ATTR_RW(status); >> static DEVICE_ATTR_RO(prefcore); >> +static DEVICE_ATTR_RW(cpb_boost); >> static struct freq_attr *amd_pstate_attr[] = { >> &amd_pstate_max_freq, >> @@ -1329,6 +1427,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { >> static struct attribute *pstate_global_attributes[] = { >> &dev_attr_status.attr, >> &dev_attr_prefcore.attr, >> + &dev_attr_cpb_boost.attr, >> NULL >> }; >
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index aa17a3134497..53ef39c6dbfa 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1295,6 +1295,103 @@ static ssize_t prefcore_show(struct device *dev, return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); } +static int amd_cpu_boost_update(struct amd_cpudata *cpudata, u32 on) +{ + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); + struct cppc_perf_ctrls perf_ctrls; + u32 highest_perf, nominal_perf, nominal_freq, max_freq; + int ret; + + if (!policy) + return -ENODATA; + + highest_perf = READ_ONCE(cpudata->highest_perf); + nominal_perf = READ_ONCE(cpudata->nominal_perf); + nominal_freq = READ_ONCE(cpudata->nominal_freq); + max_freq = READ_ONCE(cpudata->max_freq); + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~GENMASK_ULL(7, 0); + value |= on ? highest_perf : nominal_perf; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = on ? highest_perf : nominal_perf; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { + cpufreq_cpu_release(policy); + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; + } + } + + if (on) + policy->cpuinfo.max_freq = max_freq; + else + policy->cpuinfo.max_freq = nominal_freq * 1000; + + policy->max = policy->cpuinfo.max_freq; + + if (cppc_state == AMD_PSTATE_PASSIVE) { + ret = freq_qos_update_request(&cpudata->req[1], + policy->cpuinfo.max_freq); + } + + cpufreq_cpu_release(policy); + + return ret; +} + +static ssize_t cpb_boost_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", amd_pstate_global_params.cpb_boost); +} + +static ssize_t cpb_boost_store(struct device *dev, struct device_attribute *b, + const char *buf, size_t count) +{ + bool new_state; + ssize_t ret; + int cpu; + + if (!amd_pstate_global_params.cpb_supported) { + pr_err("Boost mode is not supported by this processor or SBIOS\n"); + return -EINVAL; + } + + ret = kstrtobool(buf, &new_state); + if (ret) + return ret; + + mutex_lock(&amd_pstate_driver_lock); + amd_pstate_global_params.cpb_boost = !!new_state; + + for_each_present_cpu(cpu) { + + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + + if (!cpudata) { + pr_err("cpudata is NULL\n"); + ret = -ENODATA; + cpufreq_cpu_put(policy); + goto err_exit; + } + + amd_cpu_boost_update(cpudata, amd_pstate_global_params.cpb_boost); + refresh_frequency_limits(policy); + cpufreq_cpu_put(policy); + } + +err_exit: + mutex_unlock(&amd_pstate_driver_lock); + return ret < 0 ? ret : count; +} + cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); @@ -1305,6 +1402,7 @@ cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(prefcore); +static DEVICE_ATTR_RW(cpb_boost); static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, @@ -1329,6 +1427,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, &dev_attr_prefcore.attr, + &dev_attr_cpb_boost.attr, NULL };