diff mbox series

[RFC,11/16] sched/qos: Add rampup multiplier QoS

Message ID 20240820163512.1096301-12-qyousef@layalina.io
State New
Headers show
Series sched/fair/schedutil: Better manage system response time | expand

Commit Message

Qais Yousef Aug. 20, 2024, 4:35 p.m. UTC
Bursty tasks are hard to predict. To use resources efficiently, the
system would like to be exact as much as possible. But this poses
a challenge for these bursty tasks that need to get access to more
resources quickly.

The new SCHED_QOS_RAMPUP_MULTIPLIER allows userspace to do that. As the
name implies, it only helps them to transition to a higher performance
state when they get _busier_. That is perfectly periodic tasks by
definition are not going through a transition and will run at a constant
performance level. It is the tasks that need to transition from one
periodic state to another periodic state that is at a higher level that
this rampup_multiplier will help with. It also slows down the ewma decay
of util_est which should help those bursty tasks to keep their faster
rampup.

This should work complimentary with uclamp. uclamp tells the system
about min and max perf requirements which can be applied immediately.

rampup_multiplier is about reactiveness of the task to change.
Specifically to a change for a higher performance level. The task might
necessary need to have a min perf requirements, but it can have sudden
burst of changes that require higher perf level and it needs the system
to provide this faster.

TODO: update the sched_qos docs

Signed-off-by: Qais Yousef <qyousef@layalina.io>
---
 include/linux/sched.h      |  7 ++++
 include/uapi/linux/sched.h |  2 ++
 kernel/sched/core.c        | 66 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c        |  6 ++--
 kernel/sched/syscalls.c    | 38 ++++++++++++++++++++--
 5 files changed, 115 insertions(+), 4 deletions(-)

Comments

Dietmar Eggemann Sept. 17, 2024, 8:09 p.m. UTC | #1
On 20/08/2024 18:35, Qais Yousef wrote:

[...]

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0c10e2afb52d..3d9794db58e1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4906,7 +4906,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>  	if (!task_sleep) {
>  		if (task_util(p) > task_util_dequeued(p)) {
>  			ewma &= ~UTIL_AVG_UNCHANGED;
> -			ewma = approximate_util_avg(ewma, p->se.delta_exec / 1000);
> +			ewma = approximate_util_avg(ewma, (p->se.delta_exec/1000) * p->sched_qos.rampup_multiplier);

Isn't this exactly the idea from UTIL_EST_FASTER?

faster_est_approx(delta * 2) ... double speed even w/o contention?
                          ^
[...]
Ricardo Neri Sept. 17, 2024, 9:43 p.m. UTC | #2
On Tue, Aug 20, 2024 at 05:35:07PM +0100, Qais Yousef wrote:
> Bursty tasks are hard to predict. To use resources efficiently, the
> system would like to be exact as much as possible. But this poses
> a challenge for these bursty tasks that need to get access to more
> resources quickly.
> 
> The new SCHED_QOS_RAMPUP_MULTIPLIER allows userspace to do that. As the
> name implies, it only helps them to transition to a higher performance
> state when they get _busier_. That is perfectly periodic tasks by
> definition are not going through a transition and will run at a constant
> performance level. It is the tasks that need to transition from one
> periodic state to another periodic state that is at a higher level that
> this rampup_multiplier will help with. It also slows down the ewma decay
> of util_est which should help those bursty tasks to keep their faster
> rampup.
> 
> This should work complimentary with uclamp. uclamp tells the system
> about min and max perf requirements which can be applied immediately.
> 
> rampup_multiplier is about reactiveness of the task to change.
> Specifically to a change for a higher performance level. The task might
> necessary need to have a min perf requirements, but it can have sudden
> burst of changes that require higher perf level and it needs the system
> to provide this faster.
> 
> TODO: update the sched_qos docs
> 
> Signed-off-by: Qais Yousef <qyousef@layalina.io>
> ---
>  include/linux/sched.h      |  7 ++++
>  include/uapi/linux/sched.h |  2 ++
>  kernel/sched/core.c        | 66 ++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c        |  6 ++--
>  kernel/sched/syscalls.c    | 38 ++++++++++++++++++++--
>  5 files changed, 115 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2e8c5a9ffa76..a30ee43a25fb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -404,6 +404,11 @@ struct sched_info {
>  #endif /* CONFIG_SCHED_INFO */
>  };
>  
> +struct sched_qos {
> +	DECLARE_BITMAP(user_defined, SCHED_QOS_MAX);
> +	unsigned int rampup_multiplier;
> +};
> +
>  /*
>   * Integer metrics need fixed point arithmetic, e.g., sched/fair
>   * has a few: load, load_avg, util_avg, freq, and capacity.
> @@ -882,6 +887,8 @@ struct task_struct {
>  
>  	struct sched_info		sched_info;
>  
> +	struct sched_qos		sched_qos;
> +
>  	struct list_head		tasks;
>  #ifdef CONFIG_SMP
>  	struct plist_node		pushable_tasks;
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 67ef99f64ddc..0baba91ba5b8 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -104,6 +104,8 @@ struct clone_args {
>  };
>  
>  enum sched_qos_type {
> +	SCHED_QOS_RAMPUP_MULTIPLIER,
> +	SCHED_QOS_MAX,
>  };
>  #endif
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c91e6a62c7ab..54faa845cb29 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -152,6 +152,8 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
>   */
>  const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
>  
> +unsigned int sysctl_sched_qos_default_rampup_multiplier	= 1;
> +
>  __read_mostly int scheduler_running;
>  
>  #ifdef CONFIG_SCHED_CORE
> @@ -4488,6 +4490,47 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
>  #endif /* CONFIG_SCHEDSTATS */
>  
>  #ifdef CONFIG_SYSCTL
> +static void sched_qos_sync_sysctl(void)
> +{
> +	struct task_struct *g, *p;
> +
> +	guard(rcu)();
> +	for_each_process_thread(g, p) {
> +		struct rq_flags rf;
> +		struct rq *rq;
> +
> +		rq = task_rq_lock(p, &rf);
> +		if (!test_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined))
> +			p->sched_qos.rampup_multiplier = sysctl_sched_qos_default_rampup_multiplier;
> +		task_rq_unlock(rq, p, &rf);
> +	}
> +}
> +
> +static int sysctl_sched_qos_handler(struct ctl_table *table, int write,
> +				    void *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	unsigned int old_rampup_mult;
> +	int result;
> +
> +	old_rampup_mult = sysctl_sched_qos_default_rampup_multiplier;
> +
> +	result = proc_dointvec(table, write, buffer, lenp, ppos);
> +	if (result)
> +		goto undo;
> +	if (!write)
> +		return 0;
> +
> +	if (old_rampup_mult != sysctl_sched_qos_default_rampup_multiplier) {
> +		sched_qos_sync_sysctl();
> +	}
> +
> +	return 0;
> +
> +undo:
> +	sysctl_sched_qos_default_rampup_multiplier = old_rampup_mult;
> +	return result;
> +}
> +
>  static struct ctl_table sched_core_sysctls[] = {
>  #ifdef CONFIG_SCHEDSTATS
>  	{
> @@ -4534,6 +4577,13 @@ static struct ctl_table sched_core_sysctls[] = {
>  		.extra2		= SYSCTL_FOUR,
>  	},
>  #endif /* CONFIG_NUMA_BALANCING */
> +	{
> +		.procname	= "sched_qos_default_rampup_multiplier",
> +		.data           = &sysctl_sched_qos_default_rampup_multiplier,
> +		.maxlen         = sizeof(unsigned int),

IIUC, user space needs to select a value between 0 and (2^32 - 1). Does
this mean that it will need fine-tuning for each product and application?

Could there be some translation to a fewer number of QoS levels that are
qualitatively?

Also, I think about Intel processors. They work with hardware-controlled
performance scaling. The proposed interface would help us to communicate
per-task multipliers to hardware, but they would be used as hints to
hardware and not acted upon by the kernel to scale frequency.
Ricardo Neri Sept. 18, 2024, 9:21 p.m. UTC | #3
On Tue, Sep 17, 2024 at 02:43:37PM -0700, Ricardo Neri wrote:
> On Tue, Aug 20, 2024 at 05:35:07PM +0100, Qais Yousef wrote:
> > Bursty tasks are hard to predict. To use resources efficiently, the
> > system would like to be exact as much as possible. But this poses
> > a challenge for these bursty tasks that need to get access to more
> > resources quickly.
> > 
> > The new SCHED_QOS_RAMPUP_MULTIPLIER allows userspace to do that. As the
> > name implies, it only helps them to transition to a higher performance
> > state when they get _busier_. That is perfectly periodic tasks by
> > definition are not going through a transition and will run at a constant
> > performance level. It is the tasks that need to transition from one
> > periodic state to another periodic state that is at a higher level that
> > this rampup_multiplier will help with. It also slows down the ewma decay
> > of util_est which should help those bursty tasks to keep their faster
> > rampup.
> > 
> > This should work complimentary with uclamp. uclamp tells the system
> > about min and max perf requirements which can be applied immediately.
> > 
> > rampup_multiplier is about reactiveness of the task to change.
> > Specifically to a change for a higher performance level. The task might
> > necessary need to have a min perf requirements, but it can have sudden
> > burst of changes that require higher perf level and it needs the system
> > to provide this faster.
> > 
> > TODO: update the sched_qos docs
> > 
> > Signed-off-by: Qais Yousef <qyousef@layalina.io>
> > ---
> >  include/linux/sched.h      |  7 ++++
> >  include/uapi/linux/sched.h |  2 ++
> >  kernel/sched/core.c        | 66 ++++++++++++++++++++++++++++++++++++++
> >  kernel/sched/fair.c        |  6 ++--
> >  kernel/sched/syscalls.c    | 38 ++++++++++++++++++++--
> >  5 files changed, 115 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 2e8c5a9ffa76..a30ee43a25fb 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -404,6 +404,11 @@ struct sched_info {
> >  #endif /* CONFIG_SCHED_INFO */
> >  };
> >  
> > +struct sched_qos {
> > +	DECLARE_BITMAP(user_defined, SCHED_QOS_MAX);
> > +	unsigned int rampup_multiplier;
> > +};
> > +
> >  /*
> >   * Integer metrics need fixed point arithmetic, e.g., sched/fair
> >   * has a few: load, load_avg, util_avg, freq, and capacity.
> > @@ -882,6 +887,8 @@ struct task_struct {
> >  
> >  	struct sched_info		sched_info;
> >  
> > +	struct sched_qos		sched_qos;
> > +
> >  	struct list_head		tasks;
> >  #ifdef CONFIG_SMP
> >  	struct plist_node		pushable_tasks;
> > diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> > index 67ef99f64ddc..0baba91ba5b8 100644
> > --- a/include/uapi/linux/sched.h
> > +++ b/include/uapi/linux/sched.h
> > @@ -104,6 +104,8 @@ struct clone_args {
> >  };
> >  
> >  enum sched_qos_type {
> > +	SCHED_QOS_RAMPUP_MULTIPLIER,
> > +	SCHED_QOS_MAX,
> >  };
> >  #endif
> >  
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index c91e6a62c7ab..54faa845cb29 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -152,6 +152,8 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
> >   */
> >  const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
> >  
> > +unsigned int sysctl_sched_qos_default_rampup_multiplier	= 1;
> > +
> >  __read_mostly int scheduler_running;
> >  
> >  #ifdef CONFIG_SCHED_CORE
> > @@ -4488,6 +4490,47 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
> >  #endif /* CONFIG_SCHEDSTATS */
> >  
> >  #ifdef CONFIG_SYSCTL
> > +static void sched_qos_sync_sysctl(void)
> > +{
> > +	struct task_struct *g, *p;
> > +
> > +	guard(rcu)();
> > +	for_each_process_thread(g, p) {
> > +		struct rq_flags rf;
> > +		struct rq *rq;
> > +
> > +		rq = task_rq_lock(p, &rf);
> > +		if (!test_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined))
> > +			p->sched_qos.rampup_multiplier = sysctl_sched_qos_default_rampup_multiplier;
> > +		task_rq_unlock(rq, p, &rf);
> > +	}
> > +}
> > +
> > +static int sysctl_sched_qos_handler(struct ctl_table *table, int write,
> > +				    void *buffer, size_t *lenp, loff_t *ppos)
> > +{
> > +	unsigned int old_rampup_mult;
> > +	int result;
> > +
> > +	old_rampup_mult = sysctl_sched_qos_default_rampup_multiplier;
> > +
> > +	result = proc_dointvec(table, write, buffer, lenp, ppos);
> > +	if (result)
> > +		goto undo;
> > +	if (!write)
> > +		return 0;
> > +
> > +	if (old_rampup_mult != sysctl_sched_qos_default_rampup_multiplier) {
> > +		sched_qos_sync_sysctl();
> > +	}
> > +
> > +	return 0;
> > +
> > +undo:
> > +	sysctl_sched_qos_default_rampup_multiplier = old_rampup_mult;
> > +	return result;
> > +}
> > +
> >  static struct ctl_table sched_core_sysctls[] = {
> >  #ifdef CONFIG_SCHEDSTATS
> >  	{
> > @@ -4534,6 +4577,13 @@ static struct ctl_table sched_core_sysctls[] = {
> >  		.extra2		= SYSCTL_FOUR,
> >  	},
> >  #endif /* CONFIG_NUMA_BALANCING */
> > +	{
> > +		.procname	= "sched_qos_default_rampup_multiplier",
> > +		.data           = &sysctl_sched_qos_default_rampup_multiplier,
> > +		.maxlen         = sizeof(unsigned int),
> 
> IIUC, user space needs to select a value between 0 and (2^32 - 1). Does
> this mean that it will need fine-tuning for each product and application?
> 
> Could there be some translation to a fewer number of QoS levels that are
> qualitatively?
> 
> Also, I think about Intel processors. They work with hardware-controlled
> performance scaling. The proposed interface would help us to communicate
> per-task multipliers to hardware, but they would be used as hints to
> hardware and not acted upon by the kernel to scale frequency.

Also, as discussed during LPC 2024 it might be good to have an interface
that is compatible with other operating systems. They have qualitative
descriptions of QoS levels (see Len Brown's LPC 2022 presentation [1]).

It can be this hint or a new one.

[1]. https://lpc.events/event/16/contributions/1276/attachments/1070/2039/Brown-Shankar%20LPC%202022.09.13%20Sched%20QOS%20API.pdf
Christian Loehle Oct. 14, 2024, 4:06 p.m. UTC | #4
On 8/20/24 17:35, Qais Yousef wrote:
> Bursty tasks are hard to predict. To use resources efficiently, the
> system would like to be exact as much as possible. But this poses
> a challenge for these bursty tasks that need to get access to more
> resources quickly.
> 
> The new SCHED_QOS_RAMPUP_MULTIPLIER allows userspace to do that. As the
> name implies, it only helps them to transition to a higher performance
> state when they get _busier_. That is perfectly periodic tasks by
> definition are not going through a transition and will run at a constant
> performance level. It is the tasks that need to transition from one
> periodic state to another periodic state that is at a higher level that
> this rampup_multiplier will help with. It also slows down the ewma decay
> of util_est which should help those bursty tasks to keep their faster
> rampup.
> 
> This should work complimentary with uclamp. uclamp tells the system
> about min and max perf requirements which can be applied immediately.
> 
> rampup_multiplier is about reactiveness of the task to change.
> Specifically to a change for a higher performance level. The task might
> necessary need to have a min perf requirements, but it can have sudden
> burst of changes that require higher perf level and it needs the system
> to provide this faster.
> 
> TODO: update the sched_qos docs
> 
> Signed-off-by: Qais Yousef <qyousef@layalina.io>
> ---
>  include/linux/sched.h      |  7 ++++
>  include/uapi/linux/sched.h |  2 ++
>  kernel/sched/core.c        | 66 ++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c        |  6 ++--
>  kernel/sched/syscalls.c    | 38 ++++++++++++++++++++--
>  5 files changed, 115 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2e8c5a9ffa76..a30ee43a25fb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -404,6 +404,11 @@ struct sched_info {
>  #endif /* CONFIG_SCHED_INFO */
>  };
>  
> +struct sched_qos {
> +	DECLARE_BITMAP(user_defined, SCHED_QOS_MAX);
> +	unsigned int rampup_multiplier;
> +};
> +
>  /*
>   * Integer metrics need fixed point arithmetic, e.g., sched/fair
>   * has a few: load, load_avg, util_avg, freq, and capacity.
> @@ -882,6 +887,8 @@ struct task_struct {
>  
>  	struct sched_info		sched_info;
>  
> +	struct sched_qos		sched_qos;
> +
>  	struct list_head		tasks;
>  #ifdef CONFIG_SMP
>  	struct plist_node		pushable_tasks;
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 67ef99f64ddc..0baba91ba5b8 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -104,6 +104,8 @@ struct clone_args {
>  };
>  
>  enum sched_qos_type {
> +	SCHED_QOS_RAMPUP_MULTIPLIER,
> +	SCHED_QOS_MAX,
>  };
>  #endif
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c91e6a62c7ab..54faa845cb29 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -152,6 +152,8 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
>   */
>  const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
>  
> +unsigned int sysctl_sched_qos_default_rampup_multiplier	= 1;
> +
>  __read_mostly int scheduler_running;
>  
>  #ifdef CONFIG_SCHED_CORE
> @@ -4488,6 +4490,47 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
>  #endif /* CONFIG_SCHEDSTATS */
>  
>  #ifdef CONFIG_SYSCTL
> +static void sched_qos_sync_sysctl(void)
> +{
> +	struct task_struct *g, *p;
> +
> +	guard(rcu)();
> +	for_each_process_thread(g, p) {
> +		struct rq_flags rf;
> +		struct rq *rq;
> +
> +		rq = task_rq_lock(p, &rf);
> +		if (!test_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined))
> +			p->sched_qos.rampup_multiplier = sysctl_sched_qos_default_rampup_multiplier;
> +		task_rq_unlock(rq, p, &rf);
> +	}
> +}
> +
> +static int sysctl_sched_qos_handler(struct ctl_table *table, int write,
> +				    void *buffer, size_t *lenp, loff_t *ppos)

table should be const struct ctl_table *table for this to build on 6.11 at least.
John Stultz Nov. 28, 2024, 12:12 a.m. UTC | #5
On Tue, Aug 20, 2024 at 9:36 AM Qais Yousef <qyousef@layalina.io> wrote:
>
> Bursty tasks are hard to predict. To use resources efficiently, the
> system would like to be exact as much as possible. But this poses
> a challenge for these bursty tasks that need to get access to more
> resources quickly.
>
> The new SCHED_QOS_RAMPUP_MULTIPLIER allows userspace to do that. As the
> name implies, it only helps them to transition to a higher performance
> state when they get _busier_. That is perfectly periodic tasks by
> definition are not going through a transition and will run at a constant
> performance level. It is the tasks that need to transition from one
> periodic state to another periodic state that is at a higher level that
> this rampup_multiplier will help with. It also slows down the ewma decay
> of util_est which should help those bursty tasks to keep their faster
> rampup.
>
> This should work complimentary with uclamp. uclamp tells the system
> about min and max perf requirements which can be applied immediately.
>
> rampup_multiplier is about reactiveness of the task to change.
> Specifically to a change for a higher performance level. The task might
> necessary need to have a min perf requirements, but it can have sudden
> burst of changes that require higher perf level and it needs the system
> to provide this faster.
>
> TODO: update the sched_qos docs
>
> Signed-off-by: Qais Yousef <qyousef@layalina.io>
> ---
>  include/linux/sched.h      |  7 ++++
>  include/uapi/linux/sched.h |  2 ++
>  kernel/sched/core.c        | 66 ++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c        |  6 ++--
>  kernel/sched/syscalls.c    | 38 ++++++++++++++++++++--
>  5 files changed, 115 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2e8c5a9ffa76..a30ee43a25fb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -404,6 +404,11 @@ struct sched_info {
>  #endif /* CONFIG_SCHED_INFO */
>  };
>
> +struct sched_qos {
> +       DECLARE_BITMAP(user_defined, SCHED_QOS_MAX);
> +       unsigned int rampup_multiplier;
> +};
> +
>  /*
>   * Integer metrics need fixed point arithmetic, e.g., sched/fair
>   * has a few: load, load_avg, util_avg, freq, and capacity.
> @@ -882,6 +887,8 @@ struct task_struct {
>
>         struct sched_info               sched_info;
>
> +       struct sched_qos                sched_qos;
> +
>         struct list_head                tasks;
>  #ifdef CONFIG_SMP
>         struct plist_node               pushable_tasks;
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 67ef99f64ddc..0baba91ba5b8 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -104,6 +104,8 @@ struct clone_args {
>  };
>
>  enum sched_qos_type {
> +       SCHED_QOS_RAMPUP_MULTIPLIER,
> +       SCHED_QOS_MAX,
>  };
>  #endif
...
> +static void __setscheduler_sched_qos(struct task_struct *p,
> +                                    const struct sched_attr *attr)
> +{
> +       switch (attr->sched_qos_type) {
> +       case SCHED_QOS_RAMPUP_MULTIPLIER:
> +               set_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined);
> +               p->sched_qos.rampup_multiplier = attr->sched_qos_value;
> +       default:
> +               break;
> +       }
> +}
> +
>  /*
>   * Allow unprivileged RT tasks to decrease priority.
>   * Only issue a capable test if needed and only once to avoid an audit
...
> @@ -799,7 +831,9 @@ int __sched_setscheduler(struct task_struct *p,
>                 __setscheduler_params(p, attr);
>                 __setscheduler_prio(p, newprio);
>         }
> +
>         __setscheduler_uclamp(p, attr);
> +       __setscheduler_sched_qos(p, attr);
>

Hey Qais,
  Started tinkering a bit more with this patch series and found
unexpectedly a number of tasks were getting their rampup_multiplier
value set to zero.

It looks like the issue is that the SCHED_QOS_RAMPUP_MULTIPLIER enum
value is 0, so the switch (attr->sched_qos_type) always catches the
uninitialized/unset value during any sched_setscheduler()call, and
further the call to __setscheduler_sched_qos() isn't protected by a
(attr->sched_flags & SCHED_FLAG_QOS) check as is done for
sched_qos_validate() so we always end up falling into it and setting
the rampup_multiplier.

The easiest fix is probably just to have a SCHED_QOS_NONE base value
in the sched_qos_type enum, but we can also add checks on sched_flags
& SCHED_FLAG_QOS. Or do you have another idea?

thanks
-john
diff mbox series

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e8c5a9ffa76..a30ee43a25fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -404,6 +404,11 @@  struct sched_info {
 #endif /* CONFIG_SCHED_INFO */
 };
 
+struct sched_qos {
+	DECLARE_BITMAP(user_defined, SCHED_QOS_MAX);
+	unsigned int rampup_multiplier;
+};
+
 /*
  * Integer metrics need fixed point arithmetic, e.g., sched/fair
  * has a few: load, load_avg, util_avg, freq, and capacity.
@@ -882,6 +887,8 @@  struct task_struct {
 
 	struct sched_info		sched_info;
 
+	struct sched_qos		sched_qos;
+
 	struct list_head		tasks;
 #ifdef CONFIG_SMP
 	struct plist_node		pushable_tasks;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 67ef99f64ddc..0baba91ba5b8 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -104,6 +104,8 @@  struct clone_args {
 };
 
 enum sched_qos_type {
+	SCHED_QOS_RAMPUP_MULTIPLIER,
+	SCHED_QOS_MAX,
 };
 #endif
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c91e6a62c7ab..54faa845cb29 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -152,6 +152,8 @@  __read_mostly int sysctl_resched_latency_warn_once = 1;
  */
 const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
 
+unsigned int sysctl_sched_qos_default_rampup_multiplier	= 1;
+
 __read_mostly int scheduler_running;
 
 #ifdef CONFIG_SCHED_CORE
@@ -4488,6 +4490,47 @@  static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 #endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_SYSCTL
+static void sched_qos_sync_sysctl(void)
+{
+	struct task_struct *g, *p;
+
+	guard(rcu)();
+	for_each_process_thread(g, p) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = task_rq_lock(p, &rf);
+		if (!test_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined))
+			p->sched_qos.rampup_multiplier = sysctl_sched_qos_default_rampup_multiplier;
+		task_rq_unlock(rq, p, &rf);
+	}
+}
+
+static int sysctl_sched_qos_handler(struct ctl_table *table, int write,
+				    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int old_rampup_mult;
+	int result;
+
+	old_rampup_mult = sysctl_sched_qos_default_rampup_multiplier;
+
+	result = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (result)
+		goto undo;
+	if (!write)
+		return 0;
+
+	if (old_rampup_mult != sysctl_sched_qos_default_rampup_multiplier) {
+		sched_qos_sync_sysctl();
+	}
+
+	return 0;
+
+undo:
+	sysctl_sched_qos_default_rampup_multiplier = old_rampup_mult;
+	return result;
+}
+
 static struct ctl_table sched_core_sysctls[] = {
 #ifdef CONFIG_SCHEDSTATS
 	{
@@ -4534,6 +4577,13 @@  static struct ctl_table sched_core_sysctls[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+	{
+		.procname	= "sched_qos_default_rampup_multiplier",
+		.data           = &sysctl_sched_qos_default_rampup_multiplier,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_sched_qos_handler,
+	},
 };
 static int __init sched_core_sysctl_init(void)
 {
@@ -4543,6 +4593,21 @@  static int __init sched_core_sysctl_init(void)
 late_initcall(sched_core_sysctl_init);
 #endif /* CONFIG_SYSCTL */
 
+static void sched_qos_fork(struct task_struct *p)
+{
+	/*
+	 * We always force reset sched_qos on fork. These sched_qos are treated
+	 * as finite resources to help improve quality of life. Inheriting them
+	 * by default can easily lead to a situation where the QoS hint become
+	 * meaningless because all tasks in the system have it.
+	 *
+	 * Every task must request the QoS explicitly if it needs it. No
+	 * accidental inheritance is allowed to keep the default behavior sane.
+	 */
+	bitmap_zero(p->sched_qos.user_defined, SCHED_QOS_MAX);
+	p->sched_qos.rampup_multiplier = sysctl_sched_qos_default_rampup_multiplier;
+}
+
 /*
  * fork()/clone()-time setup:
  */
@@ -4562,6 +4627,7 @@  int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->prio = current->normal_prio;
 
 	uclamp_fork(p);
+	sched_qos_fork(p);
 
 	/*
 	 * Revert to default priority/policy on fork if requested.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c10e2afb52d..3d9794db58e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4906,7 +4906,7 @@  static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (!task_sleep) {
 		if (task_util(p) > task_util_dequeued(p)) {
 			ewma &= ~UTIL_AVG_UNCHANGED;
-			ewma = approximate_util_avg(ewma, p->se.delta_exec / 1000);
+			ewma = approximate_util_avg(ewma, (p->se.delta_exec/1000) * p->sched_qos.rampup_multiplier);
 			goto done;
 		}
 		return;
@@ -4974,6 +4974,8 @@  static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 	 */
 	ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	if (p->sched_qos.rampup_multiplier)
+		last_ewma_diff /= p->sched_qos.rampup_multiplier;
 	ewma  -= last_ewma_diff;
 	ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
@@ -9643,7 +9645,7 @@  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	 * on TICK doesn't end up hurting it as it can happen after we would
 	 * have crossed this threshold.
 	 *
-	 * To ensure that invaraince is taken into account, we don't scale time
+	 * To ensure that invariance is taken into account, we don't scale time
 	 * and use it as-is, approximate_util_avg() will then let us know the
 	 * our threshold.
 	 */
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index a7d4dfdfed43..dc7d7bcaae7b 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -543,6 +543,35 @@  static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr) { }
 #endif
 
+static inline int sched_qos_validate(struct task_struct *p,
+				     const struct sched_attr *attr)
+{
+	switch (attr->sched_qos_type) {
+	case SCHED_QOS_RAMPUP_MULTIPLIER:
+		if (attr->sched_qos_cookie)
+			return -EINVAL;
+		if (attr->sched_qos_value < 0)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void __setscheduler_sched_qos(struct task_struct *p,
+				     const struct sched_attr *attr)
+{
+	switch (attr->sched_qos_type) {
+	case SCHED_QOS_RAMPUP_MULTIPLIER:
+		set_bit(SCHED_QOS_RAMPUP_MULTIPLIER, p->sched_qos.user_defined);
+		p->sched_qos.rampup_multiplier = attr->sched_qos_value;
+	default:
+		break;
+	}
+}
+
 /*
  * Allow unprivileged RT tasks to decrease priority.
  * Only issue a capable test if needed and only once to avoid an audit
@@ -668,8 +697,11 @@  int __sched_setscheduler(struct task_struct *p,
 			return retval;
 	}
 
-	if (attr->sched_flags & SCHED_FLAG_QOS)
-		return -EOPNOTSUPP;
+	if (attr->sched_flags & SCHED_FLAG_QOS) {
+		retval = sched_qos_validate(p, attr);
+		if (retval)
+			return retval;
+	}
 
 	/*
 	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
@@ -799,7 +831,9 @@  int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_params(p, attr);
 		__setscheduler_prio(p, newprio);
 	}
+
 	__setscheduler_uclamp(p, attr);
+	__setscheduler_sched_qos(p, attr);
 
 	if (queued) {
 		/*