
[RFC,06/16] arm: topology: Define TC2 sched energy and provide it to scheduler

Message ID 1400869003-27769-7-git-send-email-morten.rasmussen@arm.com
State New

Commit Message

Morten Rasmussen May 23, 2014, 6:16 p.m. UTC
From: Dietmar Eggemann <dietmar.eggemann@arm.com>

!!! This patch is only here to be able to test provisioning of sched
energy related data from an arch topology shim layer to the scheduler.
Since there is no code today which deals with extracting sched energy
related data from the dtb or acpi and processing it in the topology
shim layer, the struct sched_energy and the related struct
capacity_state arrays are hard-coded here !!!

This patch defines the struct sched_energy and the related struct
capacity_state array for the cluster (relates to sg's in DIE sd level)
and for the core (relates to sg's in MC sd level) for a Cortex A7 as
well as for a Cortex A15. It further provides related implementations of
the sched_domain_energy_f functions (cpu_cluster_energy() and
cpu_core_energy()).

To be able to propagate this information from the topology shim layer to
the scheduler, the elements of the arm_topology[] table have been
provisioned with the appropriate sched_domain_energy_f functions.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 arch/arm/kernel/topology.c |  109 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
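
The full 109-line diff is not reproduced here. The sketch below illustrates
the kind of hard-coded data and sched_domain_energy_f accessors the commit
message describes; the field layout of struct sched_energy beyond the
.cap/.power pairs quoted in the review, and the accessor signature, are
assumptions for illustration rather than the exact RFC code.

/* Illustrative sketch only -- not the exact layout used by the RFC. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))	/* as in the kernel */

struct capacity_state {
	unsigned long cap;	/* compute capacity at this P-state */
	unsigned long power;	/* busy power at this P-state */
};

struct sched_energy {		/* field names are assumed */
	unsigned int nr_cap_states;
	struct capacity_state *cap_states;
};

/* Per-cpu A7 numbers quoted in the review below (abridged). */
static struct capacity_state cap_states_core_a7[] = {
	{ .cap =  358, .power =  187, },	/*  350 MHz */
	{ .cap = 1024, .power = 1024, },	/* 1000 MHz */
};

static struct sched_energy energy_core_a7 = {
	.nr_cap_states	= ARRAY_SIZE(cap_states_core_a7),
	.cap_states	= cap_states_core_a7,
};

/*
 * sched_domain_energy_f accessor: hard-coded, and keyed on the cpu's
 * cluster in the real patch; reduced to the A7 data here.
 */
static struct sched_energy *cpu_core_energy(int cpu)
{
	return &energy_core_a7;
}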

Comments

Peter Zijlstra May 30, 2014, 12:04 p.m. UTC | #1
On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> +static struct capacity_state cap_states_cluster_a7[] = {
> +	/* Cluster only power */
> +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> +	};

> +static struct capacity_state cap_states_core_a7[] = {
> +	/* Power per cpu */
> +	 { .cap =  358, .power =  187, }, /*  350 MHz */
> +	 { .cap =  410, .power =  275, }, /*  400 MHz */
> +	 { .cap =  512, .power =  334, }, /*  500 MHz */
> +	 { .cap =  614, .power =  407, }, /*  600 MHz */
> +	 { .cap =  717, .power =  447, }, /*  700 MHz */
> +	 { .cap =  819, .power =  549, }, /*  800 MHz */
> +	 { .cap =  922, .power =  761, }, /*  900 MHz */
> +	 { .cap = 1024, .power = 1024, }, /* 1000 MHz */
> +	};

Talk to me about this core vs cluster thing.

Why would an architecture have multiple energy domains like this?

That is, if a cpu can set P states per core, why does it need a cluster
wide thing.

Also, in general, why would we need to walk the domain tree all the way
up, typically I would expect to stop walking once we've covered the two
cpu's we're interested in, because above that nothing changes.
Morten Rasmussen June 2, 2014, 2:15 p.m. UTC | #2
On Fri, May 30, 2014 at 01:04:24PM +0100, Peter Zijlstra wrote:
> On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > +static struct capacity_state cap_states_cluster_a7[] = {
> > +	/* Cluster only power */
> > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > +	};
> 
> > +static struct capacity_state cap_states_core_a7[] = {
> > +	/* Power per cpu */
> > +	 { .cap =  358, .power =  187, }, /*  350 MHz */
> > +	 { .cap =  410, .power =  275, }, /*  400 MHz */
> > +	 { .cap =  512, .power =  334, }, /*  500 MHz */
> > +	 { .cap =  614, .power =  407, }, /*  600 MHz */
> > +	 { .cap =  717, .power =  447, }, /*  700 MHz */
> > +	 { .cap =  819, .power =  549, }, /*  800 MHz */
> > +	 { .cap =  922, .power =  761, }, /*  900 MHz */
> > +	 { .cap = 1024, .power = 1024, }, /* 1000 MHz */
> > +	};
> 
> Talk to me about this core vs cluster thing.
> 
> Why would an architecture have multiple energy domains like this?
> 
> That is, if a cpu can set P states per core, why does it need a cluster
> wide thing.

The reason is that power domains are often organized in a hierarchy
where you may be able to power down just a cpu or the entire cluster
along with cluster wide shared resources. This is quite typical for ARM
systems. Frequency domains (P-states) typically cover the same hardware
as one of the power domain levels. That is, there might be several
smaller power domains sharing the same frequency (P-state) or there
might be a power domain spanning multiple frequency domains.

The main reason why we need to worry about all this is that it
typically costs a lot more energy to use the first cpu in a cluster,
since you also need to power up all the shared hardware resources, than
it costs to wake and use additional cpus in the same cluster.

IMHO, the most natural way to model the energy is therefore something
like:

    energy = energy_cluster + n * energy_cpu

Where 'n' is the number of cpus powered up and energy_cluster is the
cost paid as soon as any cpu in the cluster is powered up.

If we take TC2 as an example, we have per-cluster frequency domains
(P-states) and idle-states for both the individual cpus and the
clusters. WFI for individual cpus and cluster power down for the
cluster, which takes down the per-cluster L2 cache and other cluster
resources. When we wake the first cpu in a cluster, the cluster will
exit cluster power down and put all the other cpus into WFI. Powering on the
first cpu (A7) and fully utilizing it at 1000 MHz will cost:

    power_one = 4905 + 1024

Waking up an additional cpu and fully utilizing it we get:

    power_two = 4905 + 2*1024

So if we need two cpu's worth of compute capacity (at max capacity) we
can save quite a lot of energy by picking two in the same cluster rather
than paying the cluster power twice.

Now if one of the cpus is only 50% utilized, it will be in WFI half the
time:

    power = power_cluster + \sum_{n}^{cpus} [ util(n) * power_cpu +
                                              (1-util(n)) * idle_power_cpu ]

    power_100_50 = 4905 + (1.0*1024 + 0.0*0) + (0.5*1024 + 0.5*0)

I have normalized the utilization factor to 1.0 for simplicity. We also
need to factor in the cost of the wakeups on the 50% loaded cpu, but I
will leave that out here to keep it simpler.

Now consider a slightly different scenario where one cpu is 50%
utilized and the other is 25% utilized. We assume that the busy period
starts at the same time on both cpus (overlapped). In this case, we can
power down the whole cluster 50% of the time (assuming that the idle
period is long enough to allow it). We can expand power_cluster to
factor that in:

    power_cluster' = util(cluster) * power_cluster + 
				(1-util(cluster)) * idle_power_cluster

    power_50_25 = 0.5*4905 + 0.5*10 + (0.5*1024 + 0.0*0) +
    						(0.25*1024 + 0.75*0)
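
As a cross-check of the arithmetic in the two examples above, here is a small
stand-alone sketch of the same model: busy/idle-weighted cpu power plus
cluster power gated by the cluster utilization, using the TC2 A7 1000 MHz
numbers and the idle powers (0 for a cpu in WFI, 10 for the powered-down
cluster) used above. It only illustrates the model, it is not scheduler code.

#include <stdio.h>

/*
 * power = util(cluster) * power_cluster + (1-util(cluster)) * idle_power_cluster
 *         + sum over cpus of [ util(n) * power_cpu + (1-util(n)) * idle_power_cpu ]
 */
static double model_power(double util_cluster, const double *util_cpu, int ncpus)
{
	const double p_cluster = 4905.0, p_cluster_idle = 10.0;	/* A7 @ 1000 MHz */
	const double p_cpu = 1024.0, p_cpu_idle = 0.0;		/* WFI ~ 0 here */
	double p = util_cluster * p_cluster + (1.0 - util_cluster) * p_cluster_idle;
	int i;

	for (i = 0; i < ncpus; i++)
		p += util_cpu[i] * p_cpu + (1.0 - util_cpu[i]) * p_cpu_idle;
	return p;
}

int main(void)
{
	double u_100_50[] = { 1.0, 0.5 };
	double u_50_25[]  = { 0.5, 0.25 };

	/* 100% + 50%: the cluster never idles -> 6441 */
	printf("power_100_50 = %.1f\n", model_power(1.0, u_100_50, 2));
	/* 50% + 25%, busy periods overlapped: cluster idle half the time -> 3225.5 */
	printf("power_50_25  = %.1f\n", model_power(0.5, u_50_25, 2));
	return 0;
}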

> Also, in general, why would we need to walk the domain tree all the way
> up, typically I would expect to stop walking once we've covered the two
> cpu's we're interested in, because above that nothing changes.

True. In some cases we don't have to go all the way up. There is a
condition in energy_diff_load() that bails out if the energy doesn't
change further up the hierarchy. There might be scope for improving that
condition though.

We can basically stop going up if the utilization of the domain is
unchanged by the change we want to do. For example, we can ignore the
next level above if a third cpu is keeping the domain up all the time
anyway. In the 100% + 50% case above, putting another 50% task on the
50% cpu wouldn't affect the cluster according to the proposed model, so it
can be ignored. However, if we did the same on any of the two cpus in
the 50% + 25% example we affect the cluster utilization and have to do
the cluster level maths.

So we do sometimes have to go all the way up even if we are balancing
two sibling cpus to determine the energy implications. At least if we
want an energy score like energy_diff_load() produces. However, we might
be able to take some other shortcuts if we are balancing load between
two specific cpus (not wakeup/fork/exec balancing) as you point out. But
there are cases where we need to continue up until the domain
utilization is unchanged.
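
A rough sketch of that bail-out idea, with made-up types and helper names
(this is not the RFC's energy_diff_load(), just an illustration of "stop
walking up once the domain's utilization is unchanged by the move"):

/* Illustration only: all names here are invented for the example. */
struct energy_domain {
	struct energy_domain *parent;	/* next level up, NULL at the top */
	unsigned long busy_time;	/* time the domain is powered up */
	unsigned long busy_power;
	unsigned long idle_power;
};

static long domain_energy(struct energy_domain *d, unsigned long busy,
			  unsigned long period)
{
	return busy * d->busy_power + (period - busy) * d->idle_power;
}

static long energy_diff_walk(struct energy_domain *d, unsigned long extra_busy,
			     unsigned long period)
{
	long diff = 0;

	for (; d; d = d->parent) {		/* walk bottom-up */
		unsigned long before = d->busy_time;
		unsigned long after = before + extra_busy;

		if (after > period)		/* another cpu keeps it up anyway */
			after = period;

		diff += domain_energy(d, after, period) -
			domain_energy(d, before, period);

		/*
		 * Utilization of this domain is unchanged, so the levels
		 * above cannot change either: stop walking up.
		 */
		if (after == before)
			break;
	}
	return diff;
}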
Peter Zijlstra June 3, 2014, 11:41 a.m. UTC | #3
On Mon, Jun 02, 2014 at 03:15:36PM +0100, Morten Rasmussen wrote:
> > 
> > Talk to me about this core vs cluster thing.
> > 
> > Why would an architecture have multiple energy domains like this?

> The reason is that power domains are often organized in a hierarchy
> where you may be able to power down just a cpu or the entire cluster
> along with cluster wide shared resources. This is quite typical for ARM
> systems. Frequency domains (P-states) typically cover the same hardware
> as one of the power domain levels. That is, there might be several
> smaller power domains sharing the same frequency (P-state) or there
> might be a power domain spanning multiple frequency domains.
> 
> The main reason why we need to worry about all this is that it typically
> cost a lot more energy to use the first cpu in a cluster since you
> also need to power up all the shared hardware resources than the energy
> cost of waking and using additional cpus in the same cluster.
> 
> IMHO, the most natural way to model the energy is therefore something
> like:
> 
>     energy = energy_cluster + n * energy_cpu
> 
> Where 'n' is the number of cpus powered up and energy_cluster is the
> cost paid as soon as any cpu in the cluster is powered up.

OK, that makes sense, thanks! Maybe expand the doc/changelogs with this
because it wasn't immediately clear to me.

> > Also, in general, why would we need to walk the domain tree all the way
> > up, typically I would expect to stop walking once we've covered the two
> > cpu's we're interested in, because above that nothing changes.
> 
> True. In some cases we don't have to go all the way up. There is a
> condition in energy_diff_load() that bails out if the energy doesn't
> change further up the hierarchy. There might be scope for improving that
> condition though.
> 
> We can basically stop going up if the utilization of the domain is
> unchanged by the change we want to do. For example, we can ignore the
> next level above if a third cpu is keeping the domain up all the time
> anyway. In the 100% + 50% case above, putting another 50% task on the
> 50% cpu wouldn't affect the cluster according the proposed model, so it
> can be ignored. However, if we did the same on any of the two cpus in
> the 50% + 25% example we affect the cluster utilization and have to do
> the cluster level maths.
> 
> So we do sometimes have to go all the way up even if we are balancing
> two sibling cpus to determine the energy implications. At least if we
> want an energy score like energy_diff_load() produces. However, we might
> be able to take some other shortcuts if we are balancing load between
> two specific cpus (not wakeup/fork/exec balancing) as you point out. But
> there are cases where we need to continue up until the domain
> utilization is unchanged.

Right.. so my worry with this is scalability. We typically want to avoid
having to scan the entire machine, even for power aware balancing.

That said, I don't think we have a 'sane' model for really big hardware
(yet). Intel still hasn't really said anything much on that iirc, as
long as a single core is up, all the memory controllers in the numa
fabric need to be awake, not to mention the cost of keeping the dram
alive.
Peter Zijlstra June 3, 2014, 11:44 a.m. UTC | #4
On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> +static struct capacity_state cap_states_cluster_a7[] = {
> +	/* Cluster only power */
> +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> +	};
> +
> +static struct capacity_state cap_states_cluster_a15[] = {
> +	/* Cluster only power */
> +	 { .cap =  840, .power =  7920, }, /*  500 MHz */
> +	 { .cap = 1008, .power =  8165, }, /*  600 MHz */
> +	 { .cap = 1176, .power =  8172, }, /*  700 MHz */
> +	 { .cap = 1343, .power =  8195, }, /*  800 MHz */
> +	 { .cap = 1511, .power =  8265, }, /*  900 MHz */
> +	 { .cap = 1679, .power =  8446, }, /* 1000 MHz */
> +	 { .cap = 1847, .power = 11426, }, /* 1100 MHz */
> +	 { .cap = 2015, .power = 15200, }, /* 1200 MHz */
> +	};


So how did you obtain these numbers? Did you use numbers provided by the
hardware people, or did you run a particular benchmark and record the
power usage?

Does that benchmark do some actual work (as opposed to a while(1) loop)
to keep more silicon lit up?

If you have a setup for measuring these, should we try and publish that
too so that people can run it on their platform and provide these
numbers?
Peter Zijlstra June 3, 2014, 11:50 a.m. UTC | #5
On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> +static struct capacity_state cap_states_cluster_a7[] = {
> +	/* Cluster only power */
> +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> +	};

So one thing I remember was that we spoke about restricting this to
frequency levels where the voltage changed.

Because voltage jumps were the biggest factor in energy usage.

Any word on that?
Morten Rasmussen June 4, 2014, 1:49 p.m. UTC | #6
On Tue, Jun 03, 2014 at 12:41:45PM +0100, Peter Zijlstra wrote:
> On Mon, Jun 02, 2014 at 03:15:36PM +0100, Morten Rasmussen wrote:
> > > 
> > > Talk to me about this core vs cluster thing.
> > > 
> > > Why would an architecture have multiple energy domains like this?
> 
> > The reason is that power domains are often organized in a hierarchy
> > where you may be able to power down just a cpu or the entire cluster
> > along with cluster wide shared resources. This is quite typical for ARM
> > systems. Frequency domains (P-states) typically cover the same hardware
> > as one of the power domain levels. That is, there might be several
> > smaller power domains sharing the same frequency (P-state) or there
> > might be a power domain spanning multiple frequency domains.
> > 
> > The main reason why we need to worry about all this is that it typically
> > cost a lot more energy to use the first cpu in a cluster since you
> > also need to power up all the shared hardware resources than the energy
> > cost of waking and using additional cpus in the same cluster.
> > 
> > IMHO, the most natural way to model the energy is therefore something
> > like:
> > 
> >     energy = energy_cluster + n * energy_cpu
> > 
> > Where 'n' is the number of cpus powered up and energy_cluster is the
> > cost paid as soon as any cpu in the cluster is powered up.
> 
> OK, that makes sense, thanks! Maybe expand the doc/changelogs with this
> because it wasn't immediately clear to me.

I will add more documentation in the next round, it is indeed needed.

> 
> > > Also, in general, why would we need to walk the domain tree all the way
> > > up, typically I would expect to stop walking once we've covered the two
> > > cpu's we're interested in, because above that nothing changes.
> > 
> > True. In some cases we don't have to go all the way up. There is a
> > condition in energy_diff_load() that bails out if the energy doesn't
> > change further up the hierarchy. There might be scope for improving that
> > condition though.
> > 
> > We can basically stop going up if the utilization of the domain is
> > unchanged by the change we want to do. For example, we can ignore the
> > next level above if a third cpu is keeping the domain up all the time
> > anyway. In the 100% + 50% case above, putting another 50% task on the
> > 50% cpu wouldn't affect the cluster according the proposed model, so it
> > can be ignored. However, if we did the same on any of the two cpus in
> > the 50% + 25% example we affect the cluster utilization and have to do
> > the cluster level maths.
> > 
> > So we do sometimes have to go all the way up even if we are balancing
> > two sibling cpus to determine the energy implications. At least if we
> > want an energy score like energy_diff_load() produces. However, we might
> > be able to take some other shortcuts if we are balancing load between
> > two specific cpus (not wakeup/fork/exec balancing) as you point out. But
> > there are cases where we need to continue up until the domain
> > utilization is unchanged.
> 
> Right.. so my worry with this is scalability. We typically want to avoid
> having to scan the entire machine, even for power aware balancing.

I haven't looked at power management for really big machines, but I hope
that we can stop at socket level, or wherever utilization changes won't
affect the energy of the rest of the system. If we can power off groups
of sockets or something like that, we could scan at that level less
frequently (like we do now). The cost and latency of powering off
multiple sockets is probably high and not something we want to do often.

> That said, I don't think we have a 'sane' model for really big hardware
> (yet). Intel still hasn't really said anything much on that iirc, as
> long as a single core is up, all the memory controllers in the numa
> fabric need to be awake, not to mention to cost of keeping the dram
> alive.

Right. I'm hoping that we can roll that in once we know more about power
management on big hardware.
Morten Rasmussen June 4, 2014, 3:42 p.m. UTC | #7
On Tue, Jun 03, 2014 at 12:44:28PM +0100, Peter Zijlstra wrote:
> On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > +static struct capacity_state cap_states_cluster_a7[] = {
> > +	/* Cluster only power */
> > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > +	};
> > +
> > +static struct capacity_state cap_states_cluster_a15[] = {
> > +	/* Cluster only power */
> > +	 { .cap =  840, .power =  7920, }, /*  500 MHz */
> > +	 { .cap = 1008, .power =  8165, }, /*  600 MHz */
> > +	 { .cap = 1176, .power =  8172, }, /*  700 MHz */
> > +	 { .cap = 1343, .power =  8195, }, /*  800 MHz */
> > +	 { .cap = 1511, .power =  8265, }, /*  900 MHz */
> > +	 { .cap = 1679, .power =  8446, }, /* 1000 MHz */
> > +	 { .cap = 1847, .power = 11426, }, /* 1100 MHz */
> > +	 { .cap = 2015, .power = 15200, }, /* 1200 MHz */
> > +	};
> 
> 
> So how did you obtain these numbers? Did you use numbers provided by the
> hardware people, or did you run a particular benchmark and record the
> power usage?
>
> Does that benchmark do some actual work (as opposed to a while(1) loop)
> to keep more silicon lit up?

Hardware people don't like sharing data, so I did my own measurements
and calculations to get the numbers above.

ARM TC2 has on-chip energy counters for counting energy consumed by the
A7 and A15 clusters. They are fairly accurate. I used the sysbench cpu
benchmark as the test workload for the above numbers. sysbench might not
be a representative workload, but it is easy to use. I think, ideally,
vendors would run their own mix of workloads they care about and derive
their numbers for their platform based on that.

> If you have a setup for measuring these, should we try and publish that
> too so that people can run it on their platform and provide these
> numbers?

The workload setup I used is quite simple. I ran sysbench under taskset
with different numbers of threads to extrapolate the power consumed by
each individual cpu and how much comes from just powering on the domain.

Measuring the actual power is very platform specific. Developing a fully
automated tool to do it for any given platform isn't straightforward,
but I'm happy to share how I did it. I can add a description of the
method I used on TC2 to the documentation so others can use it as a
reference.
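
To spell out the extrapolation step with the linear model used earlier in the
thread (this is just the algebra, not a new method): with the cluster kept
busy at a fixed P-state and n cpus running the benchmark,

    P(n)      = P_cluster + n * P_cpu

so two measurement points are enough to separate the two terms:

    P_cpu     = P(2) - P(1)
    P_cluster = P(1) - P_cpu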
Morten Rasmussen June 4, 2014, 4:02 p.m. UTC | #8
On Tue, Jun 03, 2014 at 12:50:15PM +0100, Peter Zijlstra wrote:
> On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > +static struct capacity_state cap_states_cluster_a7[] = {
> > +	/* Cluster only power */
> > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > +	};
> 
> So one thing I remember was that we spoke about restricting this to
> frequency levels where the voltage changed.
> 
> Because voltage jumps were the biggest factor to energy usage.
> 
> Any word on that?

Since we don't drive P-state changes from the scheduler, I think we
could leave out P-states from the table without too much trouble. Good
point.

TC2 is an early development platform and somewhat different from what
you find in end user products. TC2 actually uses the same voltage for
all states except the highest 2-3 states. That is not typical. The
voltage is typically slightly different for each state, however, the
difference gets bigger for higher P-states. We could probably get away
with representing multiple states as one in the energy model if the
voltage change is minimal.
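
To illustrate what such a reduction could look like on TC2, where only the
top two or three states use a higher voltage, a collapsed A7 cluster table
might keep a single entry for the shared-voltage range plus the
distinct-voltage states. The rows are taken from the full table above; this
is only a sketch of the idea, not a proposed change:

static struct capacity_state cap_states_cluster_a7_reduced[] = {
	{ .cap =  819, .power = 2847, },	/* 350-800 MHz, shared voltage */
	{ .cap =  922, .power = 3917, },	/*  900 MHz */
	{ .cap = 1024, .power = 4905, },	/* 1000 MHz */
};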
Peter Zijlstra June 4, 2014, 4:16 p.m. UTC | #9
On Wed, Jun 04, 2014 at 04:42:27PM +0100, Morten Rasmussen wrote:
> On Tue, Jun 03, 2014 at 12:44:28PM +0100, Peter Zijlstra wrote:
> > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > +	/* Cluster only power */
> > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > +	};
> > > +
> > > +static struct capacity_state cap_states_cluster_a15[] = {
> > > +	/* Cluster only power */
> > > +	 { .cap =  840, .power =  7920, }, /*  500 MHz */
> > > +	 { .cap = 1008, .power =  8165, }, /*  600 MHz */
> > > +	 { .cap = 1176, .power =  8172, }, /*  700 MHz */
> > > +	 { .cap = 1343, .power =  8195, }, /*  800 MHz */
> > > +	 { .cap = 1511, .power =  8265, }, /*  900 MHz */
> > > +	 { .cap = 1679, .power =  8446, }, /* 1000 MHz */
> > > +	 { .cap = 1847, .power = 11426, }, /* 1100 MHz */
> > > +	 { .cap = 2015, .power = 15200, }, /* 1200 MHz */
> > > +	};
> > 
> > 
> > So how did you obtain these numbers? Did you use numbers provided by the
> > hardware people, or did you run a particular benchmark and record the
> > power usage?
> >
> > Does that benchmark do some actual work (as opposed to a while(1) loop)
> > to keep more silicon lit up?
> 
> Hardware people don't like sharing data, so I did my own measurements
> and calculations to get the numbers above.
> 
> ARM TC2 has on-chip energy counters for counting energy consumed by the
> A7 and A15 clusters. They are fairly accurate. 

Recent Intel chips have that too; they come packaged as:

  perf stat -a -e "power/energy-cores/" -- cmd

(through the perf_event_intel_rapl.c driver), It would be ideal if the
ARM equivalent was available through a similar interface.

http://lwn.net/Articles/573602/

> I used sysbench cpu
> benchmark as test workload for the above numbers. sysbench might not be
> a representative workload, but it is easy to use. I think, ideally,
> vendors would run their own mix of workloads they care about and derrive
> their numbers for their platform based on that.
> 
> > If you have a setup for measuring these, should we try and publish that
> > too so that people can run it on their platform and provide these
> > numbers?
> 
> The workload setup I used quite simple. I ran sysbench with taskset with
> different numbers of threads to extrapolate power consumed by each
> individual cpu and how much comes from just powering on the domain.
> 
> Measuring the actual power is very platform specific. Developing a fully
> automated tool do it for any given platform isn't straigt forward, but
> I'm happy to share how I did it. I can add a description of the method I
> used on TC2 to the documentation so others can use it as reference.

That would be good I think, esp. if we can get similar perf based energy
measurement things sorted. And if we make the tool consume the machine
topology present in sysfs we can get a long way towards automating this
I think.
Peter Zijlstra June 4, 2014, 5:27 p.m. UTC | #10
On Wed, Jun 04, 2014 at 05:02:30PM +0100, Morten Rasmussen wrote:
> On Tue, Jun 03, 2014 at 12:50:15PM +0100, Peter Zijlstra wrote:
> > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > +	/* Cluster only power */
> > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > +	};
> > 
> > So one thing I remember was that we spoke about restricting this to
> > frequency levels where the voltage changed.
> > 
> > Because voltage jumps were the biggest factor to energy usage.
> > 
> > Any word on that?
> 
> Since we don't drive P-state changes from the scheduler, I think we
> could leave out P-states from the table without too much trouble. Good
> point.

Well, we eventually want to go there I think. Although we still needed
to come up with something for Intel, because I'm not at all sure how all
that works.

> TC2 is an early development platform and somewhat different from what
> you find in end user products. TC2 actually uses the same voltage for
> all states except the highest 2-3 states. That is not typical. The
> voltage is typically slightly different for each state, however, the
> difference get bigger for higher P-states. We could probably get away
> with representing multiple states as one in the energy model if the
> voltage change is minimal.

So while I don't mind the full table, esp. if it's fairly easy to
generate using that tool you spoke about, I just wondered if it made
sense to somewhat reduce it.

Now that I look at the actual .power values, you can indeed see that all
except the last two are pretty much similar in power usage.

On that, is that fluctuation measurement noise, or is that stable?
Rafael J. Wysocki June 4, 2014, 9:56 p.m. UTC | #11
On Wednesday, June 04, 2014 07:27:12 PM Peter Zijlstra wrote:
> On Wed, Jun 04, 2014 at 05:02:30PM +0100, Morten Rasmussen wrote:
> > On Tue, Jun 03, 2014 at 12:50:15PM +0100, Peter Zijlstra wrote:
> > > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > > +	/* Cluster only power */
> > > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > > +	};
> > > 
> > > So one thing I remember was that we spoke about restricting this to
> > > frequency levels where the voltage changed.
> > > 
> > > Because voltage jumps were the biggest factor to energy usage.
> > > 
> > > Any word on that?
> > 
> > Since we don't drive P-state changes from the scheduler, I think we
> > could leave out P-states from the table without too much trouble. Good
> > point.
> 
> Well, we eventually want to go there I think. Although we still needed
> to come up with something for Intel, because I'm not at all sure how all
> that works.

Do you mean power numbers or how P-states work on Intel in general?

Rafael

Peter Zijlstra June 5, 2014, 6:52 a.m. UTC | #12
On Wed, Jun 04, 2014 at 11:56:55PM +0200, Rafael J. Wysocki wrote:
> On Wednesday, June 04, 2014 07:27:12 PM Peter Zijlstra wrote:

> > Well, we eventually want to go there I think. Although we still needed
> > to come up with something for Intel, because I'm not at all sure how all
> > that works.
> 
> Do you mean power numbers or how P-states work on Intel in general?

P-states, I'm still not at all sure how all that works on Intel and what
we can sanely do with them.

Supposedly Intel has a means of setting P-states (there's a driver after
all), but then is completely free to totally ignore it and do something
entirely different anyhow.

And while APERF/MPERF allows observing what it did, its afaik, nigh on
impossible to predict wtf its going to do, and therefore any such energy
computation is going to be a PRNG at best.

Now, given all that I'm not sure what we need that P-state driver for,
so supposedly I'm missing something.

Ideally Len (or someone equally in-the-know) would explain to me how
exactly all that works and what we can rely upon. All I've gotten so far
is, you can't rely on anything, and magik. Which is entirely useless.
Dirk Brandewie June 5, 2014, 3:03 p.m. UTC | #13
On 06/04/2014 11:52 PM, Peter Zijlstra wrote:
> On Wed, Jun 04, 2014 at 11:56:55PM +0200, Rafael J. Wysocki wrote:
>> On Wednesday, June 04, 2014 07:27:12 PM Peter Zijlstra wrote:
>
>>> Well, we eventually want to go there I think. Although we still needed
>>> to come up with something for Intel, because I'm not at all sure how all
>>> that works.
>>
>> Do you mean power numbers or how P-states work on Intel in general?
>
> P-states, I'm still not at all sure how all that works on Intel and what
> we can sanely do with them.
>
> Supposedly Intel has a means of setting P-states (there's a driver after
> all), but then is completely free to totally ignore it and do something
> entirely different anyhow.

You can request a P state per core but the package does coordination at
a package level for the P state that will be used based on all requests.
This is due to the fact that most SKUs have a single VR and PLL. So
the highest P state wins.  When a core goes idle it loses its vote
for the current package P state and that core's clock is turned off.

>
> And while APERF/MPERF allows observing what it did, its afaik, nigh on
> impossible to predict wtf its going to do, and therefore any such energy
> computation is going to be a PRNG at best.
>
> Now, given all that I'm not sure what we need that P-state driver for,
> so supposedly I'm missing something.

intel_pstate tries to keep the core P state as low as possible to satisfy
the given load, so when various cores go idle the package P state can be
as low as possible.  The big power win is a core going idle.

>
> Ideally Len (or someone equally in-the-know) would explain to me how
> exactly all that works and what we can rely upon. All I've gotten so far
> is, you can't rely on anything, and magik. Which is entirely useless.
>
The only thing you can rely on is that you will get "at least" the P state
requested in the presence of hardware coordination.
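
A toy restatement of the coordination rule described above (highest request
among non-idle cores wins, idle cores lose their vote); this is purely
illustrative, not how intel_pstate or the hardware coordination is
implemented:

/* Toy model of package-level P-state coordination (illustration only). */
struct core_req {
	int idle;	/* the core is idle: its request is ignored */
	int pstate;	/* requested P-state, higher means faster */
};

static int package_pstate(const struct core_req *req, int ncores)
{
	int i, pkg = 0;		/* 0 = lowest P-state */

	for (i = 0; i < ncores; i++) {
		if (req[i].idle)
			continue;		/* idle cores lose their vote */
		if (req[i].pstate > pkg)
			pkg = req[i].pstate;	/* highest request wins */
	}
	return pkg;	/* every running core gets at least what it asked for */
}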
Yuyang Du June 5, 2014, 8:29 p.m. UTC | #14
On Thu, Jun 05, 2014 at 08:03:15AM -0700, Dirk Brandewie wrote:
> 
> You can request a P state per core but the package does coordination at
> a package level for the P state that will be used based on all requests.
> This is due to the fact that most SKUs have a single VR and PLL. So
> the highest P state wins.  When a core goes idle it loses it's vote
> for the current package P state and that cores clock it turned off.
> 

You need to differentiate Turbo and non-Turbo. The highest P state wins? Not
really. Actually, the silicon supports independent non-Turbo P-states, it is
just not enabled. For Turbo, it basically depends on the power budget of both
core and gfx (because they share it) which Turbo point each core gets.

> >
> >And while APERF/MPERF allows observing what it did, its afaik, nigh on
> >impossible to predict wtf its going to do, and therefore any such energy
> >computation is going to be a PRNG at best.
> >
> >Now, given all that I'm not sure what we need that P-state driver for,
> >so supposedly I'm missing something.
> 
> intel_pstate tries to keep the core P state as low as possible to satisfy
> the given load, so when various cores go idle the package P state can be
> as low as possible.  The big power win is a core going idle.
> 

In terms of prediction, it definitely can't be 100% right. But the
performance of most workloads does scale with P-state (frequency), maybe
not linearly. So it is predictable to some extent, FWIW. And this is the
basic assumption of all governors and intel_pstate.

Thanks,
Yuyang
Yuyang Du June 6, 2014, 12:35 a.m. UTC | #15
On Fri, Jun 06, 2014 at 10:05:43AM +0200, Peter Zijlstra wrote:
> On Fri, Jun 06, 2014 at 04:29:30AM +0800, Yuyang Du wrote:
> > On Thu, Jun 05, 2014 at 08:03:15AM -0700, Dirk Brandewie wrote:
> > > 
> > > You can request a P state per core but the package does coordination at
> > > a package level for the P state that will be used based on all requests.
> > > This is due to the fact that most SKUs have a single VR and PLL. So
> > > the highest P state wins.  When a core goes idle it loses it's vote
> > > for the current package P state and that cores clock it turned off.
> > > 
> > 
> > You need to differentiate Turbo and non-Turbo. The highest P state wins? Not
> > really.
> 
> *sigh* and here we go again.. someone please, write something coherent
> and have all intel people sign off on it and stop saying different
> things.
> 
> > Actually, silicon supports indepdent non-Turbo pstate, but just not enabled.
> 
> Then it doesn't exist, so no point in mentioning it.
> 

Well, things actually get more complicated. Not-enabled is for Core. For Atom
Baytrail, each core can indeed operate at a different frequency. I am not sure
about Xeon, :)

> > For Turbo, it basically depends on power budget of both core and gfx (because
> > they share) for each core to get which Turbo point.
> 
> And RAPL controls can give preference of which gfx/core gets most,
> right?
>

Maybe Jacob knows that.

> > > intel_pstate tries to keep the core P state as low as possible to satisfy
> > > the given load, so when various cores go idle the package P state can be
> > > as low as possible.  The big power win is a core going idle.
> > > 
> > 
> > In terms of prediction, it is definitely can't be 100% right. But the
> > performance of most workloads does scale with pstate (frequency), may not be
> > linearly. So it is to some point predictable FWIW. And this is all governors
> > and Intel_pstate's basic assumption.
> 
> So frequency isn't _that_ interesting, voltage is. And while
> predictability it might be their assumption, is it actually true? I
> mean, there's really nothing else except to assume that, if its not you
> can't do anything at all, so you _have_ to assume this.
> 
> But again, is the assumption true? Or just happy thoughts in an attempt
> to do something.

Voltage is combined with frequency: roughly, voltage is proportional to
frequency, so roughly, power is proportional to voltage^3. You can't say which
is more important, and there is no reason to raise voltage without raising
frequency.

If I have to answer in one word, true or false: it is true. Because given any
fixed workload, I can't see why performance would be worse if frequency is
higher.

The reality, as opposed to the assumption, is two-fold:
1) if the workload is CPU bound, performance scales with frequency absolutely.
   If the workload is memory bound, it does not scale. But from the kernel we
   don't know whether it is CPU bound or not (or it is hard to know). uArch
   statistics can model that.
2) the workload is not fixed in real time, it is changing all the time.

But still, the assumption is a must, and does no harm, because we adjust
frequency continuously: for example, if the workload is fixed and the
performance does not scale with frequency, we stop increasing the frequency.
So a good frequency governor or driver should and can continuously pursue a
"good" frequency for the changing workload. Therefore, in the long term, we
will be better off.

Peter Zijlstra June 6, 2014, 8:05 a.m. UTC | #16
On Fri, Jun 06, 2014 at 04:29:30AM +0800, Yuyang Du wrote:
> On Thu, Jun 05, 2014 at 08:03:15AM -0700, Dirk Brandewie wrote:
> > 
> > You can request a P state per core but the package does coordination at
> > a package level for the P state that will be used based on all requests.
> > This is due to the fact that most SKUs have a single VR and PLL. So
> > the highest P state wins.  When a core goes idle it loses it's vote
> > for the current package P state and that cores clock it turned off.
> > 
> 
> You need to differentiate Turbo and non-Turbo. The highest P state wins? Not
> really.

*sigh* and here we go again.. someone please, write something coherent
and have all intel people sign off on it and stop saying different
things.

> Actually, silicon supports indepdent non-Turbo pstate, but just not enabled.

Then it doesn't exist, so no point in mentioning it.

> For Turbo, it basically depends on power budget of both core and gfx (because
> they share) for each core to get which Turbo point.

And RAPL controls can give preference of which gfx/core gets most,
right?

> > intel_pstate tries to keep the core P state as low as possible to satisfy
> > the given load, so when various cores go idle the package P state can be
> > as low as possible.  The big power win is a core going idle.
> > 
> 
> In terms of prediction, it is definitely can't be 100% right. But the
> performance of most workloads does scale with pstate (frequency), may not be
> linearly. So it is to some point predictable FWIW. And this is all governors
> and Intel_pstate's basic assumption.

So frequency isn't _that_ interesting, voltage is. And while
predictability might be their assumption, is it actually true? I
mean, there's really nothing else except to assume that; if it's not
you can't do anything at all, so you _have_ to assume this.

But again, is the assumption true? Or just happy thoughts in an attempt
to do something.
Peter Zijlstra June 6, 2014, 10:50 a.m. UTC | #17
On Fri, Jun 06, 2014 at 08:35:21AM +0800, Yuyang Du wrote:

> > > Actually, silicon supports indepdent non-Turbo pstate, but just not enabled.
> > 
> > Then it doesn't exist, so no point in mentioning it.
> > 
> 
> Well, things actually get more complicated. Not-enabled is for Core. For Atom
> Baytrail, each core indeed can operate on difference frequency. I am not sure for
> Xeon, :)

Yes, I understand Atom is an entirely different thing.

> > So frequency isn't _that_ interesting, voltage is. And while
> > predictability it might be their assumption, is it actually true? I
> > mean, there's really nothing else except to assume that, if its not you
> > can't do anything at all, so you _have_ to assume this.
> > 
> > But again, is the assumption true? Or just happy thoughts in an attempt
> > to do something.
> 
> Voltage is combined with frequency, roughly, voltage is proportional
> to freuquecy, so roughly, power is proportionaly to voltage^3. You

P ~ V^2, last time I checked.

> can't say which is more important, or there is no reason to raise
> voltage without raising frequency.

Well, some chips have far fewer voltage steps than freq steps; or,
differently put, they have multiple freq steps for a single voltage
level.

And since the power (Watts) is proportional to voltage squared, it's the
biggest term.

If you have a distinct voltage level for each freq, it all doesn't
matter.

> If only one word to say: true of false, it is true. Because given any
> fixed workload, I can't see why performance would be worse if
> frequency is higher.

Well, our work here is to redefine performance as performance/watt. So
running at higher frequency (and thus likely higher voltage) is a
definite performance decrease in that sense.

> The reality as opposed to the assumption is in two-fold:
> 1) if workload is CPU bound, performance scales with frequency absolutely. if workload is
>    memory bound, it does not scale. But from kernel, we don't know whether it is CPU bound
>    or not (or it is hard to know). uArch statistics can model that.

Well, we could know for a number of archs, it's just that these
statistics are expensive to track.

Also, lowering P-state is 'fine', as long as you can 'guarantee' you
don't lose IPC performance, since running at lower voltage for the same
IPC is actually better IPC/watt than estimated.

But what was said earlier is that P-state is a lower limit, not a higher
limit. In that case the core can run at higher voltage and the estimate
is just plain wrong.

> But still, the assumption is a must or no guilty, because we adjust
> frequency continuously, for example, if the workload is fixed, and if
> the performance does not scale with freq we stop increasing frequency.
> So a good frequency governor or driver should and can continuously
> pursue "good" frequency with the changing workload. Therefore, in the
> long term, we will be better off.

Sure, but realize that we must fully understand this governor and
integrate it in the scheduler if we're to attain the goal of IPC/watt
optimized scheduling behaviour.

So you (or rather Intel in general) will have to be very explicit on how
their stuff works and can no longer hide in some driver and do magic.
The same is true for all other vendors for that matter.

If you (vendors, not Yuyang in specific) do not want to play (and be
explicit and expose how your hardware functions) then you simply will
not get power efficient scheduling full stop.

There's no rocks to hide under, no magic veils to hide behind. You tell
_in_public_ or you get nothing.
Ingo Molnar June 6, 2014, 12:13 p.m. UTC | #18
* Peter Zijlstra <peterz@infradead.org> wrote:

> > Voltage is combined with frequency, roughly, voltage is 
> > proportional to freuquecy, so roughly, power is proportionaly to 
> > voltage^3. You
> 
> P ~ V^2, last time I checked.

Yes, that's a good approximation for CMOS gates:

  The switching power dissipated by a chip using static CMOS gates is 
  C·V^2·f, where C is the capacitance being switched per clock cycle, 
  V is the supply voltage, and f is the switching frequency,[1] so 
  this part of the power consumption decreases quadratically with 
  voltage. The formula is not exact however, as many modern chips are 
  not implemented using 100% CMOS, but also use special memory 
  circuits, dynamic logic such as domino logic, etc. Moreover, there 
  is also a static leakage current, which has become more and more 
  accentuated as feature sizes have become smaller (below 90 
  nanometres) and threshold levels lower.

  Accordingly, dynamic voltage scaling is widely used as part of 
  strategies to manage switching power consumption in battery powered 
  devices such as cell phones and laptop computers. Low voltage modes 
  are used in conjunction with lowered clock frequencies to minimize 
  power consumption associated with components such as CPUs and DSPs; 
  only when significant computational power is needed will the voltage 
  and frequency be raised.

  Some peripherals also support low voltage operational modes. For 
  example, low power MMC and SD cards can run at 1.8 V as well as at 
  3.3 V, and driver stacks may conserve power by switching to the 
  lower voltage after detecting a card which supports it.

  When leakage current is a significant factor in terms of power 
  consumption, chips are often designed so that portions of them can 
  be powered completely off. This is not usually viewed as being 
  dynamic voltage scaling, because it is not transparent to software. 
  When sections of chips can be turned off, as for example on TI OMAP3 
  processors, drivers and other support software need to support that.

  http://en.wikipedia.org/wiki/Dynamic_voltage_scaling

Leakage current typically gets higher with higher frequencies, but 
it's also highly process dependent AFAIK.

If switching power dissipation is the main factor in power use, then 
we can essentially assume that P ~ V^2, at the same frequency - and 
scales linearly with frequency - but real work performed also scales 
semi-linearly with frequency for many workloads, so that's an 
invariant for everything except highly memory bound workloads.

Thanks,

	Ingo
Ingo Molnar June 6, 2014, 12:27 p.m. UTC | #19
* Ingo Molnar <mingo@kernel.org> wrote:

> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > Voltage is combined with frequency, roughly, voltage is 
> > > proportional to freuquecy, so roughly, power is proportionaly to 
> > > voltage^3. You
> > 
> > P ~ V^2, last time I checked.
> 
> Yes, that's a good approximation for CMOS gates:
> 
>   The switching power dissipated by a chip using static CMOS gates is 
>   C·V^2·f, where C is the capacitance being switched per clock cycle, 
>   V is the supply voltage, and f is the switching frequency,[1] so 
>   this part of the power consumption decreases quadratically with 
>   voltage. The formula is not exact however, as many modern chips are 
>   not implemented using 100% CMOS, but also use special memory 
>   circuits, dynamic logic such as domino logic, etc. Moreover, there 
>   is also a static leakage current, which has become more and more 
>   accentuated as feature sizes have become smaller (below 90 
>   nanometres) and threshold levels lower.
> 
>   Accordingly, dynamic voltage scaling is widely used as part of 
>   strategies to manage switching power consumption in battery powered 
>   devices such as cell phones and laptop computers. Low voltage modes 
>   are used in conjunction with lowered clock frequencies to minimize 
>   power consumption associated with components such as CPUs and DSPs; 
>   only when significant computational power is needed will the voltage 
>   and frequency be raised.
> 
>   Some peripherals also support low voltage operational modes. For 
>   example, low power MMC and SD cards can run at 1.8 V as well as at 
>   3.3 V, and driver stacks may conserve power by switching to the 
>   lower voltage after detecting a card which supports it.
> 
>   When leakage current is a significant factor in terms of power 
>   consumption, chips are often designed so that portions of them can 
>   be powered completely off. This is not usually viewed as being 
>   dynamic voltage scaling, because it is not transparent to software. 
>   When sections of chips can be turned off, as for example on TI OMAP3 
>   processors, drivers and other support software need to support that.
> 
>   http://en.wikipedia.org/wiki/Dynamic_voltage_scaling
> 
> Leakage current typically gets higher with higher frequencies, but 
> it's also highly process dependent AFAIK.
> 
> If switching power dissipation is the main factor in power use, then 
> we can essentially assume that P ~ V^2, at the same frequency - and 
> scales linearly with frequency - but real work performed also scales 
> semi-linearly with frequency for many workloads, so that's an 
> invariant for everything except highly memory bound workloads.

So in practice this means that Turbo probably has a somewhat
super-linear power use factor.

At lower frequencies the leakage current difference is probably 
negligible.

In any case, even with turbo frequencies, switching power use is 
probably an order of magnitude higher than leakage current power use, 
on any marketable chip, so we should concentrate on being able to 
cover this first order effect (P/work ~ V^2), before considering any 
second order effects (leakage current).

Thanks,

	Ingo
Morten Rasmussen June 6, 2014, 1:03 p.m. UTC | #20
On Wed, Jun 04, 2014 at 06:27:12PM +0100, Peter Zijlstra wrote:
> On Wed, Jun 04, 2014 at 05:02:30PM +0100, Morten Rasmussen wrote:
> > On Tue, Jun 03, 2014 at 12:50:15PM +0100, Peter Zijlstra wrote:
> > > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > > +	/* Cluster only power */
> > > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > > +	};

[...]

> > TC2 is an early development platform and somewhat different from what
> > you find in end user products. TC2 actually uses the same voltage for
> > all states except the highest 2-3 states. That is not typical. The
> > voltage is typically slightly different for each state, however, the
> > difference get bigger for higher P-states. We could probably get away
> > with representing multiple states as one in the energy model if the
> > voltage change is minimal.
> 
> So while I don't mind the full table, esp. if its fairly easy to
> generate using that tool you spoke about, I just wondered if it made
> sense to somewhat reduce it.
> 
> Now that I look at the actual .power values, you can indeed see that all
> except the last two are pretty much similar in power usage.
> 
> On that, is that fluctuation measurement noise, or is that stable?

It would make sense to reduce it for this particular platform. In fact
it is questionable whether we should use frequencies below 800 MHz at
all. On TC2 the voltage is the same for 800 MHz and below and it seems that
leakage (static) power is dominating the power consumption. Since the
power is almost constant in the range 350 to 800 MHz energy-efficiency
(performance/watt ~ cap/power) is actually getting *better* as we run
faster until we get to 800 MHz. Beyond 800 MHz energy-efficiency goes
down due to increased voltages.

The proposed platform energy model is an extremely simplified view of
the platform. The numbers are pretty much the raw data normalized and
averaged as appropriate. I haven't tweaked them in any way to make them
look more perfect. So, the small variations (within 4%) may be
measurement noise, plus the fact that I am modelling something complex
with a simple model.
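
To make that trend concrete, here is cap/power for a single busy A7 cpu
(cluster power plus one core's power), computed from the tables quoted above:

     350 MHz:   358 / (2967 +  187) = 0.11
     800 MHz:   819 / (2847 +  549) = 0.24
    1000 MHz:  1024 / (4905 + 1024) = 0.17

Efficiency improves up to 800 MHz and drops once the voltage starts rising.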
Morten Rasmussen June 6, 2014, 1:15 p.m. UTC | #21
On Wed, Jun 04, 2014 at 05:16:18PM +0100, Peter Zijlstra wrote:
> On Wed, Jun 04, 2014 at 04:42:27PM +0100, Morten Rasmussen wrote:
> > On Tue, Jun 03, 2014 at 12:44:28PM +0100, Peter Zijlstra wrote:
> > > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > > +	/* Cluster only power */
> > > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > > +	};
> > > > +
> > > > +static struct capacity_state cap_states_cluster_a15[] = {
> > > > +	/* Cluster only power */
> > > > +	 { .cap =  840, .power =  7920, }, /*  500 MHz */
> > > > +	 { .cap = 1008, .power =  8165, }, /*  600 MHz */
> > > > +	 { .cap = 1176, .power =  8172, }, /*  700 MHz */
> > > > +	 { .cap = 1343, .power =  8195, }, /*  800 MHz */
> > > > +	 { .cap = 1511, .power =  8265, }, /*  900 MHz */
> > > > +	 { .cap = 1679, .power =  8446, }, /* 1000 MHz */
> > > > +	 { .cap = 1847, .power = 11426, }, /* 1100 MHz */
> > > > +	 { .cap = 2015, .power = 15200, }, /* 1200 MHz */
> > > > +	};
> > > 
> > > 
> > > So how did you obtain these numbers? Did you use numbers provided by the
> > > hardware people, or did you run a particular benchmark and record the
> > > power usage?
> > >
> > > Does that benchmark do some actual work (as opposed to a while(1) loop)
> > > to keep more silicon lit up?
> > 
> > Hardware people don't like sharing data, so I did my own measurements
> > and calculations to get the numbers above.
> > 
> > ARM TC2 has on-chip energy counters for counting energy consumed by the
> > A7 and A15 clusters. They are fairly accurate. 
> 
> Recent Intel chips have that too; they come packaged as:
> 
>   perf stat -a -e "power/energy-cores/" -- cmd
> 
> (through the perf_event_intel_rapl.c driver), It would be ideal if the
> ARM equivalent was available through a similar interface.
> 
> http://lwn.net/Articles/573602/

Nice. On ARM it is not mandatory to have energy counters, and what they
actually measure, if they are implemented, is implementation dependent.
However, each vendor does extensive evaluation and characterization of
their implementation already, so I don't think it would be a problem for
them to provide the numbers.

> > I used sysbench cpu
> > benchmark as test workload for the above numbers. sysbench might not be
> > a representative workload, but it is easy to use. I think, ideally,
> > vendors would run their own mix of workloads they care about and derrive
> > their numbers for their platform based on that.
> > 
> > > If you have a setup for measuring these, should we try and publish that
> > > too so that people can run it on their platform and provide these
> > > numbers?
> > 
> > The workload setup I used quite simple. I ran sysbench with taskset with
> > different numbers of threads to extrapolate power consumed by each
> > individual cpu and how much comes from just powering on the domain.
> > 
> > Measuring the actual power is very platform specific. Developing a fully
> > automated tool do it for any given platform isn't straigt forward, but
> > I'm happy to share how I did it. I can add a description of the method I
> > used on TC2 to the documentation so others can use it as reference.
> 
> That would be good I think, esp. if we can get similar perf based energy
> measurement things sorted. And if we make the tool consume the machine
> topology present in sysfs we can get a long way towards automating this
> I think.

Some of the measurements could be automated. Others are hard to
automate as they require extensive knowledge about the platform. Wakeup
energy, for example: you may need to do various tricks and hacks to
force the platform to use a specific idle-state so you know what you are
measuring.

I will add the TC2 recipe as a start and then see if my ugly scripts can
be turned into something generally useful.
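
As a teaser, the per-cpu vs. cluster split basically falls out of a
linear fit of measured power against the number of busy cpus. A minimal
sketch with made-up example numbers (not TC2 data):

	#include <stdio.h>

	int main(void)
	{
		/* made-up example: power readings with sysbench pinned to
		 * 1..4 cpus at a fixed P-state */
		double p[] = { 610.0, 790.0, 970.0, 1150.0 };
		int n = 4, i;
		double sx = 0, sy = 0, sxy = 0, sxx = 0, slope, intercept;

		for (i = 0; i < n; i++) {
			double x = i + 1;

			sx += x; sy += p[i];
			sxy += x * p[i]; sxx += x * x;
		}
		/* least-squares fit: power(nr_busy) = intercept + slope * nr_busy */
		slope = (n * sxy - sx * sy) / (n * sxx - sx * sx);
		intercept = (sy - slope * sx) / n;

		printf("per-cpu power ~%.0f, cluster-only power ~%.0f\n",
		       slope, intercept);
		return 0;
	}

With real measurements the fit of course won't be perfect, which is
where the normalization and averaging mentioned earlier comes in.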
Peter Zijlstra June 6, 2014, 1:43 p.m. UTC | #22
On Fri, Jun 06, 2014 at 02:15:10PM +0100, Morten Rasmussen wrote:
> > > ARM TC2 has on-chip energy counters for counting energy consumed by the
> > > A7 and A15 clusters. They are fairly accurate. 
> > 
> > Recent Intel chips have that too; they come packaged as:
> > 
> >   perf stat -a -e "power/energy-cores/" -- cmd
> > 
> > (through the perf_event_intel_rapl.c driver), It would be ideal if the
> > ARM equivalent was available through a similar interface.
> > 
> > http://lwn.net/Articles/573602/
> 
> Nice. On ARM it is not mandatory to have energy counters and what they
> actually measure if they are implemented is implementation dependent.
> However, each vendor does extensive evaluation and characterization of
> their implementation already, so I don't think would be a problem for
> them to provide the numbers.

How is the ARM energy thing exposed? Through the regular PMU but with
vendor specific events, or through a separate interface, entirely vendor
specific?

In any case, would it be at all possible to nudge them to provide a
'driver' for this so that they can be more easily used?

> Some of the measurements could be automated. Others are hard to
> automate as they require extensive knowledge about the platform. wakeup
> energy, for example. You may need to do various tricks and hacks to
> force the platform to use a specific idle-state so you know what you are
> measuring.
> 
> I will add the TC2 recipe as a start and then see if my ugly scripts can
> be turned into something generally useful.

Fair enough; I would prefer to have a situation where 'we' can validate
whatever magic numbers the vendors provide for their hardware, or can
generate numbers for hardware where the vendor is not interested.

But yes, publishing your hacks is a good first step at getting such a
thing going; if we then further require everybody to use this 'tool' and
improve it where it is not suitable, we might end up with something
useful ;-)
Morten Rasmussen June 6, 2014, 2:11 p.m. UTC | #23
On Fri, Jun 06, 2014 at 01:27:40PM +0100, Ingo Molnar wrote:
> 
> * Ingo Molnar <mingo@kernel.org> wrote:
> 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > > Voltage is combined with frequency, roughly, voltage is 
> > > > proportional to freuquecy, so roughly, power is proportionaly to 
> > > > voltage^3. You
> > > 
> > > P ~ V^2, last time I checked.
> > 
> > Yes, that's a good approximation for CMOS gates:
> > 
> >   The switching power dissipated by a chip using static CMOS gates is 
> >   C·V^2·f, where C is the capacitance being switched per clock cycle, 
> >   V is the supply voltage, and f is the switching frequency,[1] so 
> >   this part of the power consumption decreases quadratically with 
> >   voltage. The formula is not exact however, as many modern chips are 
> >   not implemented using 100% CMOS, but also use special memory 
> >   circuits, dynamic logic such as domino logic, etc. Moreover, there 
> >   is also a static leakage current, which has become more and more 
> >   accentuated as feature sizes have become smaller (below 90 
> >   nanometres) and threshold levels lower.
> > 
> >   Accordingly, dynamic voltage scaling is widely used as part of 
> >   strategies to manage switching power consumption in battery powered 
> >   devices such as cell phones and laptop computers. Low voltage modes 
> >   are used in conjunction with lowered clock frequencies to minimize 
> >   power consumption associated with components such as CPUs and DSPs; 
> >   only when significant computational power is needed will the voltage 
> >   and frequency be raised.
> > 
> >   Some peripherals also support low voltage operational modes. For 
> >   example, low power MMC and SD cards can run at 1.8 V as well as at 
> >   3.3 V, and driver stacks may conserve power by switching to the 
> >   lower voltage after detecting a card which supports it.
> > 
> >   When leakage current is a significant factor in terms of power 
> >   consumption, chips are often designed so that portions of them can 
> >   be powered completely off. This is not usually viewed as being 
> >   dynamic voltage scaling, because it is not transparent to software. 
> >   When sections of chips can be turned off, as for example on TI OMAP3 
> >   processors, drivers and other support software need to support that.
> > 
> >   http://en.wikipedia.org/wiki/Dynamic_voltage_scaling
> > 
> > Leakage current typically gets higher with higher frequencies, but 
> > it's also highly process dependent AFAIK.

Strictly speaking, leakage current gets higher with voltage, not
frequency (well, not to an extent where we should care). However, a
frequency increase typically implies a voltage increase, so in that
sense I agree.

> > 
> > If switching power dissipation is the main factor in power use, then 
> > we can essentially assume that P ~ V^2, at the same frequency - and 
> > scales linearly with frequency - but real work performed also scales 
> > semi-linearly with frequency for many workloads, so that's an 
> > invariant for everything except highly memory bound workloads.

AFAIK, there isn't much sense in running at a slower frequency than the
highest one supported at a given voltage unless there are specific
reasons not to (peripherals that keep the system up anyway and such).
In the general case, I think it is safe to assume that energy-efficiency
goes down for every increase in frequency. Modern ARM platforms
typically have a different voltage for more or less every frequency (TC2
is quite atypical). The voltage increases more rapidly than the
frequency, which makes the higher frequencies extremely expensive in
terms of energy-efficiency.

All of this is of course without considering power gating, which allows
us to eliminate the leakage power (or at least part of it) when idle.
So, while energy-efficiency is bad at high frequencies, it might pay off
overall to use them anyway if we can save more leakage energy while idle
than we burn extra to race to idle. This is where the platform energy
model becomes useful.
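
To put rough numbers on it with the A7 cluster-only table from this
patch (ignoring the per-cpu part for simplicity): racing a fully busy
window at 1000 MHz instead of 800 MHz burns an extra (4905 - 2847) for
~80% of the window, but only saves (2847 - 10) for the ~20% of the
window that can now be spent in cluster power-down, so on these
particular numbers racing to idle loses. With a larger leakage share or
a deeper idle-state saving it could go the other way, which is exactly
the kind of trade-off the model is meant to expose.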

> So in practice this probably means that Turbo probably has a somewhat 
> super-linear power use factor.

I'm not familiar with the voltage scaling on Intel platforms, but as
said above, I think power always scales up faster than performance. It
can probably be ignored for lower frequencies, but for the higher ones,
the extra energy per instruction executed is significant.

> At lower frequencies the leakage current difference is probably 
> negligible.

It is still there, but it is smaller due to the reduced voltage and so
is the dynamic power.

> In any case, even with turbo frequencies, switching power use is 
> probably an order of magnitude higher than leakage current power use, 
> on any marketable chip, 

That strongly depends on the process and the gate library used, but I
agree that dynamic power should be our primary focus.

> so we should concentrate on being able to 
> cover this first order effect (P/work ~ V^2), before considering any 
> second order effects (leakage current).

I think we should be fine as long as we include the leakage power in
the 'busy' power consumption and know the power consumption of each
idle-state. I already do this in the TC2 model. That way we don't have
to distinguish between leakage and dynamic power.

Morten
Morten Rasmussen June 6, 2014, 2:29 p.m. UTC | #24
On Fri, Jun 06, 2014 at 02:43:03PM +0100, Peter Zijlstra wrote:
> On Fri, Jun 06, 2014 at 02:15:10PM +0100, Morten Rasmussen wrote:
> > > > ARM TC2 has on-chip energy counters for counting energy consumed by the
> > > > A7 and A15 clusters. They are fairly accurate. 
> > > 
> > > Recent Intel chips have that too; they come packaged as:
> > > 
> > >   perf stat -a -e "power/energy-cores/" -- cmd
> > > 
> > > (through the perf_event_intel_rapl.c driver), It would be ideal if the
> > > ARM equivalent was available through a similar interface.
> > > 
> > > http://lwn.net/Articles/573602/
> > 
> > Nice. On ARM it is not mandatory to have energy counters and what they
> > actually measure if they are implemented is implementation dependent.
> > However, each vendor does extensive evaluation and characterization of
> > their implementation already, so I don't think would be a problem for
> > them to provide the numbers.
> 
> How is the ARM energy thing exposed? Through the regular PMU but with
> vendor specific events, or through a separate interface, entirely vendor
> specific?

There is already an upstream hwmon driver for TC2 with an easy-to-use
sysfs interface for all the energy counters. So it is somewhat vendor
specific at the moment, unfortunately.
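
For the record, reading them boils down to sampling a hwmon energy
attribute before and after the workload, something along these lines
(the hwmon path below is just an example; the actual device/attribute
has to be looked up on the target):

	#include <stdio.h>
	#include <unistd.h>

	/* example path only */
	#define ENERGY_ATTR "/sys/class/hwmon/hwmon0/energy1_input"

	static unsigned long long read_energy(void)
	{
		unsigned long long uj = 0;
		FILE *f = fopen(ENERGY_ATTR, "r");

		if (f) {
			if (fscanf(f, "%llu", &uj) != 1)	/* microjoules */
				uj = 0;
			fclose(f);
		}
		return uj;
	}

	int main(void)
	{
		unsigned long long before = read_energy();

		sleep(10);	/* run the workload of interest here instead */
		printf("consumed ~%llu uJ\n", read_energy() - before);
		return 0;
	}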

> In any case, would it be at all possible to nudge them to provide a
> 'driver' for this so that they can be more easily used?

I have raised it internally that unification on this front is needed. 

> > Some of the measurements could be automated. Others are hard to
> > automate as they require extensive knowledge about the platform. wakeup
> > energy, for example. You may need to do various tricks and hacks to
> > force the platform to use a specific idle-state so you know what you are
> > measuring.
> > 
> > I will add the TC2 recipe as a start and then see if my ugly scripts can
> > be turned into something generally useful.
> 
> Fair enough; I would prefer to have a situation where 'we' can validate
> whatever magic numbers the vendors provide for their hardware, or can
> generate numbers for hardware where the vendor is not interested.
> 
> But yes, publishing your hacks is a good first step at getting such a
> thing going, if we then further require everybody to use this 'tool' and
> improve if not suitable, we might end up with something useful ;-)

Fair plan ;-)

That said, vendors may want to provide slightly different numbers if
they do characterization based on workloads they care about rather than
sysbench or whatever 'we' end up using. The numbers will vary depending
on which workload(s) you use.
Jacob Pan June 6, 2014, 4:27 p.m. UTC | #25
On Fri, 6 Jun 2014 08:35:21 +0800
Yuyang Du <yuyang.du@intel.com> wrote:

> On Fri, Jun 06, 2014 at 10:05:43AM +0200, Peter Zijlstra wrote:
> > On Fri, Jun 06, 2014 at 04:29:30AM +0800, Yuyang Du wrote:
> > > On Thu, Jun 05, 2014 at 08:03:15AM -0700, Dirk Brandewie wrote:
> > > > 
> > > > You can request a P state per core but the package does
> > > > coordination at a package level for the P state that will be
> > > > used based on all requests. This is due to the fact that most
> > > > SKUs have a single VR and PLL. So the highest P state wins.
> > > > When a core goes idle it loses it's vote for the current
> > > > package P state and that cores clock it turned off.
> > > > 
> > > 
> > > You need to differentiate Turbo and non-Turbo. The highest P
> > > state wins? Not really.
> > 
> > *sigh* and here we go again.. someone please, write something
> > coherent and have all intel people sign off on it and stop saying
> > different things.
> > 
> > > Actually, silicon supports indepdent non-Turbo pstate, but just
> > > not enabled.
> > 
> > Then it doesn't exist, so no point in mentioning it.
> > 
> 
> Well, things actually get more complicated. Not-enabled is for Core.
> For Atom Baytrail, each core indeed can operate on difference
> frequency. I am not sure for Xeon, :)
> 
> > > For Turbo, it basically depends on power budget of both core and
> > > gfx (because they share) for each core to get which Turbo point.
> > 
> > And RAPL controls can give preference of which gfx/core gets most,
> > right?
> >
> 
There are two controls that can influence gfx and core power budget
sharing:
1. set a power limit on each RAPL domain
2. turbo power budget sharing
#2 is not implemented yet; the default is that the CPU takes it all.
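
For reference, the per-domain limit in #1 is exposed to user space
through the powercap framework in sysfs. A minimal sketch; the zone
path is an example only, since the layout varies between systems:

	#include <stdio.h>

	/* example zone path; look up the actual RAPL zones under
	 * /sys/class/powercap/ on the target */
	#define RAPL_ZONE "/sys/class/powercap/intel-rapl:0"

	int main(void)
	{
		unsigned long long uw = 0;
		FILE *f = fopen(RAPL_ZONE "/constraint_0_power_limit_uw", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%llu", &uw) == 1)
			printf("constraint 0 power limit: %llu uW\n", uw);
		fclose(f);

		/* lowering the limit is a privileged write of a new uW
		 * value to the same attribute */
		return 0;
	}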

> 
> > > > intel_pstate tries to keep the core P state as low as possible
> > > > to satisfy the given load, so when various cores go idle the
> > > > package P state can be as low as possible.  The big power win
> > > > is a core going idle.
> > > > 
> > > 
> > > In terms of prediction, it is definitely can't be 100% right. But
> > > the performance of most workloads does scale with pstate
> > > (frequency), may not be linearly. So it is to some point
> > > predictable FWIW. And this is all governors and Intel_pstate's
> > > basic assumption.
> > 
> > So frequency isn't _that_ interesting, voltage is. And while
> > predictability it might be their assumption, is it actually true? I
> > mean, there's really nothing else except to assume that, if its not
> > you can't do anything at all, so you _have_ to assume this.
> > 
> > But again, is the assumption true? Or just happy thoughts in an
> > attempt to do something.
> 
> Voltage is combined with frequency, roughly, voltage is proportional
> to freuquecy, so roughly, power is proportionaly to voltage^3. You
> can't say which is more important, or there is no reason to raise
> voltage without raising frequency.
> 
> If only one word to say: true of false, it is true. Because given any
> fixed workload, I can't see why performance would be worse if
> frequency is higher.
> 
> The reality as opposed to the assumption is in two-fold:
> 1) if workload is CPU bound, performance scales with frequency
> absolutely. if workload is memory bound, it does not scale. But from
> kernel, we don't know whether it is CPU bound or not (or it is hard
> to know). uArch statistics can model that. 2) the workload is not
> fixed in real-time, changing all the time.
> 
> But still, the assumption is a must or no guilty, because we adjust
> frequency continuously, for example, if the workload is fixed, and if
> the performance does not scale with freq we stop increasing
> frequency. So a good frequency governor or driver should and can
> continuously pursue "good" frequency with the changing workload.
> Therefore, in the long term, we will be better off.
> 

[Jacob Pan]
Nicolas Pitre June 7, 2014, 2:33 a.m. UTC | #26
On Fri, 6 Jun 2014, Ingo Molnar wrote:

> In any case, even with turbo frequencies, switching power use is 
> probably an order of magnitude higher than leakage current power use, 
> on any marketable chip, so we should concentrate on being able to 
> cover this first order effect (P/work ~ V^2), before considering any 
> second order effects (leakage current).

Just so that people are aware... We'll have to introduce thermal
constraint management into the scheduler mix as well at some point.
Right now what we have is an ad hoc subsystem that simply monitors
temperature and applies crude cooling strategies when some thresholds
are met. But a better strategy would imply thermal "provisioning".


Nicolas
Nicolas Pitre June 7, 2014, 2:52 a.m. UTC | #27
On Wed, 4 Jun 2014, Peter Zijlstra wrote:

> On Wed, Jun 04, 2014 at 05:02:30PM +0100, Morten Rasmussen wrote:
> > On Tue, Jun 03, 2014 at 12:50:15PM +0100, Peter Zijlstra wrote:
> > > On Fri, May 23, 2014 at 07:16:33PM +0100, Morten Rasmussen wrote:
> > > > +static struct capacity_state cap_states_cluster_a7[] = {
> > > > +	/* Cluster only power */
> > > > +	 { .cap =  358, .power = 2967, }, /*  350 MHz */
> > > > +	 { .cap =  410, .power = 2792, }, /*  400 MHz */
> > > > +	 { .cap =  512, .power = 2810, }, /*  500 MHz */
> > > > +	 { .cap =  614, .power = 2815, }, /*  600 MHz */
> > > > +	 { .cap =  717, .power = 2919, }, /*  700 MHz */
> > > > +	 { .cap =  819, .power = 2847, }, /*  800 MHz */
> > > > +	 { .cap =  922, .power = 3917, }, /*  900 MHz */
> > > > +	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
> > > > +	};
> > > 
> > > So one thing I remember was that we spoke about restricting this to
> > > frequency levels where the voltage changed.
> > > 
> > > Because voltage jumps were the biggest factor to energy usage.
> > > 
> > > Any word on that?
> > 
> > Since we don't drive P-state changes from the scheduler, I think we
> > could leave out P-states from the table without too much trouble. Good
> > point.
> 
> Well, we eventually want to go there I think.

People within Linaro have initial code for this.  Should be posted as an 
RFC soon.

> Although we still needed
> to come up with something for Intel, because I'm not at all sure how all
> that works.

Our initial code reuses the existing platform-specific cpufreq
drivers.  The idea is to bypass the cpufreq governors.

If Intel hardware doesn't provide/allow much control here then the 
platform driver should already tell the cpufreq core (and by extension 
the scheduler) about the extent of what can be done.


Nicolas
Yuyang Du June 7, 2014, 11:26 p.m. UTC | #28
On Fri, Jun 06, 2014 at 12:50:36PM +0200, Peter Zijlstra wrote:
> > Voltage is combined with frequency, roughly, voltage is proportional
> > to freuquecy, so roughly, power is proportionaly to voltage^3. You
> 
> P ~ V^2, last time I checked.
> 
> > can't say which is more important, or there is no reason to raise
> > voltage without raising frequency.
> 
> Well, some chips have far fewer voltage steps than freq steps; or,
> differently put, they have multiple freq steps for a single voltage
> level.
> 
> And since the power (Watts) is proportional to Voltage squared, its the
> biggest term.
> 
> If you have a distinct voltage level for each freq, it all doesn't
> matter.
> 

Ok. I think we understand each other. But one more thing, I said P ~ V^3,
because P ~ V^2*f and f ~ V, so P ~ V^3. Maybe some frequencies share the same
voltage, but you can still safely assume V changes with f in general, and it
will be more and more so, since we do need finer control over power consumption.

> Sure, but realize that we must fully understand this governor and
> integrate it in the scheduler if we're to attain the goal of IPC/watt
> optimized scheduling behaviour.
> 

Attain the goal of IPC/watt-optimized scheduling?

I don't see how it can be done like this. As I said, what is unknown for
prediction is perf scaling *and* the changing workload. So the challenge
for pstate control is in both. But I see more challenge in the changing
workload than in the performance scaling or the resulting IPC impact (if
the workload is fixed).

Currently, all freq governors take CPU utilization (load%) as the
indicator (target), which can serve both: workload and perf scaling.

As for IPC/watt optimization, I don't see how it can be practical. Too
micro a metric to be used for the general well-being?

> So you (or rather Intel in general) will have to be very explicit on how
> their stuff works and can no longer hide in some driver and do magic.
> The same is true for all other vendors for that matter.
> 
> If you (vendors, not Yuyang in specific) do not want to play (and be
> explicit and expose how your hardware functions) then you simply will
> not get power efficient scheduling full stop.
> 
> There's no rocks to hide under, no magic veils to hide behind. You tell
> _in_public_ or you get nothing.

Better communication is good, especially for our increasingly iterated
products, because the changing products do introduce noise and
inconsistency in the details.
Yuyang Du June 7, 2014, 11:53 p.m. UTC | #29
On Fri, Jun 06, 2014 at 02:13:05PM +0200, Ingo Molnar wrote:
> 
> Leakage current typically gets higher with higher frequencies, but 
> it's also highly process dependent AFAIK.
> 

In general, you can assume leakage power ~ V^2.

> If switching power dissipation is the main factor in power use, then 
> we can essentially assume that P ~ V^2, at the same frequency - and 
> scales linearly with frequency - but real work performed also scales 
> semi-linearly with frequency for many workloads, so that's an 
> invariant for everything except highly memory bound workloads.
> 

Agreed. Strictly, Energy ~ V^2.
Yuyang Du June 9, 2014, 2:15 a.m. UTC | #30
On Mon, Jun 09, 2014 at 09:59:52AM +0100, Morten Rasmussen wrote:
> IMHO, the per-entity load-tracking does a fair job representing the task
> compute capacity requirements. Sure it isn't perfect, particularly not
> for memory bound tasks, but it is way better than not having any task
> history at all, which was the case before.
> 
> The story is more or less the same for performance scaling. It is not
> taken into account at all in the scheduler at the moment. cpufreq is
> actually messing up load-balancing decisions after task load-tracking
> was introduced. Adding performance scaling awareness should only make
> things better even if predictions are not accurate for all workloads. I
> don't see why it shouldn't given the current state of energy-awareness
> in the scheduler.
> 

Optimized IPC is good for sure (with regard to pstate adjustment). My
point is how practical it is to correctly correlate it with scheduler
and pstate power-efficiency. Put another way, with a fixed workload you
really can do such a thing by running the workload offline several times
and concluding with a very power-efficient solution which takes IPC into
account. Actually, lots of people have done that in papers/reports (for
SPECXXX or TPC-X for example). But I can't see how the same can be done
online for real-time workloads.

> > Currently, all freq governors take CPU utilization (load%) as the indicator
> > (target), which can server both: workload and perf scaling.
> 
> With a bunch of hacks on top to make it more reactive because the
> current cpu utilization metric is not responsive enough to deal with
> workload changes. That is at least the case for ondemand and interactive
> (in Android).
> 

In what way is it not responsive enough? And how is it related here?

Thanks,
Yuyang
Morten Rasmussen June 9, 2014, 8:27 a.m. UTC | #31
On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> On Fri, 6 Jun 2014, Ingo Molnar wrote:
> 
> > In any case, even with turbo frequencies, switching power use is 
> > probably an order of magnitude higher than leakage current power use, 
> > on any marketable chip, so we should concentrate on being able to 
> > cover this first order effect (P/work ~ V^2), before considering any 
> > second order effects (leakage current).
> 
> Just so that people are aware... We'll have to introduce thermal 
> constraint management into the scheduler mix as well at some point.  
> Right now what we have is an ad hoc subsystem that simply monitors 
> temperature and apply crude cooling strategies when some thresholds are 
> met. But a better strategy would imply thermal "provisioning".

There is already work going on to improve thermal management:

http://lwn.net/Articles/599598/

The proposal is based on power/energy models (too). The goal is to
allocate power intelligently based on performance requirements.

While it is related to energy-aware scheduling and I fully agree that it
is something we need to consider, I think it is worth developing the two
ideas in parallel and look at sharing things like the power model later
once things mature. Energy-aware scheduling is complex enough on its
own to keep us entertained for a while :-)

Morten
Morten Rasmussen June 9, 2014, 8:59 a.m. UTC | #32
On Sun, Jun 08, 2014 at 12:26:29AM +0100, Yuyang Du wrote:
> On Fri, Jun 06, 2014 at 12:50:36PM +0200, Peter Zijlstra wrote:
> > > Voltage is combined with frequency, roughly, voltage is proportional
> > > to freuquecy, so roughly, power is proportionaly to voltage^3. You
> > 
> > P ~ V^2, last time I checked.
> > 
> > > can't say which is more important, or there is no reason to raise
> > > voltage without raising frequency.
> > 
> > Well, some chips have far fewer voltage steps than freq steps; or,
> > differently put, they have multiple freq steps for a single voltage
> > level.
> > 
> > And since the power (Watts) is proportional to Voltage squared, its the
> > biggest term.
> > 
> > If you have a distinct voltage level for each freq, it all doesn't
> > matter.
> > 
> 
> Ok. I think we understand each other. But one more thing, I said P ~ V^3,
> because P ~ V^2*f and f ~ V, so P ~ V^3. Maybe some frequencies share the same
> voltage, but you can still safely assume V changes with f in general, and it
> will be more and more so, since we do need finer control over power consumption.

Agreed. Voltage typically changes with frequency.

> 
> > Sure, but realize that we must fully understand this governor and
> > integrate it in the scheduler if we're to attain the goal of IPC/watt
> > optimized scheduling behaviour.
> > 
> 
> Attain the goal of IPC/watt optimized?
> 
> I don't see how it can be done like this. As I said, what is unknown for
> prediction is perf scaling *and* changing workload. So the challenge for pstate
> control is in both. But I see more chanllenge in the changing workload than
> in the performance scaling or the resulting IPC impact (if workload is
> fixed).

IMHO, the per-entity load-tracking does a fair job representing the task
compute capacity requirements. Sure it isn't perfect, particularly not
for memory bound tasks, but it is way better than not having any task
history at all, which was the case before.

The story is more or less the same for performance scaling. It is not
taken into account at all in the scheduler at the moment. cpufreq is
actually messing up load-balancing decisions after task load-tracking
was introduced. Adding performance scaling awareness should only make
things better even if predictions are not accurate for all workloads. I
don't see why it shouldn't given the current state of energy-awareness
in the scheduler.

> Currently, all freq governors take CPU utilization (load%) as the indicator
> (target), which can server both: workload and perf scaling.

With a bunch of hacks on top to make it more reactive because the
current cpu utilization metric is not responsive enough to deal with
workload changes. That is at least the case for ondemand and interactive
(in Android).

> As for IPC/watt optimized, I don't see how it can be practical. Too micro to
> be used for the general well-being?

That is why I propose to have a platform-specific energy model. You
tell the scheduler enough about your platform that it understands the
most basic power/performance trade-offs of your platform, thereby
enabling the scheduler to make better decisions.
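
Roughly speaking, the kind of estimate the tables enable looks like
this: charge busy time at the power of the capacity state the group
runs at, idle time at the idle power, plus a per-wakeup cost. A
simplified user-space sketch (units hand-waved, not the actual
implementation in patch 11):

	#include <stdio.h>

	struct capacity_state { unsigned long cap; unsigned long power; };

	struct energy_model {
		unsigned long idle_power;
		unsigned long wakeup_energy;	/* << 10, as in the tables */
		struct capacity_state *cap_states;
	};

	/* cap_states_cluster_a7[] from patch 6 */
	static struct capacity_state a7_cluster[] = {
		{  358, 2967 }, {  410, 2792 }, {  512, 2810 }, {  614, 2815 },
		{  717, 2919 }, {  819, 2847 }, {  922, 3917 }, { 1024, 4905 },
	};

	static struct energy_model a7_cluster_model = {
		.idle_power = 10, .wakeup_energy = 6, .cap_states = a7_cluster,
	};

	static unsigned long long estimate(struct energy_model *m, int cs,
					   unsigned long busy,
					   unsigned long period,
					   unsigned long wakeups)
	{
		return (unsigned long long)m->cap_states[cs].power * busy +
		       (unsigned long long)m->idle_power * (period - busy) +
		       ((unsigned long long)m->wakeup_energy << 10) * wakeups;
	}

	int main(void)
	{
		/* 30% busy at the 600 MHz state over a period of 1000, 5 wakeups */
		printf("~%llu bogo-units\n",
		       estimate(&a7_cluster_model, 3, 300, 1000, 5));
		return 0;
	}

Obviously the real thing has to do this per sched_group and compare
candidate task placements, but that is the gist of it.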
Nicolas Pitre June 9, 2014, 1:22 p.m. UTC | #33
On Mon, 9 Jun 2014, Morten Rasmussen wrote:

> On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> > On Fri, 6 Jun 2014, Ingo Molnar wrote:
> > 
> > > In any case, even with turbo frequencies, switching power use is 
> > > probably an order of magnitude higher than leakage current power use, 
> > > on any marketable chip, so we should concentrate on being able to 
> > > cover this first order effect (P/work ~ V^2), before considering any 
> > > second order effects (leakage current).
> > 
> > Just so that people are aware... We'll have to introduce thermal 
> > constraint management into the scheduler mix as well at some point.  
> > Right now what we have is an ad hoc subsystem that simply monitors 
> > temperature and apply crude cooling strategies when some thresholds are 
> > met. But a better strategy would imply thermal "provisioning".
> 
> There is already work going on to improve thermal management:
> 
> http://lwn.net/Articles/599598/
> 
> The proposal is based on power/energy models (too). The goal is to
> allocate power intelligently based on performance requirements.

Ah, great!  I missed that.

> While it is related to energy-aware scheduling and I fully agree that it
> is something we need to consider, I think it is worth developing the two
> ideas in parallel and look at sharing things like the power model later
> once things mature. Energy-aware scheduling is complex enough on its
> own to keep us entertained for a while :-)

Absolutely.  This is why I said "at some point".


Nicolas
Peter Zijlstra June 10, 2014, 10:16 a.m. UTC | #34
On Sun, Jun 08, 2014 at 07:26:29AM +0800, Yuyang Du wrote:
> Ok. I think we understand each other. But one more thing, I said P ~ V^3,
> because P ~ V^2*f and f ~ V, so P ~ V^3. Maybe some frequencies share the same
> voltage, but you can still safely assume V changes with f in general, and it
> will be more and more so, since we do need finer control over power consumption.

I didn't know the frequency part was proportional to another voltage
term; ok, then the cubic term makes sense.

> > Sure, but realize that we must fully understand this governor and
> > integrate it in the scheduler if we're to attain the goal of IPC/watt
> > optimized scheduling behaviour.
> > 
> 
> Attain the goal of IPC/watt optimized?
> 
> I don't see how it can be done like this. As I said, what is unknown for
> prediction is perf scaling *and* changing workload. So the challenge for pstate
> control is in both. But I see more chanllenge in the changing workload than
> in the performance scaling or the resulting IPC impact (if workload is
> fixed).

But for the scheduler the workload change isn't that big a problem; we
know the history of each task, we know when tasks wake up and when we
move them around. Therefore we can fairly accurately predict this.

And given a simple P state model (like ARM) where the CPU simply does
what you tell it to, that all works out. We can change P-state at task
wakeup/sleep/migration and compute the most efficient P-state, and task
distribution, for the new task-set.
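
To make that concrete: with a table like the one in this patch set,
picking a P-state can be as simple as choosing the lowest capacity
state that still covers the predicted utilization of the task-set on
that cpu. A rough sketch (not the actual patch code):

	#include <stdio.h>

	struct capacity_state { unsigned long cap; unsigned long power; };

	/* cap_states_core_a7[] from patch 6 */
	static struct capacity_state a7_core[] = {
		{  358,  187 }, {  410,  275 }, {  512,  334 }, {  614,  407 },
		{  717,  447 }, {  819,  549 }, {  922,  761 }, { 1024, 1024 },
	};

	/*
	 * Return the lowest capacity state (table assumed sorted by cap)
	 * that satisfies the predicted utilization, or the highest state
	 * if nothing does.
	 */
	static int find_cap_state(struct capacity_state *cs, int n,
				  unsigned long util)
	{
		int i;

		for (i = 0; i < n; i++)
			if (cs[i].cap >= util)
				return i;
		return n - 1;
	}

	int main(void)
	{
		int i = find_cap_state(a7_core, 8, 600);

		/* prints cap 614 power 407, i.e. the 600 MHz state */
		printf("cap %lu power %lu\n", a7_core[i].cap, a7_core[i].power);
		return 0;
	}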

> Currently, all freq governors take CPU utilization (load%) as the indicator
> (target), which can server both: workload and perf scaling.

So the current cpufreq stuff is terminally broken in too many ways:
it's sampling-based, so it misses a lot of changes, and it's strictly
cpu-local, so it completely misses SMP information (like the migrations
etc.).

If we move a 50% task from CPU1 to CPU0, a sampling thing takes time to
adjust on both CPUs, whereas if its scheduler driven, we can instantly
adjust and be done, because we _know_ what we moved.

Now some of that is due to hysterical raisins, and some of that is due
to broken hardware (hardware that needs to schedule in order to change
its state because it's behind some broken bus or other). But we should
basically kill off cpufreq for anything recent and sane.

> As for IPC/watt optimized, I don't see how it can be practical. Too micro to
> be used for the general well-being?

What other target would you optimize for? The purpose here is to build
an energy aware scheduler, one that schedules tasks so that the total
amount of energy, for the given amount of work, is minimal.

So we can't measure in Watt, since if we forced the CPU into the lowest
P-state (or even C-state for that matter) work would simply not
complete. So we need a complete energy term.

Now. IPC is instructions/cycle, Watt is Joule/second, so IPC/Watt is

instructions   second
------------ * ------ ~ instructions / joule
  cycle        joule

Seeing how both cycles and seconds are time units.

So for any given amount of instructions (the work that needs to be
done) we want the minimal amount of energy consumed, and IPC/Watt is the
natural metric to measure this over an entire workload.
Nicolas Pitre June 10, 2014, 5:01 p.m. UTC | #35
On Tue, 10 Jun 2014, Peter Zijlstra wrote:

> So the current cpufreq stuff is terminally broken in too many ways; its
> sampling, so it misses a lot of changes, its strictly cpu local, so it
> completely misses SMP information (like the migrations etc..)
> 
> If we move a 50% task from CPU1 to CPU0, a sampling thing takes time to
> adjust on both CPUs, whereas if its scheduler driven, we can instantly
> adjust and be done, because we _know_ what we moved.

Incidentally I submitted a LWN article highlighting those very issues 
and the planned remedies.  No confirmation of a publication date though.

> Now some of that is due to hysterical raisins, and some of that due to
> broken hardware (hardware that needs to schedule in order to change its
> state because its behind some broken bus or other). But we should
> basically kill off cpufreq for anything recent and sane.

Even if some change has to happen through a kernel thread, you're still
far better off with the scheduler requesting this change proactively
than waiting for the cpufreq governor to catch up with the load and then
waiting for the freq change thread to be scheduled.


Nicolas
Yuyang Du June 10, 2014, 6:35 p.m. UTC | #36
On Tue, Jun 10, 2014 at 12:16:22PM +0200, Peter Zijlstra wrote:
> What other target would you optimize for? The purpose here is to build
> an energy aware scheduler, one that schedules tasks so that the total
> amount of energy, for the given amount of work, is minimal.
> 
> So we can't measure in Watt, since if we forced the CPU into the lowest
> P-state (or even C-state for that matter) work would simply not
> complete. So we need a complete energy term.
> 
> Now. IPC is instructions/cycle, Watt is Joule/second, so IPC/Watt is
> 
> instructions   second
> ------------ * ------ ~ instructions / joule
>   cycle        joule
> 
> Seeing how both cycles and seconds are time units.
> 
> So for any given amount of instructions, the work needs to be done, we
> want the minimal amount of energy consumed, and IPC/Watt is the natural
> metric to measure this over an entire workload.

Ok, I understand. Whether we take IPC/watt as an input metric in the
scheduler or as a goal for the scheduler, we definitely need to try
both.

Thanks, Peter.

Yuyang
Eduardo Valentin June 11, 2014, 11:02 a.m. UTC | #37
Hello,

On Mon, Jun 09, 2014 at 09:22:49AM -0400, Nicolas Pitre wrote:
> On Mon, 9 Jun 2014, Morten Rasmussen wrote:
> 
> > On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> > > On Fri, 6 Jun 2014, Ingo Molnar wrote:
> > > 
> > > > In any case, even with turbo frequencies, switching power use is 
> > > > probably an order of magnitude higher than leakage current power use, 
> > > > on any marketable chip, so we should concentrate on being able to 
> > > > cover this first order effect (P/work ~ V^2), before considering any 
> > > > second order effects (leakage current).
> > > 
> > > Just so that people are aware... We'll have to introduce thermal 
> > > constraint management into the scheduler mix as well at some point.  
> > > Right now what we have is an ad hoc subsystem that simply monitors 
> > > temperature and apply crude cooling strategies when some thresholds are 
> > > met. But a better strategy would imply thermal "provisioning".
> > 
> > There is already work going on to improve thermal management:
> > 
> > http://lwn.net/Articles/599598/
> > 
> > The proposal is based on power/energy models (too). The goal is to

Can you please point me to the other piece of code which is using
power/energy models too?  We are considering having these models within
the thermal software components. But if we already have more than one
user, it might be worth considering a separate API.
 
> > allocate power intelligently based on performance requirements.
> 
> Ah, great!  I missed that.
> 
> > While it is related to energy-aware scheduling and I fully agree that it
> > is something we need to consider, I think it is worth developing the two
> > ideas in parallel and look at sharing things like the power model later
> > once things mature. Energy-aware scheduling is complex enough on its
> > own to keep us entertained for a while :-)
> 
> Absolutely.  This is why I said "at some point".
> 
> 
> Nicolas
Morten Rasmussen June 11, 2014, 11:42 a.m. UTC | #38
On Wed, Jun 11, 2014 at 12:02:51PM +0100, Eduardo Valentin wrote:
> Hello,
> 
> On Mon, Jun 09, 2014 at 09:22:49AM -0400, Nicolas Pitre wrote:
> > On Mon, 9 Jun 2014, Morten Rasmussen wrote:
> > 
> > > On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> > > > On Fri, 6 Jun 2014, Ingo Molnar wrote:
> > > > 
> > > > > In any case, even with turbo frequencies, switching power use is 
> > > > > probably an order of magnitude higher than leakage current power use, 
> > > > > on any marketable chip, so we should concentrate on being able to 
> > > > > cover this first order effect (P/work ~ V^2), before considering any 
> > > > > second order effects (leakage current).
> > > > 
> > > > Just so that people are aware... We'll have to introduce thermal 
> > > > constraint management into the scheduler mix as well at some point.  
> > > > Right now what we have is an ad hoc subsystem that simply monitors 
> > > > temperature and apply crude cooling strategies when some thresholds are 
> > > > met. But a better strategy would imply thermal "provisioning".
> > > 
> > > There is already work going on to improve thermal management:
> > > 
> > > http://lwn.net/Articles/599598/
> > > 
> > > The proposal is based on power/energy models (too). The goal is to
> 
> Can you please point me to the other piece of code which is using
> power/energy models too?  We are considering having these models within
> the thermal software compoenents. But if we already have more than one
> user, might be worth considering a separate API.

The link above is to the thermal management proposal which includes a
power model. This one might work better:

http://article.gmane.org/gmane.linux.power-management.general/45000

The power/energy model in this energy-aware scheduling proposal is
different. An example of the model data is in patch 6 (the start of this
thread) and the actual use of the model is in patch 11 and the following
patches. As said below, the two proposals are independent, but there
might be potential for merging the power/energy models once the
proposals are more mature.

Morten

>  
> > > allocate power intelligently based on performance requirements.
> > 
> > Ah, great!  I missed that.
> > 
> > > While it is related to energy-aware scheduling and I fully agree that it
> > > is something we need to consider, I think it is worth developing the two
> > > ideas in parallel and look at sharing things like the power model later
> > > once things mature. Energy-aware scheduling is complex enough on its
> > > own to keep us entertained for a while :-)
> > 
> > Absolutely.  This is why I said "at some point".
> > 
> > 
> > Nicolas
Eduardo Valentin June 11, 2014, 11:43 a.m. UTC | #39
On Wed, Jun 11, 2014 at 12:42:18PM +0100, Morten Rasmussen wrote:
> On Wed, Jun 11, 2014 at 12:02:51PM +0100, Eduardo Valentin wrote:
> > Hello,
> > 
> > On Mon, Jun 09, 2014 at 09:22:49AM -0400, Nicolas Pitre wrote:
> > > On Mon, 9 Jun 2014, Morten Rasmussen wrote:
> > > 
> > > > On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> > > > > On Fri, 6 Jun 2014, Ingo Molnar wrote:
> > > > > 
> > > > > > In any case, even with turbo frequencies, switching power use is 
> > > > > > probably an order of magnitude higher than leakage current power use, 
> > > > > > on any marketable chip, so we should concentrate on being able to 
> > > > > > cover this first order effect (P/work ~ V^2), before considering any 
> > > > > > second order effects (leakage current).
> > > > > 
> > > > > Just so that people are aware... We'll have to introduce thermal 
> > > > > constraint management into the scheduler mix as well at some point.  
> > > > > Right now what we have is an ad hoc subsystem that simply monitors 
> > > > > temperature and apply crude cooling strategies when some thresholds are 
> > > > > met. But a better strategy would imply thermal "provisioning".
> > > > 
> > > > There is already work going on to improve thermal management:
> > > > 
> > > > http://lwn.net/Articles/599598/
> > > > 
> > > > The proposal is based on power/energy models (too). The goal is to
> > 
> > Can you please point me to the other piece of code which is using
> > power/energy models too?  We are considering having these models within
> > the thermal software compoenents. But if we already have more than one
> > user, might be worth considering a separate API.
> 
> The link above is to the thermal management proposal which includes a
> power model. This one might work better:
> 
> http://article.gmane.org/gmane.linux.power-management.general/45000
> 
> The power/energy model in this energy-aware scheduling proposal is
> different. An example of the model data is in patch 6 (the start of this
> thread) and the actual use of the model is in patch 11 and the following
> patches. As said below, the two proposals are independent, but there
> might be potential for merging the power/energy models once the
> proposals are more mature.

Morten,

For the power allocator thermal governor I am aware, as I am reviewing
it. I am more interested in other users of power models, apart from the
thermal subsystem.

> 
> Morten
> 
> >  
> > > > allocate power intelligently based on performance requirements.
> > > 
> > > Ah, great!  I missed that.
> > > 
> > > > While it is related to energy-aware scheduling and I fully agree that it
> > > > is something we need to consider, I think it is worth developing the two
> > > > ideas in parallel and look at sharing things like the power model later
> > > > once things mature. Energy-aware scheduling is complex enough on its
> > > > own to keep us entertained for a while :-)
> > > 
> > > Absolutely.  This is why I said "at some point".
> > > 
> > > 
> > > Nicolas
Morten Rasmussen June 11, 2014, 1:37 p.m. UTC | #40
On Wed, Jun 11, 2014 at 12:43:26PM +0100, Eduardo Valentin wrote:
> On Wed, Jun 11, 2014 at 12:42:18PM +0100, Morten Rasmussen wrote:
> > On Wed, Jun 11, 2014 at 12:02:51PM +0100, Eduardo Valentin wrote:
> > > Hello,
> > > 
> > > On Mon, Jun 09, 2014 at 09:22:49AM -0400, Nicolas Pitre wrote:
> > > > On Mon, 9 Jun 2014, Morten Rasmussen wrote:
> > > > 
> > > > > On Sat, Jun 07, 2014 at 03:33:58AM +0100, Nicolas Pitre wrote:
> > > > > > On Fri, 6 Jun 2014, Ingo Molnar wrote:
> > > > > > 
> > > > > > > In any case, even with turbo frequencies, switching power use is 
> > > > > > > probably an order of magnitude higher than leakage current power use, 
> > > > > > > on any marketable chip, so we should concentrate on being able to 
> > > > > > > cover this first order effect (P/work ~ V^2), before considering any 
> > > > > > > second order effects (leakage current).
> > > > > > 
> > > > > > Just so that people are aware... We'll have to introduce thermal 
> > > > > > constraint management into the scheduler mix as well at some point.  
> > > > > > Right now what we have is an ad hoc subsystem that simply monitors 
> > > > > > temperature and apply crude cooling strategies when some thresholds are 
> > > > > > met. But a better strategy would imply thermal "provisioning".
> > > > > 
> > > > > There is already work going on to improve thermal management:
> > > > > 
> > > > > http://lwn.net/Articles/599598/
> > > > > 
> > > > > The proposal is based on power/energy models (too). The goal is to
> > > 
> > > Can you please point me to the other piece of code which is using
> > > power/energy models too?  We are considering having these models within
> > > the thermal software compoenents. But if we already have more than one
> > > user, might be worth considering a separate API.
> > 
> > The link above is to the thermal management proposal which includes a
> > power model. This one might work better:
> > 
> > http://article.gmane.org/gmane.linux.power-management.general/45000
> > 
> > The power/energy model in this energy-aware scheduling proposal is
> > different. An example of the model data is in patch 6 (the start of this
> > thread) and the actual use of the model is in patch 11 and the following
> > patches. As said below, the two proposals are independent, but there
> > might be potential for merging the power/energy models once the
> > proposals are more mature.
> 
> Morten,
> 
> For the power allocator thermal governor, I am aware, as I am reviewing
> it. I am more interested in other users of power models, a part from
> thermal subsystem.

The user in this proposal is the scheduler. The intention is to
eventually tie cpuidle and cpufreq closer to the scheduler. When/if that
happens, they might become users too.
Vince Weaver June 12, 2014, 3:05 p.m. UTC | #41
On Fri, 6 Jun 2014, Morten Rasmussen wrote:

> On Fri, Jun 06, 2014 at 02:43:03PM +0100, Peter Zijlstra wrote:
> > On Fri, Jun 06, 2014 at 02:15:10PM +0100, Morten Rasmussen wrote:
> > > > > ARM TC2 has on-chip energy counters for counting energy consumed by the
> > > > > A7 and A15 clusters. They are fairly accurate. 
> > > > 
> > > > Recent Intel chips have that too; they come packaged as:
> > > > 
> > > >   perf stat -a -e "power/energy-cores/" -- cmd
> > > > 
> > > > (through the perf_event_intel_rapl.c driver), It would be ideal if the
> > > > ARM equivalent was available through a similar interface.
> > > > 
> > > > http://lwn.net/Articles/573602/
> > > 
> > > Nice. On ARM it is not mandatory to have energy counters and what they
> > > actually measure if they are implemented is implementation dependent.
> > > However, each vendor does extensive evaluation and characterization of
> > > their implementation already, so I don't think would be a problem for
> > > them to provide the numbers.
> > 
> > How is the ARM energy thing exposed? Through the regular PMU but with
> > vendor specific events, or through a separate interface, entirely vendor
> > specific?
> 
> There is an upstream hwmon driver for TC2 already with an easy to use
> sysfs interface for all the energy counters. So it is somewhat vendor
> specific at the moment unfortunately.

What is the plan for future interfaces for energy info?

Intel RAPL of course has a perf_event interface.

However AMD's (somewhat unfortunately acronymed) Application Power 
Management exports similar information via hwmon and the fam15h_power
driver.

And it sounds like ARM systems also put things in hwmon.

User tools like PAPI can sort of abstract this (for example it supports 
getting RAPL data from perf_event while it also has a driver for getting 
info from hwmon).  But users stuck with perf end up having to use multiple 
tools to get energy and performance info simultaneously on non-intel 
hardware.

Vince
diff mbox

Patch

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 71e1fec..4050348 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -275,6 +275,107 @@  void store_cpu_topology(unsigned int cpuid)
 		cpu_topology[cpuid].socket_id, mpidr);
 }
 
+#ifdef CONFIG_SCHED_ENERGY
+/*
+ * ARM TC2 specific energy cost model data. There are no unit requirements for
+ * the data. Data can be normalized to any reference point, but the
+ * normalization must be consistent. That is, one bogo-joule/watt must be the
+ * same quantity for all data, but we don't care what it is.
+ */
+static struct capacity_state cap_states_cluster_a7[] = {
+	/* Cluster only power */
+	 { .cap =  358, .power = 2967, }, /*  350 MHz */
+	 { .cap =  410, .power = 2792, }, /*  400 MHz */
+	 { .cap =  512, .power = 2810, }, /*  500 MHz */
+	 { .cap =  614, .power = 2815, }, /*  600 MHz */
+	 { .cap =  717, .power = 2919, }, /*  700 MHz */
+	 { .cap =  819, .power = 2847, }, /*  800 MHz */
+	 { .cap =  922, .power = 3917, }, /*  900 MHz */
+	 { .cap = 1024, .power = 4905, }, /* 1000 MHz */
+	};
+
+static struct capacity_state cap_states_cluster_a15[] = {
+	/* Cluster only power */
+	 { .cap =  840, .power =  7920, }, /*  500 MHz */
+	 { .cap = 1008, .power =  8165, }, /*  600 MHz */
+	 { .cap = 1176, .power =  8172, }, /*  700 MHz */
+	 { .cap = 1343, .power =  8195, }, /*  800 MHz */
+	 { .cap = 1511, .power =  8265, }, /*  900 MHz */
+	 { .cap = 1679, .power =  8446, }, /* 1000 MHz */
+	 { .cap = 1847, .power = 11426, }, /* 1100 MHz */
+	 { .cap = 2015, .power = 15200, }, /* 1200 MHz */
+	};
+
+static struct sched_energy energy_cluster_a7 = {
+	  .max_capacity   = 1024,
+	  .idle_power     =   10, /* Cluster power-down */
+	  .wakeup_energy  =    6, /* << 10 */
+	  .nr_cap_states  = ARRAY_SIZE(cap_states_cluster_a7),
+	  .cap_states     = cap_states_cluster_a7,
+};
+
+static struct sched_energy energy_cluster_a15 = {
+	  .max_capacity   = 2015,
+	  .idle_power     =   25, /* Cluster power-down */
+	  .wakeup_energy  =  210, /* << 10 */
+	  .nr_cap_states  = ARRAY_SIZE(cap_states_cluster_a15),
+	  .cap_states     = cap_states_cluster_a15,
+};
+
+static struct capacity_state cap_states_core_a7[] = {
+	/* Power per cpu */
+	 { .cap =  358, .power =  187, }, /*  350 MHz */
+	 { .cap =  410, .power =  275, }, /*  400 MHz */
+	 { .cap =  512, .power =  334, }, /*  500 MHz */
+	 { .cap =  614, .power =  407, }, /*  600 MHz */
+	 { .cap =  717, .power =  447, }, /*  700 MHz */
+	 { .cap =  819, .power =  549, }, /*  800 MHz */
+	 { .cap =  922, .power =  761, }, /*  900 MHz */
+	 { .cap = 1024, .power = 1024, }, /* 1000 MHz */
+	};
+
+static struct capacity_state cap_states_core_a15[] = {
+	/* Power per cpu */
+	 { .cap =  840, .power = 2021, }, /*  500 MHz */
+	 { .cap = 1008, .power = 2312, }, /*  600 MHz */
+	 { .cap = 1176, .power = 2756, }, /*  700 MHz */
+	 { .cap = 1343, .power = 3125, }, /*  800 MHz */
+	 { .cap = 1511, .power = 3524, }, /*  900 MHz */
+	 { .cap = 1679, .power = 3846, }, /* 1000 MHz */
+	 { .cap = 1847, .power = 5177, }, /* 1100 MHz */
+	 { .cap = 2015, .power = 6997, }, /* 1200 MHz */
+	};
+
+static struct sched_energy energy_core_a7 = {
+	  .max_capacity   = 1024,
+	  .idle_power     =    0, /* No power gating */
+	  .wakeup_energy  =    0, /* << 10 */
+	  .nr_cap_states  = ARRAY_SIZE(cap_states_core_a7),
+	  .cap_states     = cap_states_core_a7,
+};
+
+static struct sched_energy energy_core_a15 = {
+	  .max_capacity   = 2015,
+	  .idle_power     =    0, /* No power gating */
+	  .wakeup_energy  =    5, /* << 10 */
+	  .nr_cap_states  = ARRAY_SIZE(cap_states_core_a15),
+	  .cap_states     = cap_states_core_a15,
+};
+
+/* sd energy functions */
+static inline const struct sched_energy *cpu_cluster_energy(int cpu)
+{
+	return cpu_topology[cpu].socket_id ? &energy_cluster_a7 :
+			&energy_cluster_a15;
+}
+
+static inline const struct sched_energy *cpu_core_energy(int cpu)
+{
+	return cpu_topology[cpu].socket_id ? &energy_core_a7 :
+			&energy_core_a15;
+}
+#endif /* CONFIG_SCHED_ENERGY */
+
 static inline const int cpu_corepower_flags(void)
 {
 	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
@@ -282,10 +383,18 @@  static inline const int cpu_corepower_flags(void)
 
 static struct sched_domain_topology_level arm_topology[] = {
 #ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_ENERGY
+	{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
+#else
 	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
+#endif
+#ifdef CONFIG_SCHED_ENERGY
+	{ cpu_cpu_mask, 0, cpu_cluster_energy, SD_INIT_NAME(DIE) },
+#else
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+#endif
 	{ NULL, },
 };