diff mbox series

serial: 8250_omap: Set the console genpd always on if no console suspend

Message ID 20231017130540.1149721-1-thomas.richard@bootlin.com
State New
Headers show
Series serial: 8250_omap: Set the console genpd always on if no console suspend | expand

Commit Message

Thomas Richard Oct. 17, 2023, 1:05 p.m. UTC
If the console suspend is disabled, the genpd of the console shall not
be powered-off during suspend.
Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
suspend, and restore the original value during the resume.

Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
---
 drivers/tty/serial/8250/8250_omap.c | 33 ++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

Comments

Tony Lindgren Oct. 23, 2023, 7:44 a.m. UTC | #1
Hi,

Adding Kevin and Vignesh too in case they have better ideas on how to
prevent the power domain from suspending for no_console_suspend kernel
parameter.

* Thomas Richard <thomas.richard@bootlin.com> [231017 13:05]:
> If the console suspend is disabled, the genpd of the console shall not
> be powered-off during suspend.
> Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
> suspend, and restore the original value during the resume.
> 
> Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
> ---
>  drivers/tty/serial/8250/8250_omap.c | 33 ++++++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
> index ca972fd37725..91a483dc460c 100644
> --- a/drivers/tty/serial/8250/8250_omap.c
> +++ b/drivers/tty/serial/8250/8250_omap.c
> @@ -27,6 +27,7 @@
>  #include <linux/pm_wakeirq.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/sys_soc.h>
> +#include <linux/pm_domain.h>
>  
>  #include "8250.h"
>  
> @@ -114,6 +115,12 @@
>  /* RX FIFO occupancy indicator */
>  #define UART_OMAP_RX_LVL		0x19
>  
> +/*
> + * Copy of the genpd flags for the console.
> + * Only used if console suspend is disabled
> + */
> +static unsigned int genpd_flags_console;

This should be priv->genpd_flags_console or something similar as the
uarts in an always-on power domain may have different flags from other
power domains.

Other than that I'm fine with with unless other people have better ideas
on how to handle this.

Regards,

Tony

>  struct omap8250_priv {
>  	void __iomem *membase;
>  	int line;
> @@ -1617,6 +1624,7 @@ static int omap8250_suspend(struct device *dev)
>  {
>  	struct omap8250_priv *priv = dev_get_drvdata(dev);
>  	struct uart_8250_port *up = serial8250_get_port(priv->line);
> +	struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
>  	int err = 0;
>  
>  	serial8250_suspend_port(priv->line);
> @@ -1627,8 +1635,19 @@ static int omap8250_suspend(struct device *dev)
>  	if (!device_may_wakeup(dev))
>  		priv->wer = 0;
>  	serial_out(up, UART_OMAP_WER, priv->wer);
> -	if (uart_console(&up->port) && console_suspend_enabled)
> -		err = pm_runtime_force_suspend(dev);
> +	if (uart_console(&up->port)) {
> +		if (console_suspend_enabled)
> +			err = pm_runtime_force_suspend(dev);
> +		else {
> +			/*
> +			 * The pd shall not be powered-off (no console suspend).
> +			 * Make copy of genpd flags before to set it always on.
> +			 * The original value is restored during the resume.
> +			 */
> +			genpd_flags_console = genpd->flags;
> +			genpd->flags |= GENPD_FLAG_ALWAYS_ON;
> +		}
> +	}
>  	flush_work(&priv->qos_work);
>  
>  	return err;
> @@ -1638,12 +1657,16 @@ static int omap8250_resume(struct device *dev)
>  {
>  	struct omap8250_priv *priv = dev_get_drvdata(dev);
>  	struct uart_8250_port *up = serial8250_get_port(priv->line);
> +	struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
>  	int err;
>  
>  	if (uart_console(&up->port) && console_suspend_enabled) {
> -		err = pm_runtime_force_resume(dev);
> -		if (err)
> -			return err;
> +		if (console_suspend_enabled) {
> +			err = pm_runtime_force_resume(dev);
> +			if (err)
> +				return err;
> +		} else
> +			genpd->flags = genpd_flags_console;
>  	}
>  
>  	serial8250_resume_port(priv->line);
> -- 
> 2.39.2
>
Greg KH Oct. 24, 2023, 3:24 p.m. UTC | #2
On Tue, Oct 24, 2023 at 04:53:46PM +0200, Thomas Richard wrote:
> On 10/23/23 09:44, Tony Lindgren wrote:
> > Hi,
> > 
> > Adding Kevin and Vignesh too in case they have better ideas on how to
> > prevent the power domain from suspending for no_console_suspend kernel
> > parameter.
> > 
> > * Thomas Richard <thomas.richard@bootlin.com> [231017 13:05]:
> >> If the console suspend is disabled, the genpd of the console shall not
> >> be powered-off during suspend.
> >> Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
> >> suspend, and restore the original value during the resume.
> >>
> >> Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
> >> ---
> >>  drivers/tty/serial/8250/8250_omap.c | 33 ++++++++++++++++++++++++-----
> >>  1 file changed, 28 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
> >> index ca972fd37725..91a483dc460c 100644
> >> --- a/drivers/tty/serial/8250/8250_omap.c
> >> +++ b/drivers/tty/serial/8250/8250_omap.c
> >> @@ -27,6 +27,7 @@
> >>  #include <linux/pm_wakeirq.h>
> >>  #include <linux/dma-mapping.h>
> >>  #include <linux/sys_soc.h>
> >> +#include <linux/pm_domain.h>
> >>  
> >>  #include "8250.h"
> >>  
> >> @@ -114,6 +115,12 @@
> >>  /* RX FIFO occupancy indicator */
> >>  #define UART_OMAP_RX_LVL		0x19
> >>  
> >> +/*
> >> + * Copy of the genpd flags for the console.
> >> + * Only used if console suspend is disabled
> >> + */
> >> +static unsigned int genpd_flags_console;
> > 
> > This should be priv->genpd_flags_console or something similar as the
> > uarts in an always-on power domain may have different flags from other
> > power domains.
> 
> Ok I'll move genpd_flags_console to the priv struct.
> 
> @Greg, as you already added the patch to your tty git tree, do you
> prefer a new version of the patch or a fixup ?

A fixup please.

thanks,

greg k-h
Kevin Hilman Oct. 24, 2023, 6:36 p.m. UTC | #3
Tony Lindgren <tony@atomide.com> writes:

> * Kevin Hilman <khilman@kernel.org> [231023 21:31]:
>> Instead, what should be happening is that when `no_console_suspend` is
>> set, there should be an extra pm_runtime_get() which increases the
>> device usecount such that the device never runtime suspends, and thus
>> the domain will not get powered off.
>
> We already have the runtime PM usage count kept in the driver (unless
> there's a bug somewhere). The issue is that on suspend the power domain
> still gets shut down.
>
> I suspect that some of the SoC power domains get
> force shut down on suspend somewhere?

If setting GENPD_FLAG_ALWAYS_ON works as this patch proposes, then a
force shutdown would override that genpd flag also, so I suspect the 
runtime PM usage count is not correct.

I quick skim of 8250_omap.c, and I don't see any pm_runtime_get() calls
that are conditional on no_console_suspend, which is what I would
suspect for the domain to stay on.

Kevin
Tony Lindgren Oct. 25, 2023, 6:41 a.m. UTC | #4
* Kevin Hilman <khilman@kernel.org> [231024 18:36]:
> Tony Lindgren <tony@atomide.com> writes:
> 
> > * Kevin Hilman <khilman@kernel.org> [231023 21:31]:
> >> Instead, what should be happening is that when `no_console_suspend` is
> >> set, there should be an extra pm_runtime_get() which increases the
> >> device usecount such that the device never runtime suspends, and thus
> >> the domain will not get powered off.
> >
> > We already have the runtime PM usage count kept in the driver (unless
> > there's a bug somewhere). The issue is that on suspend the power domain
> > still gets shut down.
> >
> > I suspect that some of the SoC power domains get
> > force shut down on suspend somewhere?
> 
> If setting GENPD_FLAG_ALWAYS_ON works as this patch proposes, then a
> force shutdown would override that genpd flag also, so I suspect the 
> runtime PM usage count is not correct.

OK good point.

> I quick skim of 8250_omap.c, and I don't see any pm_runtime_get() calls
> that are conditional on no_console_suspend, which is what I would
> suspect for the domain to stay on.

If a serial console is attached, we now have runtime PM usage count
always kept. Users can detach the console via sysfs as needed. See these
two earlier commits from Andy:

a3cb39d258ef ("serial: core: Allow detach and attach serial device for console")
bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console")

Sounds like there's a bug somewhere. It's worth verifying if the runtime
PM usage count is kept for 8250_omap on suspend.

Thomas, care to check your test case with the attached debug hack
and by adding a call for pm_runtime_get_usage_count() on the suspend
path?

Regards,

Tony

8< -------------------------------
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -129,6 +129,11 @@ static inline void pm_runtime_get_noresume(struct device *dev)
 	atomic_inc(&dev->power.usage_count);
 }
 
+static inline int pm_runtime_get_usage_count(struct device *dev)
+{
+	return atomic_read(&dev->power.usage_count);
+}
+
 /**
  * pm_runtime_put_noidle - Drop runtime PM usage counter of a device.
  * @dev: Target device.
Thomas Richard Oct. 31, 2023, 10:15 a.m. UTC | #5
On 10/25/23 08:41, Tony Lindgren wrote:
> * Kevin Hilman <khilman@kernel.org> [231024 18:36]:
>> Tony Lindgren <tony@atomide.com> writes:
>>
>>> * Kevin Hilman <khilman@kernel.org> [231023 21:31]:
>>>> Instead, what should be happening is that when `no_console_suspend` is
>>>> set, there should be an extra pm_runtime_get() which increases the
>>>> device usecount such that the device never runtime suspends, and thus
>>>> the domain will not get powered off.
>>>
>>> We already have the runtime PM usage count kept in the driver (unless
>>> there's a bug somewhere). The issue is that on suspend the power domain
>>> still gets shut down.
>>>
>>> I suspect that some of the SoC power domains get
>>> force shut down on suspend somewhere?
>>
>> If setting GENPD_FLAG_ALWAYS_ON works as this patch proposes, then a
>> force shutdown would override that genpd flag also, so I suspect the 
>> runtime PM usage count is not correct.
> 
> OK good point.
> 
>> I quick skim of 8250_omap.c, and I don't see any pm_runtime_get() calls
>> that are conditional on no_console_suspend, which is what I would
>> suspect for the domain to stay on.
> 
> If a serial console is attached, we now have runtime PM usage count
> always kept. Users can detach the console via sysfs as needed. See these
> two earlier commits from Andy:
> 
> a3cb39d258ef ("serial: core: Allow detach and attach serial device for console")
> bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console")
> 
> Sounds like there's a bug somewhere. It's worth verifying if the runtime
> PM usage count is kept for 8250_omap on suspend.
> 
> Thomas, care to check your test case with the attached debug hack
> and by adding a call for pm_runtime_get_usage_count() on the suspend
> path?

Hi Tony,

Please find below the logs of the test you asked me.
I added the call of pm_runtime_get_usage_count at the end of the suspend
function.
The console is attached on 2800000.serial, it has usage_count=4.
Other serial has usage_count=3.

[    4.859058] port 2830000.serial:0.0: PM: calling
pm_runtime_force_suspend+0x0/0x134 @ 114, parent: 2830000.serial:0
[    4.869478] port 2830000.serial:0.0: PM:
pm_runtime_force_suspend+0x0/0x134 returned 0 after 0 usecs
[    4.878602] omap8250 2830000.serial: PM: calling
omap8250_suspend+0x0/0x144 @ 114, parent: bus@100000
[    4.887813] omap8250 2830000.serial: omap8250_suspend: 1634:
usage_count = 3
[    4.894851] omap8250 2830000.serial: PM: omap8250_suspend+0x0/0x144
returned 0 after 7042 usecs
[    4.903538] port 2810000.serial:0.0: PM: calling
pm_runtime_force_suspend+0x0/0x134 @ 114, parent: 2810000.serial:0
[    4.913957] port 2810000.serial:0.0: PM:
pm_runtime_force_suspend+0x0/0x134 returned 0 after 0 usecs
[    4.923080] omap8250 2810000.serial: PM: calling
omap8250_suspend+0x0/0x144 @ 114, parent: bus@100000
[    4.932288] omap8250 2810000.serial: omap8250_suspend: 1634:
usage_count = 3
[    4.939323] omap8250 2810000.serial: PM: omap8250_suspend+0x0/0x144
returned 0 after 7038 usecs
[    4.948010] port 2800000.serial:0.0: PM: calling
pm_runtime_force_suspend+0x0/0x134 @ 114, parent: 2800000.serial:0
[    4.958433] port 2800000.serial:0.0: PM:
pm_runtime_force_suspend+0x0/0x134 returned 0 after 1 usecs
[    4.967557] omap8250 2800000.serial: PM: calling
omap8250_suspend+0x0/0x144 @ 114, parent: bus@100000
[    4.976764] omap8250 2800000.serial: omap8250_suspend: 1634:
usage_count = 4
[    4.983799] omap8250 2800000.serial: PM: omap8250_suspend+0x0/0x144
returned 0 after 7036 usecs
[    4.992488] port 40a00000.serial:0.0: PM: calling
pm_runtime_force_suspend+0x0/0x134 @ 114, parent: 40a00000.serial:0
[    5.003081] port 40a00000.serial:0.0: PM:
pm_runtime_force_suspend+0x0/0x134 returned 0 after 0 usecs
[    5.012291] omap8250 40a00000.serial: PM: calling
omap8250_suspend+0x0/0x144 @ 114, parent: bus@100000:bus@28380000
[    5.022714] omap8250 40a00000.serial: omap8250_suspend: 1634:
usage_count = 3
[    5.029836] omap8250 40a00000.serial: PM: omap8250_suspend+0x0/0x144
returned 0 after 7124 usecs

Regards,

Thomas

8< -------------------------------
diff --git a/drivers/tty/serial/8250/8250_omap.c
b/drivers/tty/serial/8250/8250_omap.c
index ca972fd37725..b978f12fd542 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -1631,6 +1631,9 @@ static int omap8250_suspend(struct device *dev)
                err = pm_runtime_force_suspend(dev);
        flush_work(&priv->qos_work);

+       dev_info(dev, "%s: %d: usage_count = %d\n",
+                __func__, __LINE__, pm_runtime_get_usage_count(dev));
+
        return err;
 }


> 
> Regards,
> 
> Tony
> 
> 8< -------------------------------
> diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
> --- a/include/linux/pm_runtime.h
> +++ b/include/linux/pm_runtime.h
> @@ -129,6 +129,11 @@ static inline void pm_runtime_get_noresume(struct device *dev)
>  	atomic_inc(&dev->power.usage_count);
>  }
>  
> +static inline int pm_runtime_get_usage_count(struct device *dev)
> +{
> +	return atomic_read(&dev->power.usage_count);
> +}
> +
>  /**
>   * pm_runtime_put_noidle - Drop runtime PM usage counter of a device.
>   * @dev: Target device.
Kevin Hilman Oct. 31, 2023, 5:34 p.m. UTC | #6
Tony Lindgren <tony@atomide.com> writes:

> * Thomas Richard <thomas.richard@bootlin.com> [231031 10:15]:
>> Please find below the logs of the test you asked me.
>
> OK great thanks!
>
>> I added the call of pm_runtime_get_usage_count at the end of the suspend
>> function.
>> The console is attached on 2800000.serial, it has usage_count=4.
>> Other serial has usage_count=3.
>
> So as suspected, it seems the power domain gets force suspended
> somewhere despite the usage_count.

I think the only way this could happen (excluding any bugs, of course)
would be for pm_runtime_force_suspend() to be getting called somewhere,
but I thought the earlier patch made that call conditional, so maybe
there is another path where that is getting called?

Kevin
Tony Lindgren Nov. 24, 2023, 5:37 a.m. UTC | #7
* Théo Lebrun <theo.lebrun@bootlin.com> [231122 14:47]:
> Hi Kevin Hilman, Tony Lindgren & Thomas Richard,
> 
> On Tue Oct 31, 2023 at 6:34 PM CET, Kevin Hilman wrote:
> > Tony Lindgren <tony@atomide.com> writes:
> > > * Thomas Richard <thomas.richard@bootlin.com> [231031 10:15]:
> > >> Please find below the logs of the test you asked me.
> > >
> > > OK great thanks!
> > >
> > >> I added the call of pm_runtime_get_usage_count at the end of the suspend
> > >> function.
> > >> The console is attached on 2800000.serial, it has usage_count=4.
> > >> Other serial has usage_count=3.
> > >
> > > So as suspected, it seems the power domain gets force suspended
> > > somewhere despite the usage_count.
> >
> > I think the only way this could happen (excluding any bugs, of course)
> > would be for pm_runtime_force_suspend() to be getting called somewhere,
> > but I thought the earlier patch made that call conditional, so maybe
> > there is another path where that is getting called?
> 
> I'm coming back on this topic as the upstream fix is less than ideal, as
> everyone here was agreeing.

Thanks for looking into it.

> I've had a look at the genpd code & power-domains get powered-off at
> suspend_noirq without caring about runtime PM refcounting. See
> genpd_suspend_noirq & genpd_finish_suspend.
> 
> Behavior is:
> 
>  - In all cases, call suspend_noirq on the underlying device.
>  - Stop now if device is in wakeup path & PD has the
>    GENPD_FLAG_ACTIVE_WAKEUP flag.
>  - If the PD has start & stop callbacks & is not runtime suspended, call
>    the stop callback on the device.
>  - Increment the count of suspended devices on this PD.
>  - If PD is already off or always on, stop.
>  - If this is the last device on this PD (and some other checks), then
>    do the PD power off (and maybe parent PDs).
> 
> The current patch sets the PD as always on at suspend. That would not
> work if our PD driver registered start/stop callbacks as those would
> get called.
> 
> The right solution to avoid getting the PD cut would be to mark the
> serials we want to keep alive as on the wakeup path using
> device_set_wakeup_path(dev) at suspend. That also requires the PD to be
> marked using the GENPD_FLAG_ACTIVE_WAKEUP flag, which is currently not
> the case.

Not sure if we unconditionally want to set GENPD_FLAG_ACTIVE_WAKEUP as
the pm_domain.h describes it with "Instructs genpd to keep the PM domain
powered". The power domain should not force suspend automatically if there
are active runtime PM users even without that flag..

> That last aspect is what I'm unsure of: should we add this flag to all
> power-domains created by ti_sci_pm_domains? Should we pass something
> from devicetree? I don't see the reason for not enabling this behavior
> by default?

To me this still seems like a workaround because of the active runtime
PM usage count in the consumer driver :)

And GENPD_FLAG_ACTIVE_WAKEUP should not need to be configured separately
from the runtime PM usage count in this case. Sure there may be other
cases where GENPD_FLAG_ACTIVE_WAKEUP needs to be set dynamically, but
we should understand why the power domain code thinks it's OK to shut
down the domain if it has active users.

Regards,

Tony
Théo Lebrun Nov. 24, 2023, 10:39 a.m. UTC | #8
Hello,

On Fri Nov 24, 2023 at 6:37 AM CET, Tony Lindgren wrote:
> * Théo Lebrun <theo.lebrun@bootlin.com> [231122 14:47]:
> > Hi Kevin Hilman, Tony Lindgren & Thomas Richard,
> > 
> > On Tue Oct 31, 2023 at 6:34 PM CET, Kevin Hilman wrote:
> > > Tony Lindgren <tony@atomide.com> writes:
> > > > * Thomas Richard <thomas.richard@bootlin.com> [231031 10:15]:
> > > >> Please find below the logs of the test you asked me.
> > > >
> > > > OK great thanks!
> > > >
> > > >> I added the call of pm_runtime_get_usage_count at the end of the suspend
> > > >> function.
> > > >> The console is attached on 2800000.serial, it has usage_count=4.
> > > >> Other serial has usage_count=3.
> > > >
> > > > So as suspected, it seems the power domain gets force suspended
> > > > somewhere despite the usage_count.
> > >
> > > I think the only way this could happen (excluding any bugs, of course)
> > > would be for pm_runtime_force_suspend() to be getting called somewhere,
> > > but I thought the earlier patch made that call conditional, so maybe
> > > there is another path where that is getting called?
> > 
> > I'm coming back on this topic as the upstream fix is less than ideal, as
> > everyone here was agreeing.
>
> Thanks for looking into it.
>
> > I've had a look at the genpd code & power-domains get powered-off at
> > suspend_noirq without caring about runtime PM refcounting. See
> > genpd_suspend_noirq & genpd_finish_suspend.
> > 
> > Behavior is:
> > 
> >  - In all cases, call suspend_noirq on the underlying device.
> >  - Stop now if device is in wakeup path & PD has the
> >    GENPD_FLAG_ACTIVE_WAKEUP flag.
> >  - If the PD has start & stop callbacks & is not runtime suspended, call
> >    the stop callback on the device.
> >  - Increment the count of suspended devices on this PD.
> >  - If PD is already off or always on, stop.
> >  - If this is the last device on this PD (and some other checks), then
> >    do the PD power off (and maybe parent PDs).
> > 
> > The current patch sets the PD as always on at suspend. That would not
> > work if our PD driver registered start/stop callbacks as those would
> > get called.
> > 
> > The right solution to avoid getting the PD cut would be to mark the
> > serials we want to keep alive as on the wakeup path using
> > device_set_wakeup_path(dev) at suspend. That also requires the PD to be
> > marked using the GENPD_FLAG_ACTIVE_WAKEUP flag, which is currently not
> > the case.
>
> Not sure if we unconditionally want to set GENPD_FLAG_ACTIVE_WAKEUP as
> the pm_domain.h describes it with "Instructs genpd to keep the PM domain
> powered". The power domain should not force suspend automatically if there
> are active runtime PM users even without that flag..

Hey don't forgot the important part!

> Instructs genpd to keep the PM domain powered on, in case any of its
> attached devices is used in the wakeup path to serve system wakeups.

Checking the code confirms this behavior. Grep for the macro
genpd_is_active_wakeup rather than GENPD_FLAG_ACTIVE_WAKEUP. It gets
used twice (suspend & resume), both in the same manner:

   if (device_wakeup_path(dev) && genpd_is_active_wakeup(genpd))

This means this flag won't have any impact on runtime PM handling of
power-domains. By the way, other users of the flag enable it at PD
registration & don't touch it afterwards.

> > That last aspect is what I'm unsure of: should we add this flag to all
> > power-domains created by ti_sci_pm_domains? Should we pass something
> > from devicetree? I don't see the reason for not enabling this behavior
> > by default?
>
> To me this still seems like a workaround because of the active runtime
> PM usage count in the consumer driver :)
>
> And GENPD_FLAG_ACTIVE_WAKEUP should not need to be configured separately
> from the runtime PM usage count in this case. Sure there may be other
> cases where GENPD_FLAG_ACTIVE_WAKEUP needs to be set dynamically, but
> we should understand why the power domain code thinks it's OK to shut
> down the domain if it has active users.

There is currently nothing that links the runtime PM refcount to whether
or not the power-domains get powered-off at suspend_noirq. Should that
change? Maybe, but it would be a big behavioral change.

Regards,

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
Tony Lindgren Nov. 24, 2023, 10:54 a.m. UTC | #9
* Théo Lebrun <theo.lebrun@bootlin.com> [231124 10:39]:
> On Fri Nov 24, 2023 at 6:37 AM CET, Tony Lindgren wrote:
> Checking the code confirms this behavior. Grep for the macro
> genpd_is_active_wakeup rather than GENPD_FLAG_ACTIVE_WAKEUP. It gets
> used twice (suspend & resume), both in the same manner:
> 
>    if (device_wakeup_path(dev) && genpd_is_active_wakeup(genpd))
> 
> This means this flag won't have any impact on runtime PM handling of
> power-domains. By the way, other users of the flag enable it at PD
> registration & don't touch it afterwards.

Setting GENPD_FLAG_ACTIVE_WAKEUP will block deeper idle states for
the SoC most likely.

> There is currently nothing that links the runtime PM refcount to whether
> or not the power-domains get powered-off at suspend_noirq. Should that
> change? Maybe, but it would be a big behavioral change.

For managing GENPD_FLAG_ACTIVE_WAKEUP dynamically, maybe something like:

device_request_wakeup_path(dev)
...
device_free_wakeup_path(dev)

And those would toggle GENPD_FLAG_ACTIVE_WAKEUP for the power domain
based on some usage count?

Regards,

Tony
Kevin Hilman Aug. 9, 2024, 7:04 p.m. UTC | #10
Thomas Richard <thomas.richard@bootlin.com> writes:

> If the console suspend is disabled, the genpd of the console shall not
> be powered-off during suspend.
> Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
> suspend, and restore the original value during the resume.
>
> Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>

Hmm, this patch got merged upstream (commit 68e6939ea9ec) even after
disagreements about the approach.

Even worse, it actually causes a crash during suspend on platforms that
don't use PM domains (like AM335x Beaglebone Black.)

Details on why this crashes below.

Thomas, could you please submit a revert for this (with a Fixes: tag)
and then follow up with the approach as discussed later in this thread?

> ---
>  drivers/tty/serial/8250/8250_omap.c | 33 ++++++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
> index ca972fd37725..91a483dc460c 100644
> --- a/drivers/tty/serial/8250/8250_omap.c
> +++ b/drivers/tty/serial/8250/8250_omap.c
> @@ -27,6 +27,7 @@
>  #include <linux/pm_wakeirq.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/sys_soc.h>
> +#include <linux/pm_domain.h>
>  
>  #include "8250.h"
>  
> @@ -114,6 +115,12 @@
>  /* RX FIFO occupancy indicator */
>  #define UART_OMAP_RX_LVL		0x19
>  
> +/*
> + * Copy of the genpd flags for the console.
> + * Only used if console suspend is disabled
> + */
> +static unsigned int genpd_flags_console;
> +
>  struct omap8250_priv {
>  	void __iomem *membase;
>  	int line;
> @@ -1617,6 +1624,7 @@ static int omap8250_suspend(struct device *dev)
>  {
>  	struct omap8250_priv *priv = dev_get_drvdata(dev);
>  	struct uart_8250_port *up = serial8250_get_port(priv->line);
> +	struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);

This genpd ptr will be NULL on am335x, and any other legacy platforms
that don't use PM domains, and then...

>  	int err = 0;
>  
>  	serial8250_suspend_port(priv->line);
> @@ -1627,8 +1635,19 @@ static int omap8250_suspend(struct device *dev)
>  	if (!device_may_wakeup(dev))
>  		priv->wer = 0;
>  	serial_out(up, UART_OMAP_WER, priv->wer);
> -	if (uart_console(&up->port) && console_suspend_enabled)
> -		err = pm_runtime_force_suspend(dev);
> +	if (uart_console(&up->port)) {
> +		if (console_suspend_enabled)
> +			err = pm_runtime_force_suspend(dev);
> +		else {
> +			/*
> +			 * The pd shall not be powered-off (no console suspend).
> +			 * Make copy of genpd flags before to set it always on.
> +			 * The original value is restored during the resume.
> +			 */
> +			genpd_flags_console = genpd->flags;
> +			genpd->flags |= GENPD_FLAG_ALWAYS_ON;

... BOOM.

> +		}
> +	}
>  	flush_work(&priv->qos_work);
>  
>  	return err;

Kevin
Kevin Hilman Aug. 13, 2024, 5:18 p.m. UTC | #11
Greg KH <gregkh@linuxfoundation.org> writes:

> On Fri, Aug 09, 2024 at 12:04:23PM -0700, Kevin Hilman wrote:
>> Thomas Richard <thomas.richard@bootlin.com> writes:
>> 
>> > If the console suspend is disabled, the genpd of the console shall not
>> > be powered-off during suspend.
>> > Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
>> > suspend, and restore the original value during the resume.
>> >
>> > Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
>> 
>> Hmm, this patch got merged upstream (commit 68e6939ea9ec) even after
>> disagreements about the approach.
>> 
>> Even worse, it actually causes a crash during suspend on platforms that
>> don't use PM domains (like AM335x Beaglebone Black.)
>> 
>> Details on why this crashes below.
>> 
>> Thomas, could you please submit a revert for this (with a Fixes: tag)
>> and then follow up with the approach as discussed later in this thread?
>
> Did this revert happen yet?

No.

Could you revert it since it's caused regressions?  I will follow up
with Thomas on the right fix for the original issue (as discussed later
in this thread.)

Thanks,

Kevin
Thomas Richard Aug. 20, 2024, 9:15 a.m. UTC | #12
On 8/13/24 19:18, Kevin Hilman wrote:
> Greg KH <gregkh@linuxfoundation.org> writes:
> 
>> On Fri, Aug 09, 2024 at 12:04:23PM -0700, Kevin Hilman wrote:
>>> Thomas Richard <thomas.richard@bootlin.com> writes:
>>>
>>>> If the console suspend is disabled, the genpd of the console shall not
>>>> be powered-off during suspend.
>>>> Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
>>>> suspend, and restore the original value during the resume.
>>>>
>>>> Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
>>>
>>> Hmm, this patch got merged upstream (commit 68e6939ea9ec) even after
>>> disagreements about the approach.
>>>
>>> Even worse, it actually causes a crash during suspend on platforms that
>>> don't use PM domains (like AM335x Beaglebone Black.)
>>>
>>> Details on why this crashes below.
>>>
>>> Thomas, could you please submit a revert for this (with a Fixes: tag)
>>> and then follow up with the approach as discussed later in this thread?
>>
>> Did this revert happen yet?
> 
> No.
> 
> Could you revert it since it's caused regressions?  I will follow up
> with Thomas on the right fix for the original issue (as discussed later
> in this thread.)

Hello Kevin,

The series, which implements Théo's proposal, was ready to be sent when
I tried to go back to suspend after a resume. This causes a kernel panic
during the suspend.

There is the issue in both cases (console suspend and no console suspend).

The issue comes from the other uarts (not the one used by the console).
If I disable all other uarts, there is no panic.

It seems the uarts are not resumed correctly.

More investigation is needed.

Regards,

Thomas

[  145.658789] port 2830000.serial:0.0: PM: calling
pm_runtime_force_suspend @ 133, parent: 2830000.serial:0
[  145.668341] port 2830000.serial:0.0: PM: pm_runtime_force_suspend
returned 0 after 0 usecs
[  145.676598] omap8250 2830000.serial: PM: calling omap8250_suspend @
133, parent: bus@100000
[  145.684946] SError Interrupt on CPU0, code 0x00000000bf000000 -- SError
[  145.684951] CPU: 0 UID: 0 PID: 133 Comm: sh Not tainted
6.11.0-rc4-00020-g0b5cbc0defae-dirty #27
[  145.684956] Hardware name: Texas Instruments J7200 EVM (DT)
[  145.684959] pstate: a00000c5 (NzCv daIF -PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[  145.684962] pc : __flush_work+0x1f0/0x2d8
[  145.684968] lr : __flush_work+0x9c/0x2d8
[  145.684971] sp : ffff8000840e39d0
[  145.684973] x29: ffff8000840e39d0 x28: ffff000804450000 x27:
ffff8000835f1000
[  145.684979] x26: 00000021acb0bb98 x25: ffff800083580090 x24:
0000000000c00000
[  145.684983] x23: 0000000000000000 x22: 0000000000000001 x21:
ffff00087f6cbe40
[  145.684988] x20: ffff0008012e80e8 x19: ffff00080013d410 x18:
0000000000000006
[  145.684993] x17: 726170202c333331 x16: 204020646e657073 x15:
0720072007200720
[  145.684997] x14: 0720072007200720 x13: ffff8000830a4660 x12:
000000000000172b
[  145.685002] x11: 0000000000000040 x10: ffff8000830a19e8 x9 :
ffff8000830a19e0
[  145.685006] x8 : ffff000800400028 x7 : 0000000000000000 x6 :
0000000000000000
[  145.685011] x5 : ffff000800400000 x4 : 0000000000000000 x3 :
0000000000000001
[  145.685015] x2 : 0000000000000000 x1 : 0000000000000000 x0 :
ffff00087f6cbe40
[  145.685020] Kernel panic - not syncing: Asynchronous SError Interrupt
[  145.685023] CPU: 0 UID: 0 PID: 133 Comm: sh Not tainted
6.11.0-rc4-00020-g0b5cbc0defae-dirty #27
[  145.685027] Hardware name: Texas Instruments J7200 EVM (DT)
[  145.685029] Call trace:
[  145.685031]  dump_backtrace+0x94/0xec
[  145.685038]  show_stack+0x18/0x24
[  145.685042]  dump_stack_lvl+0x38/0x90
[  145.685049]  dump_stack+0x18/0x24
[  145.685052]  panic+0x38c/0x3c0
[  145.685058]  nmi_panic+0x40/0x8c
[  145.685063]  arm64_serror_panic+0x64/0x70
[  145.685067]  do_serror+0x3c/0x78
[  145.685069]  el1h_64_error_handler+0x30/0x48
[  145.685075]  el1h_64_error+0x64/0x68
[  145.685078]  __flush_work+0x1f0/0x2d8
[  145.685080]  flush_work+0x14/0x20
[  145.685083]  omap8250_suspend+0x84/0x130
[  145.685087]  dpm_run_callback.constprop.0+0x74/0x134
[  145.685092]  device_suspend+0x110/0x400
[  145.685096]  dpm_suspend+0x10c/0x19c
[  145.685100]  dpm_suspend_start+0x70/0x78
[  145.685104]  suspend_devices_and_enter+0x12c/0x634
[  145.685110]  pm_suspend+0x160/0x270
[  145.685115]  state_store+0x80/0xec
[  145.685119]  kobj_attr_store+0x18/0x2c
[  145.685122]  sysfs_kf_write+0x44/0x54
[  145.685129]  kernfs_fop_write_iter+0x120/0x1ec
[  145.685134]  vfs_write+0x238/0x370
[  145.685140]  ksys_write+0x70/0x104
[  145.685143]  __arm64_sys_write+0x1c/0x28
[  145.685147]  invoke_syscall+0x48/0x110
[  145.685155]  el0_svc_common.constprop.0+0x40/0xe0
[  145.685159]  do_el0_svc+0x1c/0x28
[  145.685164]  el0_svc+0x34/0xdc
[  145.685168]  el0t_64_sync_handler+0x100/0x12c
[  145.685173]  el0t_64_sync+0x190/0x194
[  145.685176] SMP: stopping secondary CPUs
[  145.685181] Kernel Offset: disabled
[  145.685183] CPU features: 0x04,00001001,40100000,4200421b
[  145.685186] Memory Limit: none
[  145.953475] ---[ end Kernel panic - not syncing: Asynchronous SError
Interrupt ]---
Thomas Richard Sept. 16, 2024, 2:03 p.m. UTC | #13
On 8/20/24 11:15, Thomas Richard wrote:
> On 8/13/24 19:18, Kevin Hilman wrote:
>> Greg KH <gregkh@linuxfoundation.org> writes:
>>
>>> On Fri, Aug 09, 2024 at 12:04:23PM -0700, Kevin Hilman wrote:
>>>> Thomas Richard <thomas.richard@bootlin.com> writes:
>>>>
>>>>> If the console suspend is disabled, the genpd of the console shall not
>>>>> be powered-off during suspend.
>>>>> Set the flag GENPD_FLAG_ALWAYS_ON to the corresponding genpd during
>>>>> suspend, and restore the original value during the resume.
>>>>>
>>>>> Signed-off-by: Thomas Richard <thomas.richard@bootlin.com>
>>>>
>>>> Hmm, this patch got merged upstream (commit 68e6939ea9ec) even after
>>>> disagreements about the approach.
>>>>
>>>> Even worse, it actually causes a crash during suspend on platforms that
>>>> don't use PM domains (like AM335x Beaglebone Black.)
>>>>
>>>> Details on why this crashes below.
>>>>
>>>> Thomas, could you please submit a revert for this (with a Fixes: tag)
>>>> and then follow up with the approach as discussed later in this thread?
>>>
>>> Did this revert happen yet?
>>
>> No.
>>
>> Could you revert it since it's caused regressions?  I will follow up
>> with Thomas on the right fix for the original issue (as discussed later
>> in this thread.)
> 
> Hello Kevin,
> 
> The series, which implements Théo's proposal, was ready to be sent when
> I tried to go back to suspend after a resume. This causes a kernel panic
> during the suspend.
> 
> There is the issue in both cases (console suspend and no console suspend).
> 
> The issue comes from the other uarts (not the one used by the console).
> If I disable all other uarts, there is no panic.
> 
> It seems the uarts are not resumed correctly.
> 
> More investigation is needed.

Hello Kevin,

I finally found the culprit.
Just adding the GENPD_FLAG_ACTIVE_WAKEUP flag by default caused the
panic, even without the call of device_set_wakeup_path().

With some debug, I found that the wakeup_path of all my uarts (even
other uarts not used by the console) was set.
So the corresponding power domains were not powered off [1].
Consequently they are not powered on during the resume [2].

But on my platform (J7200), the SoC is fully off during suspend-to-ram.
Even if a power domain is not powered off by Linux, at the end all the
SoC is off.
And if Linux doesn't power off a power domain, it doesn't power on it.

The wakeup_path of all my uarts is set here [3] because devices are
wakeup capable [4].
It was added by commit 8512220c5782d [5].
If I remove the wakeup_path modification (diff at the end of the email),
Théo's proposal works well. But it's probably too rough, and I have no
idea about the impact on other platforms.

If you have an idea to fix correctly this wakeup_path issue, please let
me know :)

[1]
https://elixir.bootlin.com/linux/v6.11/source/drivers/pmdomain/core.c#L1372
[2]
https://elixir.bootlin.com/linux/v6.11/source/drivers/pmdomain/core.c#L1427
[3]
https://elixir.bootlin.com/linux/v6.10.10/source/drivers/base/power/main.c#L1687
[4]
https://elixir.bootlin.com/linux/v6.11/source/drivers/tty/serial/8250/8250_omap.c#L1526
[5]
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8512220c5782d

Best Regards,

Thomas

8< -------------------------------
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 4a67e83300e1..e3d9153a9a81 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1683,9 +1683,6 @@ static int device_suspend(struct device *dev,
pm_message_t state, bool async)
  End:
        if (!error) {
                dev->power.is_suspended = true;
-               if (device_may_wakeup(dev))
-                       dev->power.wakeup_path = true;
-
                dpm_propagate_wakeup_to_parent(dev);
                dpm_clear_superiors_direct_complete(dev);
        }
@@ -1795,8 +1792,6 @@ static int device_prepare(struct device *dev,
pm_message_t state)

        device_lock(dev);

-       dev->power.wakeup_path = false;
-
        if (dev->power.no_pm_callbacks)
                goto unlock;
Kevin Hilman Oct. 4, 2024, 7:23 p.m. UTC | #14
Thomas Richard <thomas.richard@bootlin.com> writes:

[...]

> I finally found the culprit.
> Just adding the GENPD_FLAG_ACTIVE_WAKEUP flag by default caused the
> panic, even without the call of device_set_wakeup_path().
>
> With some debug, I found that the wakeup_path of all my uarts (even
> other uarts not used by the console) was set.
> So the corresponding power domains were not powered off [1].
> Consequently they are not powered on during the resume [2].
>
> But on my platform (J7200), the SoC is fully off during suspend-to-ram.
> Even if a power domain is not powered off by Linux, at the end all the
> SoC is off.
> And if Linux doesn't power off a power domain, it doesn't power on it.

OK, so the core of the problem is here.  The SoC firmware is going to
power of the domain during system-wide suspend, no matter what Linux
tries to do.  So there's a fundamental disconnect between what Linux
thinks is the state of the power domain and the actual hardware state.
I'm not sure how to work around that other than if the firmware is
always going to power off the domain, then GENPD_FLAG_ACTIVE_WAKEUP 
cannot be used for that domain, since it's effectively ignored.

> The wakeup_path of all my uarts is set here [3] because devices are
> wakeup capable [4].

Ok, but now This behavior is now changed in v6.12-rc1[6] (introduced by
this[7] series.)  Now, UARTs default to wakeup capable (not enabled),
and wakeups are only enabled if the DT property `wakeup-source` is used.

This should at least allow you to focus on only UARTs that are intended
as wakeup sources for the platform, instead of the current (pre v.6.12) behavior 
where all UARTs default to wakeup-enabled.

Kevin

> It was added by commit 8512220c5782d [5].
> If I remove the wakeup_path modification (diff at the end of the email),
> Théo's proposal works well. But it's probably too rough, and I have no
> idea about the impact on other platforms.
>
> If you have an idea to fix correctly this wakeup_path issue, please let
> me know :)
>
> [1]
> https://elixir.bootlin.com/linux/v6.11/source/drivers/pmdomain/core.c#L1372
> [2]
> https://elixir.bootlin.com/linux/v6.11/source/drivers/pmdomain/core.c#L1427
> [3]
> https://elixir.bootlin.com/linux/v6.10.10/source/drivers/base/power/main.c#L1687
> [4]
> https://elixir.bootlin.com/linux/v6.11/source/drivers/tty/serial/8250/8250_omap.c#L1526
> [5]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8512220c5782d

[6] https://elixir.bootlin.com/linux/v6.12-rc1/source/drivers/tty/serial/8250/8250_omap.c#L1525
[7] https://lore.kernel.org/all/20240807141227.1093006-1-msp@baylibre.com/
diff mbox series

Patch

diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index ca972fd37725..91a483dc460c 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -27,6 +27,7 @@ 
 #include <linux/pm_wakeirq.h>
 #include <linux/dma-mapping.h>
 #include <linux/sys_soc.h>
+#include <linux/pm_domain.h>
 
 #include "8250.h"
 
@@ -114,6 +115,12 @@ 
 /* RX FIFO occupancy indicator */
 #define UART_OMAP_RX_LVL		0x19
 
+/*
+ * Copy of the genpd flags for the console.
+ * Only used if console suspend is disabled
+ */
+static unsigned int genpd_flags_console;
+
 struct omap8250_priv {
 	void __iomem *membase;
 	int line;
@@ -1617,6 +1624,7 @@  static int omap8250_suspend(struct device *dev)
 {
 	struct omap8250_priv *priv = dev_get_drvdata(dev);
 	struct uart_8250_port *up = serial8250_get_port(priv->line);
+	struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
 	int err = 0;
 
 	serial8250_suspend_port(priv->line);
@@ -1627,8 +1635,19 @@  static int omap8250_suspend(struct device *dev)
 	if (!device_may_wakeup(dev))
 		priv->wer = 0;
 	serial_out(up, UART_OMAP_WER, priv->wer);
-	if (uart_console(&up->port) && console_suspend_enabled)
-		err = pm_runtime_force_suspend(dev);
+	if (uart_console(&up->port)) {
+		if (console_suspend_enabled)
+			err = pm_runtime_force_suspend(dev);
+		else {
+			/*
+			 * The pd shall not be powered-off (no console suspend).
+			 * Make copy of genpd flags before to set it always on.
+			 * The original value is restored during the resume.
+			 */
+			genpd_flags_console = genpd->flags;
+			genpd->flags |= GENPD_FLAG_ALWAYS_ON;
+		}
+	}
 	flush_work(&priv->qos_work);
 
 	return err;
@@ -1638,12 +1657,16 @@  static int omap8250_resume(struct device *dev)
 {
 	struct omap8250_priv *priv = dev_get_drvdata(dev);
 	struct uart_8250_port *up = serial8250_get_port(priv->line);
+	struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
 	int err;
 
 	if (uart_console(&up->port) && console_suspend_enabled) {
-		err = pm_runtime_force_resume(dev);
-		if (err)
-			return err;
+		if (console_suspend_enabled) {
+			err = pm_runtime_force_resume(dev);
+			if (err)
+				return err;
+		} else
+			genpd->flags = genpd_flags_console;
 	}
 
 	serial8250_resume_port(priv->line);