diff mbox series

[RFC,v2,2/3] clk: qcom: implement RCG2 'parked' clock support

Message ID 20231004012308.2305273-3-dmitry.baryshkov@linaro.org
State New
Headers show
Series clk: qcom: provide alternative 'parked' RCG | expand

Commit Message

Dmitry Baryshkov Oct. 4, 2023, 1:23 a.m. UTC
clk_rcg2_shared_ops implements support for the case of the RCG which
must not be completely turned off. However its design has one major
drawback: it doesn't allow us to properly implement the is_enabled
callback, which causes different kinds of misbehaviour from the CCF.

Follow the idea behind clk_regmap_phy_mux_ops and implement the new
clk_rcg2_parked_ops. It also targets the clocks which must not be fully
switched off (and shared most of the implementation with
clk_rcg2_shared_ops). The major difference is that it requires that the
parent map doesn't conain the safe (parked) clock source. Instead if the
CFG_REG register points to the safe source, the clock is considered to
be disabled.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
---
 drivers/clk/qcom/clk-rcg.h  |  1 +
 drivers/clk/qcom/clk-rcg2.c | 56 +++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

Comments

Stephen Boyd Nov. 4, 2023, 1:24 a.m. UTC | #1
Quoting Dmitry Baryshkov (2023-10-03 18:23:07)
> clk_rcg2_shared_ops implements support for the case of the RCG which
> must not be completely turned off. However its design has one major
> drawback: it doesn't allow us to properly implement the is_enabled
> callback, which causes different kinds of misbehaviour from the CCF.
> 
> Follow the idea behind clk_regmap_phy_mux_ops and implement the new
> clk_rcg2_parked_ops. It also targets the clocks which must not be fully
> switched off (and shared most of the implementation with
> clk_rcg2_shared_ops). The major difference is that it requires that the
> parent map doesn't conain the safe (parked) clock source. Instead if the
> CFG_REG register points to the safe source, the clock is considered to
> be disabled.
> 
> Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
> ---
>  drivers/clk/qcom/clk-rcg.h  |  1 +
>  drivers/clk/qcom/clk-rcg2.c | 56 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 57 insertions(+)
> 
> diff --git a/drivers/clk/qcom/clk-rcg.h b/drivers/clk/qcom/clk-rcg.h
> index e6d84c8c7989..9fbbf1251564 100644
> --- a/drivers/clk/qcom/clk-rcg.h
> +++ b/drivers/clk/qcom/clk-rcg.h
> @@ -176,6 +176,7 @@ extern const struct clk_ops clk_byte2_ops;
>  extern const struct clk_ops clk_pixel_ops;
>  extern const struct clk_ops clk_gfx3d_ops;
>  extern const struct clk_ops clk_rcg2_shared_ops;
> +extern const struct clk_ops clk_rcg2_parked_ops;
>  extern const struct clk_ops clk_dp_ops;
>  
>  struct clk_rcg_dfs_data {
> diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
> index 5183c74b074f..fc75e2bc2d70 100644
> --- a/drivers/clk/qcom/clk-rcg2.c
> +++ b/drivers/clk/qcom/clk-rcg2.c
> @@ -5,6 +5,7 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/bitops.h>
> +#include <linux/bitfield.h>
>  #include <linux/err.h>
>  #include <linux/bug.h>
>  #include <linux/export.h>
> @@ -1150,6 +1151,61 @@ const struct clk_ops clk_rcg2_shared_ops = {
>  };
>  EXPORT_SYMBOL_GPL(clk_rcg2_shared_ops);
>  
> +static int clk_rcg2_parked_is_enabled(struct clk_hw *hw)
> +{
> +       struct clk_rcg2 *rcg = to_clk_rcg2(hw);
> +       u32 cmd, cfg;
> +       int ret;
> +
> +       ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CMD_REG, &cmd);
> +       if (ret)
> +               return ret;
> +
> +       if ((cmd & CMD_ROOT_EN) == 0)
> +               return false;

return 0?

CMD_ROOT_OFF can be 0 and CMD_ROOT_EN can also be 0 at the same time.
When that happens, some branch child clk is enabled and the rcg is
actually enabled. There's a hardware feedback mechanism from the
branches to the rcg so the rcg is guaranteed enabled. I'm trying to say
that this bit is unreliable on its own, so we need to take care here. In
fact, this bit is only used as a software override to make sure the
branches don't turn off the rcg inadvertently.

What if a branch is enabled, but the rcg root_en bit isn't set, and XO
is used? In that case, this will report the clk as disabled when it's
really enabled. That will look confusing to the clk framework because a
child will be enabled without the parent being enabled. Things will
probably still work though, because this only matters during disabling
unused clks.

Maybe it's better to not implement an is_enabled() callback for this clk
and simply call a function to see which parent the hardware is using (XO
or not). Basically don't go through clk_hw_is_enabled() and just call
clk_rcg2_parked_is_enabled() directly wherever the clk_hw API is used.
Then the framework doesn't get confused about enabled children with
disabled parents, but the downside is that the framework doesn't know if
the rcg is enabled. This is most likely fine though because an enabled
rcg doesn't really make a difference. The important thing is knowing
which branches are enabled at the framework level. Furthermore, the
framework doesn't currently handle propagating up the enable state at
boot to parents, so if say we have a child branch that is enabled, the
enable state of the parent _must_ be enabled as well, or the branch is
wedged and the only way to unwedge that is to enable the parent. It's
quite a mess!

Long story short, I question why we need to implement is_enabled() for
this clk. What's the benefit? The branches being off is more important
if we're concerned about saving power. There's the problem of handing
off enable state from when the driver probes, but that's not so easy to
solve given that a branch could be enabled (or a branch could be enabled
that isn't even known to linux). And it also sort of doesn't matter
because we know XO is practically always enabled and what really matters
is making sure the driver can't wedge the RCG by changing the source to
something that isn't enabled if it thinks the RCG is disabled when it is
really enabled.

That's sort of the only rule here, don't write the hardware when the
current parent isn't enabled or the new parent isn't enabled. We don't
know if the rcg is ever enabled, so we can only write the "go bit"
(CMD_UPDATE) when we're 100% certain that the parent (or next parent
when switching) is enabled. XO we know is always enabled, but otherwise
we don't know unless the framework has enabled the clk (and therefore
implicitly enabled the parent). The set_rate op could be called from
either enabled or disabled state, same for the set_parent op. And we
want the other clk APIs to report the state of the clk (like the parent
or rate) even if the hardware hasn't been changed.

> +
> +       ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG, &cfg);
> +       if (ret)
> +               return ret;
> +
> +       return FIELD_GET(CFG_SRC_SEL_MASK, cfg) != rcg->safe_src_index;
> +}
> +
> +static int clk_rcg2_parked_init(struct clk_hw *hw)
> +{
> +       struct clk_rcg2 *rcg = to_clk_rcg2(hw);
> +       const struct freq_tbl *f = rcg->freq_tbl;
> +
> +       regmap_read(rcg->clkr.regmap, RCG_CFG_OFFSET(rcg), &rcg->parked_cfg);

I need this part today to fix a stuck clk problem I see on trogdor.lazor
where during registration a call to clk_ops::get_parent() sees the clk
isn't enabled at boot (because there isn't a clk_ops::is_enabled()
function) so clk_rcg2_shared_get_parent() reads the parent from the
'parked_cfg' value, which is zero. If the hardware actually has non-zero
for the parent then the framework will get the wrong parent, which is
what happens on trogdor when the devmode screen is shown. The parent is
the display PLL instead of XO. I haven't dug far enough to understand
why disabling unused clks wedges the branch when we try to enable it
again, but not disabling unused clks fixes the problem or reading the
config register at registration to get the proper parent also fixes it.
I guess the problem is that we're switching the RCG value when we
shouldn't be doing that.

> +
> +       if (FIELD_GET(CFG_SRC_SEL_MASK, rcg->parked_cfg) != rcg->safe_src_index)
> +               return 0;
> +
> +       if (WARN_ON(!f) ||
> +           WARN_ON(qcom_find_src_cfg(hw, rcg->parent_map, f->src) == rcg->safe_src_index))
> +               return -EINVAL;
> +
> +       return __clk_rcg2_configure(rcg, f, &rcg->parked_cfg);

It would be good to have a comment above this like

	/*
	 * Dirty the rcg registers to point at the first frequency table
	 * entry which is guaranteed to not use the safe_src_index.
	 * Setting the rate of the clk with rcg registers containing the
	 * safe_src_index will confuse clk_rcg2_parked_is_enabled() as
	 * to the enable state and lead to actually changing the rate of
	 * the clk when it isn't enabled.
	 */

> +}
> +
> +/*
> + * Unlike clk_rcg2_shared_ops, the safe_src_index aka XO must NOT be present in
> + * parent_map. This allows us to implement proper is_enabled callback.

We could also modify clk_ops::set_rate() and clk_ops::determine_rate()
to ignore frequency table entries with the safe_src_index, so that no
driver can change the frequency to be XO. Then XO is still "reserved",
and it still means the clk is disabled when the parent is XO, but we
don't have to change the RCG registers during clk_rcg2_parked_init() to
move off the safe_src/XO parent. We also have to prevent the parent from
being set to XO with clk_set_parent(). That should be doable by failing
the clk_ops::set_parent() op when the parent is XO.

I'd actually prefer that approach if it's workable, so that we don't
dirty the RCG registers during clk registration. I think qcom folks were
unhappy with the rcg registers being dirty for a long time
(CMD_DIRTY_CFG), because the other entity (the gdsc?) was triggering the
rcg switch (CMD_UPDATE) and that was causing the wrong parent to be
used.

I still come back to the why question though. What are we gaining by
implementing is_enabled for this clk?

> + */
> +const struct clk_ops clk_rcg2_parked_ops = {
> +       .init = clk_rcg2_parked_init,
> +       .is_enabled = clk_rcg2_parked_is_enabled,
> +       .enable = clk_rcg2_shared_enable,
> +       .disable = clk_rcg2_shared_disable,
> +       .get_parent = clk_rcg2_shared_get_parent,
> +       .set_parent = clk_rcg2_shared_set_parent,
> +       .recalc_rate = clk_rcg2_shared_recalc_rate,
> +       .determine_rate = clk_rcg2_determine_rate,
> +       .set_rate = clk_rcg2_shared_set_rate,
> +       .set_rate_and_parent = clk_rcg2_shared_set_rate_and_parent,
> +};
> +EXPORT_SYMBOL_GPL(clk_rcg2_parked_ops);
Stephen Boyd Nov. 7, 2023, 1:36 a.m. UTC | #2
Quoting Stephen Boyd (2023-11-03 18:24:47)
> Quoting Dmitry Baryshkov (2023-10-03 18:23:07)
> > +
> > +       ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG, &cfg);
> > +       if (ret)
> > +               return ret;
> > +
> > +       return FIELD_GET(CFG_SRC_SEL_MASK, cfg) != rcg->safe_src_index;
> > +}
> > +
> > +static int clk_rcg2_parked_init(struct clk_hw *hw)
> > +{
> > +       struct clk_rcg2 *rcg = to_clk_rcg2(hw);
> > +       const struct freq_tbl *f = rcg->freq_tbl;
> > +
> > +       regmap_read(rcg->clkr.regmap, RCG_CFG_OFFSET(rcg), &rcg->parked_cfg);
> 
> I need this part today to fix a stuck clk problem I see on trogdor.lazor
> where during registration a call to clk_ops::get_parent() sees the clk
> isn't enabled at boot (because there isn't a clk_ops::is_enabled()
> function) so clk_rcg2_shared_get_parent() reads the parent from the
> 'parked_cfg' value, which is zero. If the hardware actually has non-zero
> for the parent then the framework will get the wrong parent, which is
> what happens on trogdor when the devmode screen is shown. The parent is
> the display PLL instead of XO. I haven't dug far enough to understand
> why disabling unused clks wedges the branch when we try to enable it
> again, but not disabling unused clks fixes the problem or reading the
> config register at registration to get the proper parent also fixes it.
> I guess the problem is that we're switching the RCG value when we
> shouldn't be doing that.
> 

I looked at this more today. It seems that I need to both read the
config register at init and also move over the rcg to the safe source at
init (i.e. park the clk at init). That's doable with a call to
clk_rcg2_shared_disable() for the clk_ops::init callback. Otherwise I
get a stuck clk warning.

Either

 disp_cc_mdss_mdp_clk status stuck at 'off'

or

 disp_cc_mdss_rot_clk status stuck at 'on'

When I don't park the rcg, the disp_cc_mdss_rot_clk gets stuck during
disabling of unused clks. I think I understand that problem. What
happens is disp_cc_mdss_mdp_clk_src and disp_cc_mdss_rot_clk_src are
both sourcing from disp_cc_pll0 at boot. Fixing the parent mapping makes
it so that enabling and then disabling disp_cc_mdss_ahb_clk causes
disp_cc_pll0 to be turned off when disp_cc_mdss_rot_clk_src is sourced
from it (and the branch disp_cc_mdss_rot_clk is enabled). If we park
both the rcgs at clk registration time we avoid this problem because the
PLL is disabled but nothing is actually a child clk. The act of reading
the config register and stashing that in the 'parked_cfg' only helps
avoid duplicate calls to change the rate, and doesn't help when we try
to repoint the clk at XO when the parent PLL is off.

The part I still don't understand is why reading the config register at
init and stashing that in 'parked_cfg' fixes the disp_cc_mdss_mdp_clk
stuck at off problem. I see that the branch clk is turned off and on
many times during boot and there aren't any warnings regardless of
stashing the config register. That means we should be moving the RCG to
XO source, between forcibly enabling and disabling the RCG, which
presumably would complain about being unable to update the config
register, but it doesn't. Only after late init does the clk fail to
enable, and the source is still XO at that time. Something else must be
happening that wedges the branch to the point that it can't be
recovered. But simply reporting the proper parent is enough for
disp_cc_mdss_mdp_clk.
Dmitry Baryshkov Nov. 7, 2023, 2 a.m. UTC | #3
On Tue, 7 Nov 2023 at 03:36, Stephen Boyd <sboyd@kernel.org> wrote:
>
> Quoting Stephen Boyd (2023-11-03 18:24:47)
> > Quoting Dmitry Baryshkov (2023-10-03 18:23:07)
> > > +
> > > +       ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG, &cfg);
> > > +       if (ret)
> > > +               return ret;
> > > +
> > > +       return FIELD_GET(CFG_SRC_SEL_MASK, cfg) != rcg->safe_src_index;
> > > +}
> > > +
> > > +static int clk_rcg2_parked_init(struct clk_hw *hw)
> > > +{
> > > +       struct clk_rcg2 *rcg = to_clk_rcg2(hw);
> > > +       const struct freq_tbl *f = rcg->freq_tbl;
> > > +
> > > +       regmap_read(rcg->clkr.regmap, RCG_CFG_OFFSET(rcg), &rcg->parked_cfg);
> >
> > I need this part today to fix a stuck clk problem I see on trogdor.lazor
> > where during registration a call to clk_ops::get_parent() sees the clk
> > isn't enabled at boot (because there isn't a clk_ops::is_enabled()
> > function) so clk_rcg2_shared_get_parent() reads the parent from the
> > 'parked_cfg' value, which is zero. If the hardware actually has non-zero
> > for the parent then the framework will get the wrong parent, which is
> > what happens on trogdor when the devmode screen is shown. The parent is
> > the display PLL instead of XO. I haven't dug far enough to understand
> > why disabling unused clks wedges the branch when we try to enable it
> > again, but not disabling unused clks fixes the problem or reading the
> > config register at registration to get the proper parent also fixes it.
> > I guess the problem is that we're switching the RCG value when we
> > shouldn't be doing that.
> >
>
> I looked at this more today. It seems that I need to both read the
> config register at init and also move over the rcg to the safe source at
> init (i.e. park the clk at init). That's doable with a call to
> clk_rcg2_shared_disable() for the clk_ops::init callback. Otherwise I
> get a stuck clk warning.
>
> Either
>
>  disp_cc_mdss_mdp_clk status stuck at 'off'
>
> or
>
>  disp_cc_mdss_rot_clk status stuck at 'on'
>
> When I don't park the rcg, the disp_cc_mdss_rot_clk gets stuck during
> disabling of unused clks. I think I understand that problem. What
> happens is disp_cc_mdss_mdp_clk_src and disp_cc_mdss_rot_clk_src are
> both sourcing from disp_cc_pll0 at boot. Fixing the parent mapping makes
> it so that enabling and then disabling disp_cc_mdss_ahb_clk causes
> disp_cc_pll0 to be turned off when disp_cc_mdss_rot_clk_src is sourced
> from it (and the branch disp_cc_mdss_rot_clk is enabled). If we park
> both the rcgs at clk registration time we avoid this problem because the
> PLL is disabled but nothing is actually a child clk. The act of reading
> the config register and stashing that in the 'parked_cfg' only helps
> avoid duplicate calls to change the rate, and doesn't help when we try
> to repoint the clk at XO when the parent PLL is off.
>
> The part I still don't understand is why reading the config register at
> init and stashing that in 'parked_cfg' fixes the disp_cc_mdss_mdp_clk
> stuck at off problem. I see that the branch clk is turned off and on
> many times during boot and there aren't any warnings regardless of
> stashing the config register. That means we should be moving the RCG to
> XO source, between forcibly enabling and disabling the RCG, which
> presumably would complain about being unable to update the config
> register, but it doesn't. Only after late init does the clk fail to
> enable, and the source is still XO at that time. Something else must be
> happening that wedges the branch to the point that it can't be
> recovered. But simply reporting the proper parent is enough for
> disp_cc_mdss_mdp_clk.

I suppose that the issue is caused by mdss_gdsc or mmcx also being
shut down at the late_init. And if I remember correctly, properly
parking disp_cc_mdss_mdp_clk to the XO solves this issue. This is
where is_enabled comes to play. Adding it changes the
clk_disable_unused behaviour.
Stephen Boyd Nov. 7, 2023, 10:50 p.m. UTC | #4
Quoting Dmitry Baryshkov (2023-11-06 18:00:04)
> On Tue, 7 Nov 2023 at 03:36, Stephen Boyd <sboyd@kernel.org> wrote:
> >
> > Quoting Stephen Boyd (2023-11-03 18:24:47)
> >
> > I looked at this more today. It seems that I need to both read the
> > config register at init and also move over the rcg to the safe source at
> > init (i.e. park the clk at init). That's doable with a call to
> > clk_rcg2_shared_disable() for the clk_ops::init callback. Otherwise I
> > get a stuck clk warning.
> >
> > Either
> >
> >  disp_cc_mdss_mdp_clk status stuck at 'off'
> >
> > or
> >
> >  disp_cc_mdss_rot_clk status stuck at 'on'
> >
> > When I don't park the rcg, the disp_cc_mdss_rot_clk gets stuck during
> > disabling of unused clks. I think I understand that problem. What
> > happens is disp_cc_mdss_mdp_clk_src and disp_cc_mdss_rot_clk_src are
> > both sourcing from disp_cc_pll0 at boot. Fixing the parent mapping makes
> > it so that enabling and then disabling disp_cc_mdss_ahb_clk causes
> > disp_cc_pll0 to be turned off when disp_cc_mdss_rot_clk_src is sourced
> > from it (and the branch disp_cc_mdss_rot_clk is enabled). If we park
> > both the rcgs at clk registration time we avoid this problem because the
> > PLL is disabled but nothing is actually a child clk. The act of reading
> > the config register and stashing that in the 'parked_cfg' only helps
> > avoid duplicate calls to change the rate, and doesn't help when we try
> > to repoint the clk at XO when the parent PLL is off.
> >
> > The part I still don't understand is why reading the config register at
> > init and stashing that in 'parked_cfg' fixes the disp_cc_mdss_mdp_clk
> > stuck at off problem. I see that the branch clk is turned off and on
> > many times during boot and there aren't any warnings regardless of
> > stashing the config register. That means we should be moving the RCG to
> > XO source, between forcibly enabling and disabling the RCG, which
> > presumably would complain about being unable to update the config
> > register, but it doesn't. Only after late init does the clk fail to
> > enable, and the source is still XO at that time. Something else must be
> > happening that wedges the branch to the point that it can't be
> > recovered. But simply reporting the proper parent is enough for
> > disp_cc_mdss_mdp_clk.
> 
> I suppose that the issue is caused by mdss_gdsc or mmcx also being
> shut down at the late_init. And if I remember correctly, properly
> parking disp_cc_mdss_mdp_clk to the XO solves this issue. This is
> where is_enabled comes to play. Adding it changes the
> clk_disable_unused behaviour.

The thing is that disp_cc_mdss_mdp_clk_src has been parked to XO by the
time late_init runs. The branch clk (disp_cc_mdss_mdp_clk) has been
enabled and disabled repeatedly during boot as well, and all those times
nothing has signaled a failure. That means the RCG has supposedly
switched away from the disp_cc_pll0 to XO (parked) and the branch isn't
stuck on or off. So how does turning the mdss_gdsc or mmcx off during
late_init cause the branch to get stuck off if the parent of the RCG is
XO? Is something changing the parent back to the display PLL?
Stephen Boyd Nov. 8, 2023, 3:18 a.m. UTC | #5
Quoting Stephen Boyd (2023-11-07 14:50:18)
> Quoting Dmitry Baryshkov (2023-11-06 18:00:04)
> > On Tue, 7 Nov 2023 at 03:36, Stephen Boyd <sboyd@kernel.org> wrote:
> > >
> > > Quoting Stephen Boyd (2023-11-03 18:24:47)
> > >
> > > I looked at this more today. It seems that I need to both read the
> > > config register at init and also move over the rcg to the safe source at
> > > init (i.e. park the clk at init). That's doable with a call to
> > > clk_rcg2_shared_disable() for the clk_ops::init callback. Otherwise I
> > > get a stuck clk warning.
> > >
> > > Either
> > >
> > >  disp_cc_mdss_mdp_clk status stuck at 'off'
> > >
> > > or
> > >
> > >  disp_cc_mdss_rot_clk status stuck at 'on'
> > >
> > > When I don't park the rcg, the disp_cc_mdss_rot_clk gets stuck during
> > > disabling of unused clks. I think I understand that problem. What
> > > happens is disp_cc_mdss_mdp_clk_src and disp_cc_mdss_rot_clk_src are
> > > both sourcing from disp_cc_pll0 at boot. Fixing the parent mapping makes
> > > it so that enabling and then disabling disp_cc_mdss_ahb_clk causes
> > > disp_cc_pll0 to be turned off when disp_cc_mdss_rot_clk_src is sourced
> > > from it (and the branch disp_cc_mdss_rot_clk is enabled). If we park
> > > both the rcgs at clk registration time we avoid this problem because the
> > > PLL is disabled but nothing is actually a child clk. The act of reading
> > > the config register and stashing that in the 'parked_cfg' only helps
> > > avoid duplicate calls to change the rate, and doesn't help when we try
> > > to repoint the clk at XO when the parent PLL is off.
> > >
> > > The part I still don't understand is why reading the config register at
> > > init and stashing that in 'parked_cfg' fixes the disp_cc_mdss_mdp_clk
> > > stuck at off problem. I see that the branch clk is turned off and on
> > > many times during boot and there aren't any warnings regardless of
> > > stashing the config register. That means we should be moving the RCG to
> > > XO source, between forcibly enabling and disabling the RCG, which
> > > presumably would complain about being unable to update the config
> > > register, but it doesn't. Only after late init does the clk fail to
> > > enable, and the source is still XO at that time. Something else must be
> > > happening that wedges the branch to the point that it can't be
> > > recovered. But simply reporting the proper parent is enough for
> > > disp_cc_mdss_mdp_clk.
> > 
> > I suppose that the issue is caused by mdss_gdsc or mmcx also being
> > shut down at the late_init. And if I remember correctly, properly
> > parking disp_cc_mdss_mdp_clk to the XO solves this issue. This is
> > where is_enabled comes to play. Adding it changes the
> > clk_disable_unused behaviour.
> 
> The thing is that disp_cc_mdss_mdp_clk_src has been parked to XO by the
> time late_init runs. The branch clk (disp_cc_mdss_mdp_clk) has been
> enabled and disabled repeatedly during boot as well, and all those times
> nothing has signaled a failure. That means the RCG has supposedly
> switched away from the disp_cc_pll0 to XO (parked) and the branch isn't
> stuck on or off. So how does turning the mdss_gdsc or mmcx off during
> late_init cause the branch to get stuck off if the parent of the RCG is
> XO? Is something changing the parent back to the display PLL?
> 

I've found that only marking disp_cc_pll0 as CLK_IGNORE_UNUSED fixes the
problem as well. In this case, mdp and rot clks are both actually
parented to disp_cc_pll0 at boot, but mdp is switched to XO due to
parking while rot is left on disp_cc_pll0 because only the branch is
disabled during unused clk disabling.

When I fix the parent by reading the parked_cfg value at init time,
disp_cc_pll0 is turned off pretty early because the parent of mdp is
discovered to be disp_cc_pll0. I wonder if turning off disp_cc_pll0
somehow "clears" state, but it has to be done in a controlled manner. I
also found that simply never disabling the PLL also fixes it (i.e.
returning early from alpha_pll_fabia_disable if the clk is
disp_cc_pll0). That seems to imply that something about disabling the
PLL during unused clks disabling is bad.

I've also noticed that when the RCG is enabled before turning on the
stuck off disp_cc_mdss_mdp_clk that the RCG root_off bit (bit 31) is
clear. Something is turning the RCG clk on when software thinks it is
off, but that should be OK because the parent is XO. Before this point
(and late init), the RCG is off when software thinks it is off. I
printed the config register from the unused clk disabling code and the
rcg is still off after we disable the PLL.

I also tried skipping slamming a bunch of PLL config register writes
into the PLL at probe by removing the clk_fabia_pll_configure() call but
it doesn't fix it. Maybe I need to measure the clk at probe time to see
if it is actually on XO or if it is stuck on the PLL but all the
registers are saying it is XO.
Stephen Boyd Nov. 9, 2023, 4:20 a.m. UTC | #6
Quoting Stephen Boyd (2023-11-07 19:18:45)
> 
> I also tried skipping slamming a bunch of PLL config register writes
> into the PLL at probe by removing the clk_fabia_pll_configure() call but
> it doesn't fix it. Maybe I need to measure the clk at probe time to see
> if it is actually on XO or if it is stuck on the PLL but all the
> registers are saying it is XO.
> 

I'm still chasing this, but I have another update. I tried moving mdp
and rot to XO from disp_cc_pll0 at init, and left 'parked_cfg' at
default value of 0. Then I disabled disp_cc_pll0 at the end of clk
driver probe. This fixes the problem. In this case, the perceived parent
of mdp and rot is XO because 'parked_cfg' is 0.

Then I tried the same sequence above, but disabled and then enabled the
disp_cc_pll0. This also worked. The disp_cc_pll0 was disabled during
late init because it was unused, but otherwise everything is fine. This
means that disabling and then enabling when nothing is sourcing the PLL
somehow "fixes" it.

Then I tried some wacky stuff. I moved rot to XO and left mdp on
disp_cc_pll0, and left 'parked_cfg' at default value of 0. Then I
disabled and enabled disp_cc_pll0 at driver probe. This also fixed the
problem. I would think disabling the PLL while mdp is sourcing from it
would cause the branch to be stuck, but apparently nothing goes wrong.
During boot, mdp is switched to XO when the clk is 'restored' for
clk_enable(), and then the branch is enabled and it reports the clk as
on.

Then I tried leaving rot on disp_cc_pll0, moving mdp to XO, and leaving
'parked_cfg' at default value of 0. Then I disabled and enabled
disp_cc_pll0 at driver probe. This didn't work. During boot and up to
the time of being stuck off, mdp is parked and unparked but it's always
sourcing from XO. I don't understand this part. mdp was moved to XO very
early, while rot was still sourcing from disp_cc_pll0 when we turned off
the PLL during late init. Presumably mdp shouldn't be stuck.

Then I tried leaving rot on disp_cc_pll0, moving mdp to XO, and leaving
'parked_cfg' at default value of 0. I skipped the PLL enable/disable
dance. This didn't work either. During late init, the rot branch clk
(disp_cc_mdss_rot_clk) is disabled, but the rot rcg is still configured
to source from disp_cc_pll0.

Then I tried leaving rot on disp_cc_pll0, moving mdp to XO, and leaving
'parked_cfg' at default value of 0. I only disable the PLL during probe.
This didn't work either! In fact, mdp is stuck after being turned on and
off once (but shouldn't it be sourcing from XO?).

Then I tried leaving rot on disp_cc_pll0, moving mdp to XO, and leaving
'parked_cfg' at default value of 0. I only enable the PLL during probe.
This didn't work either. mdp gets stuck after late init, but it is
supposed to be sourcing from XO.

I'm thinking what's happening is that disabling the PLL during late init
is hanging the branch, but only when an rcg is sourcing from the PLL.
Or maybe what's happening is the rot branch register value is wrong and
it's actually swapped with the mdp branch in the driver, or the rcg for
mdp and rot are swapped. In the cases above, it only breaks when the rot
rcg is sourcing from disp_cc_pll0, and disp_cc_pll0 is disabled.

Here's what's happening without any changes

 1. mdp and rot are sourcing from disp_cc_pll0 at driver probe, disp_cc_pll0 is enabled
 2. mdp is configured to restore on XO
 3. rot is configured to restore on XO
 4. mdp is switched to XO on clk_enable()
 5. mdp is switched to XO on clk_disable()
 6. disp_cc_pll0 is left untouched
 7. rot branch clk is disabled during late init (disp_cc_mdss_rot_clk)
 8. rot rcg is still enabled
 9. disp_cc_pll0 is disabled during late init
 10. mdp is switched to XO on clk_enable()
 11. mdp branch is stuck off

I could see the problem happening if mdp branch and rot branch were
swapped. When we enable/disable mdp branch it will actually
enable/disable the rot rcg because feedback is going there. During boot
this is fine because disp_cc_pll0 is enabled. Leaving it enabled
throughout boot hides the problem because enabling mdp branch is
actually enabling rot branch. Once we get to late init, we disable rot
branch (actually mdp branch). The mdp rcg is already parked, so this
should be OK. The problem is the mdp branch (actually rot branch) has
been disabled, while the rot rcg is still sourcing disp_cc_pll0. We'll
disable the pll during late init, and this will wedge the clk off. When
we go to turn on the mdp branch (actually the rot branch) after late
init, we'll try to turn on the branch and the rot rcg will be parented
to the disp_cc_pll0 still (because we never moved it off).

This patch fixes the problem for me. Obviously, things are still bad
though because we don't report the proper parent to the framework. We
can do that pretty easily by parking at clk registration time though and
also by saving the config register.

diff --git a/drivers/clk/qcom/dispcc-sc7180.c b/drivers/clk/qcom/dispcc-sc7180.c
index 9536bfc72a43..26eea1e962d3 100644
--- a/drivers/clk/qcom/dispcc-sc7180.c
+++ b/drivers/clk/qcom/dispcc-sc7180.c
@@ -499,10 +499,10 @@ static struct clk_branch disp_cc_mdss_esc0_clk = {
 };
 
 static struct clk_branch disp_cc_mdss_mdp_clk = {
-	.halt_reg = 0x200c,
+	.halt_reg = 0x2014,
 	.halt_check = BRANCH_HALT,
 	.clkr = {
-		.enable_reg = 0x200c,
+		.enable_reg = 0x2014,
 		.enable_mask = BIT(0),
 		.hw.init = &(struct clk_init_data){
 			.name = "disp_cc_mdss_mdp_clk",
@@ -570,10 +570,10 @@ static struct clk_branch disp_cc_mdss_pclk0_clk = {
 };
 
 static struct clk_branch disp_cc_mdss_rot_clk = {
-	.halt_reg = 0x2014,
+	.halt_reg = 0x200c,
 	.halt_check = BRANCH_HALT,
 	.clkr = {
-		.enable_reg = 0x2014,
+		.enable_reg = 0x200c,
 		.enable_mask = BIT(0),
 		.hw.init = &(struct clk_init_data){
 			.name = "disp_cc_mdss_rot_clk",

TL;DR: Taniya or anyone at qcom, please double check that the
disp_cc_mdss_mdp_clk register is correct and not actually swapped with
disp_cc_mdss_rot_clk.

I tried swapping the rcg registers, and was met with a blue screen, so I
believe the rcg registers are correct. testclock confirmed as well. My
next experiment is to figure out a way to turn off the rot branch
and measure.
Stephen Boyd Nov. 10, 2023, 2:25 a.m. UTC | #7
Quoting Stephen Boyd (2023-11-08 20:20:50)
> 
> Here's what's happening without any changes
> 
>  1. mdp and rot are sourcing from disp_cc_pll0 at driver probe, disp_cc_pll0 is enabled
>  2. mdp is configured to restore on XO
>  3. rot is configured to restore on XO
>  4. mdp is switched to XO on clk_enable()
>  5. mdp is switched to XO on clk_disable()
>  6. disp_cc_pll0 is left untouched
>  7. rot branch clk is disabled during late init (disp_cc_mdss_rot_clk)
>  8. rot rcg is still enabled
>  9. disp_cc_pll0 is disabled during late init
>  10. mdp is switched to XO on clk_enable()
>  11. mdp branch is stuck off
> 
> I could see the problem happening if mdp branch and rot branch were
> swapped. When we enable/disable mdp branch it will actually
> enable/disable the rot rcg because feedback is going there. During boot
> this is fine because disp_cc_pll0 is enabled. Leaving it enabled
> throughout boot hides the problem because enabling mdp branch is
> actually enabling rot branch. Once we get to late init, we disable rot
> branch (actually mdp branch). The mdp rcg is already parked, so this
> should be OK. The problem is the mdp branch (actually rot branch) has
> been disabled, while the rot rcg is still sourcing disp_cc_pll0. We'll
> disable the pll during late init, and this will wedge the clk off. When
> we go to turn on the mdp branch (actually the rot branch) after late
> init, we'll try to turn on the branch and the rot rcg will be parented
> to the disp_cc_pll0 still (because we never moved it off).
> 
> This patch fixes the problem for me. Obviously, things are still bad
> though because we don't report the proper parent to the framework. We
> can do that pretty easily by parking at clk registration time though and
> also by saving the config register.

Argh! I forgot to put back disabling unused clks. This patch is garbage
and doesn't actually work. I started directly writing things with devmem
and saw that the branch bits are correct. The status bit in the rcg
changes when the corresponding branch is enabled.

It seems that rot must be parked on XO, otherwise mdp branch will fail
to enable after late init. I was hoping these two clks weren't related,
but they must be related somehow. I've parked rot on XO right after
disp_cc_pll0 is disabled and that also works.

My guess is the GDSC is restoring rcg registers, and that restoring
requires the source clk to be running (maybe they hardcoded that
requirement in the hardware). In the case of rot, the source clk is
disp_cc_pll0, which is always enabled during boot. When I hack it so
that disp_cc_pll0 is disabled at the end of dispcc probe, and rot is the
only clk still sourcing from it, nothing is busted until the gdsc powers
off, saves state, and then powers on again. That's the case where I
reported the stuck clk warning happens early, before late init. Once the
gdsc powers on the second time, it must wedge the rcg hardware and
affect mdp even though mdp was parked on XO.

Fun! This must be why qcom didn't want to have dirty registers. It must
confuse the gdsc hardware and cause it to restore the clk to the parent
that isn't enabled.

BTW, I tried changing the rcg source for rot after the mdp branch is
stuck off, and the rot rcg goes from reporting root off, to reporting
root is on, which is weird. Maybe that's just showing the rot rcg is in
some sort of wedged state as well.

So long story short, I think I understand the problem now. The gdsc is
saving and restoring the register contents, and enabling the rcg clks
when it does so. If that runs into some stuck rcg problem, it will wedge
the clks in weird ways. The only safe thing to do is make sure that when
the gdsc is turned off, all the rcgs are parked on sources that are
going to be enabled when the gdsc powers on the next time. For now,
we've decided that source is XO, because it is simple. It seems like
that means we need to park all shared rcg clks at registration time.
Or we need to teach the clks about their gdsc and switch rcgs over to
the safe source when disabling the gdsc.

Parking at init is easy, we call clk_rcg2_shared_disable() from the init
routine, and that parks the clk and stashes the config register. I just
don't know if that causes some sort of problem for bootsplash logic. The
mdp clk could be running quite fast, and we'll basically force it over
to XO immediately. It may be better to teach the gdsc code to park the
rcgs when disabling the gdsc. Then we can maintain the mdp clk rate out
of boot for as long as the gdsc is enabled out of boot, and we contain
the "fix" to where the problem is, gdsc can't restore a clk sourcing
something that's off. If we did this we still need to fix the parent
mapping at clk registration, i.e. parked_cfg needs to be read from the
hardware so that get_parent works.
diff mbox series

Patch

diff --git a/drivers/clk/qcom/clk-rcg.h b/drivers/clk/qcom/clk-rcg.h
index e6d84c8c7989..9fbbf1251564 100644
--- a/drivers/clk/qcom/clk-rcg.h
+++ b/drivers/clk/qcom/clk-rcg.h
@@ -176,6 +176,7 @@  extern const struct clk_ops clk_byte2_ops;
 extern const struct clk_ops clk_pixel_ops;
 extern const struct clk_ops clk_gfx3d_ops;
 extern const struct clk_ops clk_rcg2_shared_ops;
+extern const struct clk_ops clk_rcg2_parked_ops;
 extern const struct clk_ops clk_dp_ops;
 
 struct clk_rcg_dfs_data {
diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
index 5183c74b074f..fc75e2bc2d70 100644
--- a/drivers/clk/qcom/clk-rcg2.c
+++ b/drivers/clk/qcom/clk-rcg2.c
@@ -5,6 +5,7 @@ 
 
 #include <linux/kernel.h>
 #include <linux/bitops.h>
+#include <linux/bitfield.h>
 #include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/export.h>
@@ -1150,6 +1151,61 @@  const struct clk_ops clk_rcg2_shared_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_rcg2_shared_ops);
 
+static int clk_rcg2_parked_is_enabled(struct clk_hw *hw)
+{
+	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
+	u32 cmd, cfg;
+	int ret;
+
+	ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CMD_REG, &cmd);
+	if (ret)
+		return ret;
+
+	if ((cmd & CMD_ROOT_EN) == 0)
+		return false;
+
+	ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG, &cfg);
+	if (ret)
+		return ret;
+
+	return FIELD_GET(CFG_SRC_SEL_MASK, cfg) != rcg->safe_src_index;
+}
+
+static int clk_rcg2_parked_init(struct clk_hw *hw)
+{
+	struct clk_rcg2 *rcg = to_clk_rcg2(hw);
+	const struct freq_tbl *f = rcg->freq_tbl;
+
+	regmap_read(rcg->clkr.regmap, RCG_CFG_OFFSET(rcg), &rcg->parked_cfg);
+
+	if (FIELD_GET(CFG_SRC_SEL_MASK, rcg->parked_cfg) != rcg->safe_src_index)
+		return 0;
+
+	if (WARN_ON(!f) ||
+	    WARN_ON(qcom_find_src_cfg(hw, rcg->parent_map, f->src) == rcg->safe_src_index))
+		return -EINVAL;
+
+	return __clk_rcg2_configure(rcg, f, &rcg->parked_cfg);
+}
+
+/*
+ * Unlike clk_rcg2_shared_ops, the safe_src_index aka XO must NOT be present in
+ * parent_map. This allows us to implement proper is_enabled callback.
+ */
+const struct clk_ops clk_rcg2_parked_ops = {
+	.init = clk_rcg2_parked_init,
+	.is_enabled = clk_rcg2_parked_is_enabled,
+	.enable = clk_rcg2_shared_enable,
+	.disable = clk_rcg2_shared_disable,
+	.get_parent = clk_rcg2_shared_get_parent,
+	.set_parent = clk_rcg2_shared_set_parent,
+	.recalc_rate = clk_rcg2_shared_recalc_rate,
+	.determine_rate = clk_rcg2_determine_rate,
+	.set_rate = clk_rcg2_shared_set_rate,
+	.set_rate_and_parent = clk_rcg2_shared_set_rate_and_parent,
+};
+EXPORT_SYMBOL_GPL(clk_rcg2_parked_ops);
+
 /* Common APIs to be used for DFS based RCGR */
 static void clk_rcg2_dfs_populate_freq(struct clk_hw *hw, unsigned int l,
 				       struct freq_tbl *f)